Skip to content

Commit 0ef7ce3

Browse files
clean up
1 parent 29e889a commit 0ef7ce3

File tree

3 files changed

+149
-186
lines changed

3 files changed

+149
-186
lines changed

pandas/tests/frame/methods/test_cov_corr.py

Lines changed: 31 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
from itertools import combinations
2-
31
import numpy as np
42
import pytest
53

@@ -255,11 +253,10 @@ def test_corr_numeric_only(self, meth, numeric_only):
255253
df.corr(meth, numeric_only=numeric_only)
256254

257255
@pytest.mark.parametrize("method", ["kendall", "spearman"])
256+
@pytest.mark.parametrize("col1", ["ord_cat", "ord_cat_none", "ord_cat_shuff"])
257+
@pytest.mark.parametrize("col2", ["ord_cat", "ord_cat_none", "ord_cat_shuff"])
258258
@td.skip_if_no("scipy")
259-
def test_corr_rank_ordered_categorical(
260-
self,
261-
method,
262-
):
259+
def test_corr_rank_ordered_categorical(self, method, col1, col2):
263260
# GH #60306
264261
df = DataFrame(
265262
{
@@ -281,15 +278,15 @@ def test_corr_rank_ordered_categorical(
281278
}
282279
)
283280
corr_calc = df.corr(method=method)
284-
for col1, col2 in combinations(df.columns, r=2):
285-
corr_expected = df[col1].corr(df[col2], method=method)
286-
tm.assert_almost_equal(corr_calc[col1][col2], corr_expected)
281+
corr_expected = df[col1].corr(df[col2], method=method)
282+
tm.assert_almost_equal(corr_calc[col1][col2], corr_expected)
287283

288284
@pytest.mark.parametrize("method", ["kendall", "spearman"])
285+
@pytest.mark.parametrize("col1_idx", [0, 1, 2, 3, 4])
286+
@pytest.mark.parametrize("col2_idx", [0, 1, 2, 3, 4])
289287
@td.skip_if_no("scipy")
290288
def test_corr_rank_ordered_categorical_duplicate_columns(
291-
self,
292-
method,
289+
self, method, col1_idx, col2_idx
293290
):
294291
# GH #60306
295292
cat = pd.CategoricalDtype(categories=[4, 3, 2, 1], ordered=True)
@@ -305,11 +302,8 @@ def test_corr_rank_ordered_categorical_duplicate_columns(
305302
df.columns = ["a", "a", "c", "c", "e"]
306303

307304
corr_calc = df.corr(method=method)
308-
for col1_idx, col2_idx in combinations(range(len(df.columns)), r=2):
309-
corr_expected = df.iloc[:, col1_idx].corr(
310-
df.iloc[:, col2_idx], method=method
311-
)
312-
tm.assert_almost_equal(corr_calc.iloc[col1_idx, col2_idx], corr_expected)
305+
corr_expected = df.iloc[:, col1_idx].corr(df.iloc[:, col2_idx], method=method)
306+
tm.assert_almost_equal(corr_calc.iloc[col1_idx, col2_idx], corr_expected)
313307

314308

315309
class TestDataFrameCorrWith:
@@ -554,49 +548,40 @@ def test_cov_with_missing_values(self):
554548
tm.assert_frame_equal(result2, expected)
555549

556550
@pytest.mark.parametrize("method", ["kendall", "spearman"])
557-
def test_corr_rank_ordered_categorical(
558-
self,
559-
method,
560-
):
551+
@pytest.mark.parametrize("col", ["a", "b", "c", "d"])
552+
def test_corr_rank_ordered_categorical(self, method, col):
561553
# GH #60306
562554
pytest.importorskip("scipy")
563555
df1 = DataFrame(
564556
{
565-
"a": Series(
566-
pd.Categorical(
567-
["low", "m", "h", "vh"],
568-
categories=["low", "m", "h", "vh"],
569-
ordered=True,
570-
)
557+
"a": pd.Categorical(
558+
["low", "m", "h", "vh"],
559+
categories=["low", "m", "h", "vh"],
560+
ordered=True,
571561
),
572-
"b": Series(
573-
pd.Categorical(
574-
["low", "m", "h", None],
575-
categories=["low", "m", "h"],
576-
ordered=True,
577-
)
562+
"b": pd.Categorical(
563+
["low", "m", "h", None],
564+
categories=["low", "m", "h"],
565+
ordered=True,
578566
),
579-
"c": Series([0, 1, 2, 3]),
580-
"d": Series([2.0, 3.0, 4.5, 6.5]),
567+
"c": [0, 1, 2, 3],
568+
"d": [2.0, 3.0, 4.5, 6.5],
581569
}
582570
)
583571

584572
df2 = DataFrame(
585573
{
586-
"a": Series([2.0, 3.0, 4.5, np.nan]),
587-
"b": Series(
588-
pd.Categorical(
589-
["m", "h", "vh", "low"],
590-
categories=["low", "m", "h", "vh"],
591-
ordered=True,
592-
)
574+
"a": [2.0, 3.0, 4.5, np.nan],
575+
"b": pd.Categorical(
576+
["m", "h", "vh", "low"],
577+
categories=["low", "m", "h", "vh"],
578+
ordered=True,
593579
),
594-
"c": Series([2, 3, 0, 1]),
595-
"d": Series([2.0, 3.0, 4.5, 6.5]),
580+
"c": [2, 3, 0, 1],
581+
"d": [2.0, 3.0, 4.5, 6.5],
596582
}
597583
)
598584

599585
corr_calc = df1.corrwith(df2, method=method)
600-
for col in df1.columns:
601-
corr_expected = df1[col].corr(df2[col], method=method)
602-
tm.assert_almost_equal(corr_calc.get(col), corr_expected)
586+
corr_expected = df1[col].corr(df2[col], method=method)
587+
tm.assert_almost_equal(corr_calc.get(col), corr_expected)

pandas/tests/methods/corr.py

Lines changed: 99 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -15,136 +15,118 @@
1515

1616

1717
@pytest.mark.parametrize(
18-
("input_df", "expected_df"),
18+
("input_df_dict", "expected_df_dict"),
1919
[
2020
pytest.param(
2121
# 1) Simple: two ordered categorical columns (with and without None)
22-
DataFrame(
23-
{
24-
"ord_cat": Series(
25-
Categorical(
26-
["low", "m", "h", "vh"],
27-
categories=["low", "m", "h", "vh"],
28-
ordered=True,
29-
)
30-
),
31-
"ord_cat_none": Series(
32-
Categorical(
33-
["low", "m", "h", None],
34-
categories=["low", "m", "h"],
35-
ordered=True,
36-
)
37-
),
38-
}
39-
),
40-
DataFrame(
41-
{
42-
# codes: low=0, m=1, h=2, vh=3
43-
"ord_cat": Series([0, 1, 2, 3], dtype="int8"),
44-
# codes: low=0, m=1, h=2, None -> NaN
45-
"ord_cat_none": Series([0, 1.0, 2.0, np.nan]),
46-
}
47-
),
22+
{
23+
"ord_cat": Categorical(
24+
["low", "m", "h", "vh"],
25+
categories=["low", "m", "h", "vh"],
26+
ordered=True,
27+
),
28+
"ord_cat_none": Categorical(
29+
["low", "m", "h", None],
30+
categories=["low", "m", "h"],
31+
ordered=True,
32+
),
33+
},
34+
{
35+
# codes: low=0, m=1, h=2, vh=3
36+
"ord_cat": Series([0, 1, 2, 3], dtype="int8"),
37+
# codes: low=0, m=1, h=2, None -> NaN
38+
"ord_cat_none": [0, 1.0, 2.0, np.nan],
39+
},
4840
id="ordered-categoricals-basic",
4941
),
5042
pytest.param(
5143
# 2) Mixed dtypes: only the ordered categorical should change
52-
DataFrame(
53-
{
54-
"ordered": Series(
55-
Categorical(
56-
["a", "c", "b"],
57-
categories=["a", "b", "c"],
58-
ordered=True,
59-
)
60-
),
61-
"unordered": Series(Categorical(["x", "y", "x"], ordered=False)),
62-
"num": Series([10, 20, 30]),
63-
"text": Series(["u", "v", "w"]),
64-
}
65-
),
66-
DataFrame(
67-
{
68-
# codes: a=0, c=2, b=1
69-
"ordered": Series([0, 2, 1], dtype="int8"),
70-
# unordered categorical should be untouched (still categorical)
71-
"unordered": Series(Categorical(["x", "y", "x"], ordered=False)),
72-
"num": Series([10, 20, 30]),
73-
"text": Series(["u", "v", "w"]),
74-
}
75-
),
44+
{
45+
"ordered": Categorical(
46+
["a", "c", "b"],
47+
categories=["a", "b", "c"],
48+
ordered=True,
49+
),
50+
"unordered": Categorical(["x", "y", "x"], ordered=False),
51+
"num": [10, 20, 30],
52+
"text": ["u", "v", "w"],
53+
},
54+
{
55+
# codes: a=0, c=2, b=1
56+
"ordered": Series([0, 2, 1], dtype="int8"),
57+
# unordered categorical should be untouched (still categorical)
58+
"unordered": Categorical(["x", "y", "x"], ordered=False),
59+
"num": [10, 20, 30],
60+
"text": ["u", "v", "w"],
61+
},
7662
id="mixed-types-only-ordered-changes",
7763
),
78-
pytest.param(
79-
# 3 Duplicate column names: first 'dup' is ordered categorical,
80-
# second 'dup' is non-categorical
81-
DataFrame(
82-
{
83-
"dup_1": Series(
84-
Categorical(
85-
["low", "m", "h"],
86-
categories=["low", "m", "h"],
87-
ordered=True,
88-
)
89-
),
90-
"dup_2": Series([5, 6, 7]), # duplicate name, later column
91-
}
92-
),
93-
DataFrame(
94-
{
95-
# After transform: position 0 (ordered cat) becomes codes [0,1,2],
96-
# position 1 remains untouched numbers [5,6,7].
97-
"dup_1": Series([0, 1, 2], dtype="int8"),
98-
"dup_2": Series([5, 6, 7]),
99-
}
100-
),
101-
id="duplicate-names-ordered-first",
102-
),
103-
pytest.param(
104-
# 4 Duplicate column names: first 'dup' is non-categorical,
105-
# second 'dup' is ordered categorical, third 'dup' is ordered categorical
106-
DataFrame(
107-
{
108-
"dup_1": Series(["a", "b", "c"]), # non-categorical (object)
109-
"dup_2": Series(
110-
Categorical(
111-
["p", "q", None],
112-
categories=["p", "q"],
113-
ordered=True,
114-
)
115-
),
116-
"dup_3": Series(
117-
Categorical(
118-
["low", "m", "h"],
119-
categories=["low", "m", "h"],
120-
ordered=True,
121-
)
122-
),
123-
}
124-
),
125-
DataFrame(
126-
{
127-
# First stays object; second turns into codes [0, 1, NaN]
128-
# and third changes into codes [0, 1, 2]
129-
"dup_1": Series(["a", "b", "c"]),
130-
"dup_2": Series([0.0, 1.0, np.nan]),
131-
"dup_3": Series([0, 1, 2], dtype="int8"),
132-
}
133-
),
134-
id="duplicate-names-ordered-and-non-categorical-and-none",
135-
),
13664
],
13765
)
13866
def test_transform_ord_cat_cols_to_coded_cols(
139-
input_df: DataFrame, expected_df: DataFrame
67+
input_df_dict: dict, expected_df_dict: dict
14068
) -> None:
14169
# GH #60306
142-
# duplicate columns creation for dup columns
143-
if "dup_1" in input_df.columns:
144-
input_df.columns = ["dup" for _ in range(len(input_df.columns))]
145-
expected_df.columns = ["dup" for _ in range(len(expected_df.columns))]
146-
70+
input_df = DataFrame(input_df_dict)
71+
expected_df = DataFrame(expected_df_dict)
14772
out_df = transform_ord_cat_cols_to_coded_cols(input_df)
14873
assert list(out_df.columns) == list(expected_df.columns)
149-
for i, col in enumerate(out_df.columns):
150-
tm.assert_series_equal(out_df.iloc[:, i], expected_df.iloc[:, i])
74+
tm.assert_frame_equal(out_df, expected_df)
75+
76+
77+
def test_transform_ord_cat_cols_to_coded_cols_duplicated_col() -> None:
78+
# GH #60306
79+
input_df_1 = DataFrame(
80+
{
81+
"dup_1": Categorical(
82+
["low", "m", "h"],
83+
categories=["low", "m", "h"],
84+
ordered=True,
85+
),
86+
"dup_2": [5, 6, 7],
87+
}
88+
)
89+
expected_df_1 = DataFrame(
90+
{
91+
# After transform: position 0 (ordered cat) becomes codes [0,1,2],
92+
# position 1 remains untouched numbers [5,6,7].
93+
"dup_1": Series([0, 1, 2], dtype="int8"),
94+
"dup_2": [5, 6, 7],
95+
}
96+
)
97+
input_df_1.columns = ["dup" for _ in range(len(input_df_1.columns))]
98+
expected_df_1.columns = ["dup" for _ in range(len(input_df_1.columns))]
99+
100+
out_df_1 = transform_ord_cat_cols_to_coded_cols(input_df_1)
101+
tm.assert_frame_equal(out_df_1, expected_df_1)
102+
103+
input_df_2 = DataFrame(
104+
{
105+
"dup_1": ["a", "b", "c"], # non-categorical
106+
"dup_2": Categorical(
107+
["p", "q", None],
108+
categories=["p", "q"],
109+
ordered=True,
110+
),
111+
"dup_3": Categorical(
112+
["low", "m", "h"],
113+
categories=["low", "m", "h"],
114+
ordered=True,
115+
),
116+
}
117+
)
118+
119+
expected_df_2 = DataFrame(
120+
{
121+
# First stays object; second turns into codes [0, 1, NaN]
122+
# and third changes into codes [0, 1, 2]
123+
"dup_1": ["a", "b", "c"],
124+
"dup_2": [0.0, 1.0, np.nan],
125+
"dup_3": Series([0, 1, 2], dtype="int8"),
126+
}
127+
)
128+
input_df_2.columns = ["dup" for _ in range(len(input_df_2.columns))]
129+
expected_df_2.columns = ["dup" for _ in range(len(input_df_2.columns))]
130+
131+
out_df_2 = transform_ord_cat_cols_to_coded_cols(input_df_2)
132+
tm.assert_frame_equal(out_df_2, expected_df_2)

0 commit comments

Comments
 (0)