|
15 | 15 |
|
16 | 16 |
|
@pytest.mark.parametrize(
    ("input_df_dict", "expected_df_dict"),
    [
        pytest.param(
            # Case 1: every column is an ordered categorical; one of them
            # contains a missing value.
            {
                "ord_cat": Categorical(
                    ["low", "m", "h", "vh"],
                    categories=["low", "m", "h", "vh"],
                    ordered=True,
                ),
                "ord_cat_none": Categorical(
                    ["low", "m", "h", None],
                    categories=["low", "m", "h"],
                    ordered=True,
                ),
            },
            {
                # category positions: low -> 0, m -> 1, h -> 2, vh -> 3
                "ord_cat": Series([0, 1, 2, 3], dtype="int8"),
                # same positions, with the missing entry expected as NaN
                "ord_cat_none": [0, 1.0, 2.0, np.nan],
            },
            id="ordered-categoricals-basic",
        ),
        pytest.param(
            # Case 2: a frame of mixed dtypes; the ordered categorical is the
            # only column expected to differ after the transform.
            {
                "ordered": Categorical(
                    ["a", "c", "b"],
                    categories=["a", "b", "c"],
                    ordered=True,
                ),
                "unordered": Categorical(["x", "y", "x"], ordered=False),
                "num": [10, 20, 30],
                "text": ["u", "v", "w"],
            },
            {
                # category positions: a -> 0, c -> 2, b -> 1
                "ordered": Series([0, 2, 1], dtype="int8"),
                # the unordered categorical is expected back unchanged
                "unordered": Categorical(["x", "y", "x"], ordered=False),
                "num": [10, 20, 30],
                "text": ["u", "v", "w"],
            },
            id="mixed-types-only-ordered-changes",
        ),
    ],
)
def test_transform_ord_cat_cols_to_coded_cols(
    input_df_dict: dict, expected_df_dict: dict
) -> None:
    """Compare the transform's output with the expected frame for each case.

    Both parameters are plain column mappings; the frames themselves are
    built inside the test body.
    """
    # GH #60306
    expected = DataFrame(expected_df_dict)
    result = transform_ord_cat_cols_to_coded_cols(DataFrame(input_df_dict))

    # Compare the column labels on their own first, then the full frames.
    assert list(result.columns) == list(expected.columns)
    tm.assert_frame_equal(result, expected)
| 75 | + |
| 76 | + |
def test_transform_ord_cat_cols_to_coded_cols_duplicated_col() -> None:
    """Check the transform on frames whose columns all share the label "dup".

    Two scenarios are exercised one after the other:

    1. an ordered categorical followed by an integer column;
    2. an object column followed by two ordered categoricals, the first of
       which contains a missing value.

    In both, the expected frame holds integer codes in place of each ordered
    categorical (NaN where the value was missing) and leaves the remaining
    columns as they were.
    """
    # GH #60306
    input_df_1 = DataFrame(
        {
            "dup_1": Categorical(
                ["low", "m", "h"],
                categories=["low", "m", "h"],
                ordered=True,
            ),
            "dup_2": [5, 6, 7],
        }
    )
    expected_df_1 = DataFrame(
        {
            # After transform: position 0 (ordered cat) becomes codes [0,1,2],
            # position 1 remains untouched numbers [5,6,7].
            "dup_1": Series([0, 1, 2], dtype="int8"),
            "dup_2": [5, 6, 7],
        }
    )
    # A dict literal cannot carry repeated keys, so the frames are built with
    # unique placeholder names and relabelled afterwards. Each frame's label
    # list is sized from that frame's own column count.
    input_df_1.columns = ["dup"] * len(input_df_1.columns)
    expected_df_1.columns = ["dup"] * len(expected_df_1.columns)

    out_df_1 = transform_ord_cat_cols_to_coded_cols(input_df_1)
    tm.assert_frame_equal(out_df_1, expected_df_1)

    input_df_2 = DataFrame(
        {
            "dup_1": ["a", "b", "c"],  # non-categorical
            "dup_2": Categorical(
                ["p", "q", None],
                categories=["p", "q"],
                ordered=True,
            ),
            "dup_3": Categorical(
                ["low", "m", "h"],
                categories=["low", "m", "h"],
                ordered=True,
            ),
        }
    )

    expected_df_2 = DataFrame(
        {
            # First stays object; second turns into codes [0, 1, NaN]
            # and third changes into codes [0, 1, 2]
            "dup_1": ["a", "b", "c"],
            "dup_2": [0.0, 1.0, np.nan],
            "dup_3": Series([0, 1, 2], dtype="int8"),
        }
    )
    # Same relabelling as above, again sized per frame.
    input_df_2.columns = ["dup"] * len(input_df_2.columns)
    expected_df_2.columns = ["dup"] * len(expected_df_2.columns)

    out_df_2 = transform_ord_cat_cols_to_coded_cols(input_df_2)
    tm.assert_frame_equal(out_df_2, expected_df_2)
0 commit comments