From 1290366392b0993f4fd8e4719b2231fbb2c0c94e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 7 Sep 2024 10:15:37 -0400 Subject: [PATCH 1/6] API: value_counts to consistently maintain order of input --- pandas/core/frame.py | 4 +- pandas/core/groupby/groupby.py | 4 +- pandas/core/groupby/ops.py | 42 ++++++++++++++++--- .../tests/frame/methods/test_value_counts.py | 14 ++++++- .../groupby/methods/test_value_counts.py | 20 ++++++--- 5 files changed, 69 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c80e9dfd23ba2..7d3b819334f5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7370,7 +7370,9 @@ def value_counts( subset = self.columns.tolist() name = "proportion" if normalize else "count" - counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size() + counts = self.groupby( + subset, sort=False, dropna=dropna, observed=False + )._grouper.size() counts.name = name if sort: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 79fe78b7e5405..9ec25858a455a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2669,7 +2669,7 @@ def _value_counts( grouper, _, _ = get_grouper( df, key=key, - sort=self.sort, + sort=False, observed=False, dropna=dropna, ) @@ -2678,7 +2678,7 @@ def _value_counts( # Take the size of the overall columns gb = df.groupby( groupings, - sort=self.sort, + sort=False, observed=self.observed, dropna=self.dropna, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index da80969b613cd..c2688e049b291 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -747,12 +747,14 @@ def result_index(self) -> Index: def ids(self) -> npt.NDArray[np.intp]: return self.result_index_and_ids[1] - @cache_readonly + # @cache_readonly + @property def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: levels = [Index._with_infer(ping.uniques) for ping in self.groupings] obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings ] + sorts = [ping._sort for ping in self.groupings] # When passed a categorical grouping, keep all categories for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: @@ -763,7 +765,9 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: result_index.name = self.names[0] ids = ensure_platform_int(self.codes[0]) elif all(obs): - result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names) + result_index, ids = self._ob_index_and_ids( + levels, self.codes, self.names, sorts + ) elif not any(obs): result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names) else: @@ -776,6 +780,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: levels=[levels[idx] for idx in ob_indices], codes=[codes[idx] for idx in ob_indices], names=[names[idx] for idx in ob_indices], + sorts=[sorts[idx] for idx in ob_indices], ) unob_index, unob_ids = self._unob_index_and_ids( levels=[levels[idx] for idx in unob_indices], @@ -798,9 +803,18 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: ).reorder_levels(index) ids = len(unob_index) * ob_ids + unob_ids - if self._sort: + if any(sorts): # Sort result_index and recode ids using the new order - sorter = result_index.argsort() + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = result_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = result_index.argsort() result_index = result_index.take(sorter) _, index = np.unique(sorter, return_index=True) ids = ensure_platform_int(ids) @@ -835,10 +849,13 @@ def _ob_index_and_ids( levels: list[Index], codes: list[npt.NDArray[np.intp]], names: list[Hashable], + sorts: list[bool], ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + consistent_sorting = all(sorts[0] == sort for sort in sorts[1:]) + sort_in_compress = sorts[0] if consistent_sorting else False shape = tuple(len(level) for level in levels) group_index = get_group_index(codes, shape, sort=True, xnull=True) - ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) + ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress) ob_ids = ensure_platform_int(ob_ids) ob_index_codes = decons_obs_group_ids( ob_ids, obs_group_ids, shape, codes, xnull=True @@ -849,6 +866,21 @@ def _ob_index_and_ids( names=names, verify_integrity=False, ) + if not consistent_sorting: + # Sort by the levels where the corresponding sort argument is True + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = ob_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = ob_index.argsort() + ob_index = ob_index.take(sorter) + _, index = np.unique(sorter, return_index=True) + ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids)) ob_ids = ensure_platform_int(ob_ids) return ob_index, ob_ids diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 7670b53f23173..ee138407b9ecd 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -128,7 +128,17 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): expected = pd.Series( data=[1, 1], index=pd.MultiIndex.from_arrays( - [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] + [ + ( + "John", + "Beth", + ), + ( + "Smith", + "Louise", + ), + ], + names=["first_name", "middle_name"], ), name="count", ) @@ -156,7 +166,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): pd.Index(["Anne", "Beth", "John"]), pd.Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[2, 0, 2, 1], [1, 2, 2, 0]], names=["first_name", "middle_name"], ), name="count", diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index da3d626f2d777..7735aedaca747 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -484,7 +484,7 @@ def test_data_frame_value_counts( [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], ), (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), - (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, False, [0, 1, 5, 6, 7, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), ], ) @@ -526,7 +526,17 @@ def test_dropna_combinations( True, [1, 1], MultiIndex.from_arrays( - [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + [ + (1, 1), + ( + "John", + "Beth", + ), + ( + "Smith", + "Louise", + ), + ], names=["key", "first_name", "middle_name"], ), ), @@ -539,7 +549,7 @@ def test_dropna_combinations( Index(["Anne", "Beth", "John"]), Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[0, 0, 0, 0], [2, 0, 2, 1], [1, 2, 2, 0]], names=["key", "first_name", "middle_name"], ), ), @@ -845,8 +855,8 @@ def test_categorical_single_grouper_observed_false( ("US", "high", "male"), ("US", "low", "male"), ("US", "low", "female"), - ("US", "medium", "female"), ("US", "medium", "male"), + ("US", "medium", "female"), ], ), ( @@ -1186,7 +1196,7 @@ def test_value_counts_sort(sort, vc_sort, normalize): if sort and vc_sort: taker = [0, 1, 2] elif sort and not vc_sort: - taker = [0, 1, 2] + taker = [1, 0, 2] elif not sort and vc_sort: taker = [0, 2, 1] else: From 055562cae26c2e2387b84a3a85da7c3193b184c8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 7 Sep 2024 11:21:41 -0400 Subject: [PATCH 2/6] Docs --- doc/source/whatsnew/v3.0.0.rst | 61 ++++++++++++++++++++++++++++++++++ pandas/core/frame.py | 6 +++- pandas/core/groupby/generic.py | 8 ++++- 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9a29ff4d49966..de9ea6597974c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -201,6 +201,67 @@ In cases with mixed-resolution inputs, the highest resolution is used: In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype Out[2]: dtype(' Date: Sat, 7 Sep 2024 11:25:43 -0400 Subject: [PATCH 3/6] Cleanup --- pandas/core/groupby/ops.py | 3 +-- pandas/tests/frame/methods/test_value_counts.py | 12 +----------- pandas/tests/groupby/methods/test_value_counts.py | 12 +----------- 3 files changed, 3 insertions(+), 24 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c2688e049b291..04bc9db302af8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -747,8 +747,7 @@ def result_index(self) -> Index: def ids(self) -> npt.NDArray[np.intp]: return self.result_index_and_ids[1] - # @cache_readonly - @property + @cache_readonly def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: levels = [Index._with_infer(ping.uniques) for ping in self.groupings] obs = [ diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index ee138407b9ecd..de5029b9f18b2 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -128,17 +128,7 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): expected = pd.Series( data=[1, 1], index=pd.MultiIndex.from_arrays( - [ - ( - "John", - "Beth", - ), - ( - "Smith", - "Louise", - ), - ], - names=["first_name", "middle_name"], + [("John", "Beth"), ("Smith", "Louise")], names=["first_name", "middle_name"] ), name="count", ) diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 7735aedaca747..4bf128dc50507 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -526,17 +526,7 @@ def test_dropna_combinations( True, [1, 1], MultiIndex.from_arrays( - [ - (1, 1), - ( - "John", - "Beth", - ), - ( - "Smith", - "Louise", - ), - ], + [(1, 1), ("John", "Beth"), ("Smith", "Louise")], names=["key", "first_name", "middle_name"], ), ), From 6eb4474d550a076a71f1e18d87d18f55c737d28f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 8 Sep 2024 12:46:50 -0400 Subject: [PATCH 4/6] Test & docs fixups --- pandas/core/groupby/generic.py | 20 +++---- .../groupby/methods/test_value_counts.py | 54 +++++++++---------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0b6c3a9b3ad8d..35c4433dae7c5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2361,8 +2361,8 @@ def value_counts( >>> df.groupby("gender").value_counts() gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low FR 2 US 1 medium FR 1 @@ -2370,8 +2370,8 @@ def value_counts( >>> df.groupby("gender").value_counts(ascending=True) gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low US 1 medium FR 1 low FR 2 @@ -2379,8 +2379,8 @@ def value_counts( >>> df.groupby("gender").value_counts(normalize=True) gender education country - female high FR 0.50 - US 0.50 + female high US 0.50 + FR 0.50 male low FR 0.50 US 0.25 medium FR 0.25 @@ -2388,16 +2388,16 @@ def value_counts( >>> df.groupby("gender", as_index=False).value_counts() gender education country count - 0 female high FR 1 - 1 female high US 1 + 0 female high US 1 + 1 female high FR 1 2 male low FR 2 3 male low US 1 4 male medium FR 1 >>> df.groupby("gender", as_index=False).value_counts(normalize=True) gender education country proportion - 0 female high FR 0.50 - 1 female high US 0.50 + 0 female high US 0.50 + 1 female high FR 0.50 2 male low FR 0.50 3 male low US 0.25 4 male medium FR 0.25 diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 4bf128dc50507..1c2f98c3701d5 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -257,10 +257,10 @@ def test_basic(education_df, request): index=MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), - ("US", "female", "high"), + ("FR", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), ], names=["country", "gender", "education"], ), @@ -480,7 +480,7 @@ def test_data_frame_value_counts( ( False, False, - [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0, 1, 3, 5, 6, 7, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], ), (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), @@ -617,17 +617,17 @@ def test_categorical_single_grouper_with_only_observed_categories( expected_index = MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ], names=["country", "gender", "education"], ) @@ -719,17 +719,17 @@ def test_categorical_single_grouper_observed_true( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] assert_categorical_single_grouper( @@ -799,23 +799,23 @@ def test_categorical_single_grouper_observed_false( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ("ASIA", "female", "high"), - ("ASIA", "female", "low"), - ("ASIA", "female", "medium"), - ("ASIA", "male", "high"), ("ASIA", "male", "low"), ("ASIA", "male", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", "medium"), + ("ASIA", "female", "high"), ] assert_categorical_single_grouper( @@ -957,17 +957,17 @@ def test_categorical_non_groupers( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] expected_series = Series( data=expected_data, From 93d289547545177103efa8097133313f2b76477c Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 11 Sep 2024 21:29:32 -0400 Subject: [PATCH 5/6] Refine whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index de9ea6597974c..31da92755831d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -206,7 +206,7 @@ In cases with mixed-resolution inputs, the highest resolution is used: Changed behavior in ``value_counts`` when ``sort=False`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In previous versions of pandas, :meth:`DataFrame.value_counts` with ``sort=False`` would sort the result by column label (as was documented). This was nonintuitive and inconsistent with :meth:`Series.value_counts` which would maintain the order of the input. Now :meth:`DataFrame.value_counts` will maintain the order of the input. +In previous versions of pandas, :meth:`DataFrame.value_counts` with ``sort=False`` would sort the result by row labels (as was documented). This was nonintuitive and inconsistent with :meth:`Series.value_counts` which would maintain the order of the input. Now :meth:`DataFrame.value_counts` will maintain the order of the input. .. ipython:: python From ede98f7afce837a7af863f251f6ab1cd562a5890 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 11 Sep 2024 21:31:18 -0400 Subject: [PATCH 6/6] Refine whatsnew --- doc/source/whatsnew/v3.0.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 31da92755831d..af61551156fbc 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -203,8 +203,8 @@ In cases with mixed-resolution inputs, the highest resolution is used: .. _whatsnew_300.api_breaking.value_counts_sorting: -Changed behavior in ``value_counts`` when ``sort=False`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Changed behavior in :meth:`DataFrame.value_counts` and :meth:`DataFrameGroupBy.value_counts` when ``sort=False`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions of pandas, :meth:`DataFrame.value_counts` with ``sort=False`` would sort the result by row labels (as was documented). This was nonintuitive and inconsistent with :meth:`Series.value_counts` which would maintain the order of the input. Now :meth:`DataFrame.value_counts` will maintain the order of the input.