Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handling conversion of empty categorical with dtype_backend='pyarrow' #59935

Closed
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,30 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Period.freq GL08" \
-i "pandas.Period.ordinal GL08" \
-i "pandas.RangeIndex.from_range PR01,SA01" \
-i "pandas.Series.cat.add_categories PR01,PR02" \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the merge-conflict resolution went the wrong way here. I think these should likely all be removed.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Applied the changes; it should be OK now.

-i "pandas.Series.cat.as_ordered PR01" \
-i "pandas.Series.cat.as_unordered PR01" \
-i "pandas.Series.cat.remove_categories PR01,PR02" \
-i "pandas.Series.cat.remove_unused_categories PR01" \
-i "pandas.Series.cat.rename_categories PR01,PR02" \
-i "pandas.Series.cat.reorder_categories PR01,PR02" \
-i "pandas.Series.cat.set_categories PR01,PR02" \
-i "pandas.Series.dt.as_unit PR01,PR02" \
-i "pandas.Series.dt.ceil PR01,PR02" \
-i "pandas.Series.dt.day_name PR01,PR02" \
-i "pandas.Series.dt.floor PR01,PR02" \
-i "pandas.Series.dt.freq GL08" \
-i "pandas.Series.dt.month_name PR01,PR02" \
-i "pandas.Series.dt.normalize PR01" \
-i "pandas.Series.dt.round PR01,PR02" \
-i "pandas.Series.dt.strftime PR01,PR02" \
-i "pandas.Series.dt.to_period PR01,PR02" \
-i "pandas.Series.dt.total_seconds PR01" \
-i "pandas.Series.dt.tz_convert PR01,PR02" \
-i "pandas.Series.dt.tz_localize PR01,PR02" \
-i "pandas.Series.dt.unit GL08" \
-i "pandas.Series.pad PR01,SA01" \
-i "pandas.Series.sparse.from_coo PR07,SA01" \
-i "pandas.Timedelta.max PR02" \
-i "pandas.Timedelta.min PR02" \
-i "pandas.Timedelta.resolution PR02" \
Expand All @@ -85,11 +106,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Timestamp.resolution PR02" \
-i "pandas.Timestamp.tzinfo GL08" \
-i "pandas.Timestamp.year GL08" \
-i "pandas.api.types.is_float PR01,SA01" \
-i "pandas.api.types.is_integer PR01,SA01" \
-i "pandas.api.types.is_iterator PR07,SA01" \
-i "pandas.api.types.is_re_compilable PR07,SA01" \
-i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
-i "pandas.arrays.DatetimeArray SA01" \
-i "pandas.arrays.IntegerArray SA01" \
-i "pandas.arrays.IntervalArray.left SA01" \
-i "pandas.arrays.IntervalArray.length SA01" \
Expand Down Expand Up @@ -134,11 +157,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.core.resample.Resampler.var SA01" \
-i "pandas.errors.AttributeConflictWarning SA01" \
-i "pandas.errors.CSSWarning SA01" \
-i "pandas.errors.CategoricalConversionWarning SA01" \
-i "pandas.errors.ChainedAssignmentError SA01" \
-i "pandas.errors.DataError SA01" \
-i "pandas.errors.DuplicateLabelError SA01" \
-i "pandas.errors.IntCastingNaNError SA01" \
-i "pandas.errors.InvalidIndexError SA01" \
-i "pandas.errors.InvalidVersion SA01" \
-i "pandas.errors.NullFrequencyError SA01" \
-i "pandas.errors.NumExprClobberingError SA01" \
-i "pandas.errors.NumbaUtilError SA01" \
Expand All @@ -147,17 +172,24 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.errors.PerformanceWarning SA01" \
-i "pandas.errors.PossibleDataLossError SA01" \
-i "pandas.errors.PossiblePrecisionLoss SA01" \
-i "pandas.errors.SpecificationError SA01" \
-i "pandas.errors.UndefinedVariableError PR01,SA01" \
-i "pandas.errors.UnsortedIndexError SA01" \
-i "pandas.errors.UnsupportedFunctionCall SA01" \
-i "pandas.errors.ValueLabelTypeMismatch SA01" \
-i "pandas.infer_freq SA01" \
-i "pandas.io.json.build_table_schema PR07,RT03,SA01" \
-i "pandas.io.stata.StataReader.data_label SA01" \
-i "pandas.io.stata.StataReader.value_labels RT03,SA01" \
-i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \
-i "pandas.io.stata.StataWriter.write_file SA01" \
-i "pandas.json_normalize RT03,SA01" \
-i "pandas.period_range RT03,SA01" \
-i "pandas.plotting.andrews_curves RT03,SA01" \
-i "pandas.plotting.lag_plot RT03,SA01" \
-i "pandas.plotting.scatter_matrix PR07,SA01" \
-i "pandas.set_eng_float_format RT03,SA01" \
-i "pandas.testing.assert_extension_array_equal SA01" \
-i "pandas.tseries.offsets.BDay PR02,SA01" \
-i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \
-i "pandas.tseries.offsets.BQuarterBegin.n GL08" \
Expand Down
1 change: 1 addition & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1143,6 +1143,7 @@ def convert_dtypes(
base_dtype.kind == "O" # type: ignore[union-attr]
and input_array.size > 0
and isna(input_array).all()
and not isinstance(input_array.dtype, CategoricalDtype)
):
import pyarrow as pa

Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/frame/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

Expand Down Expand Up @@ -35,6 +37,26 @@ def test_convert_empty(self):
empty_df = pd.DataFrame()
tm.assert_frame_equal(empty_df, empty_df.convert_dtypes())

@td.skip_if_no("pyarrow")
def test_convert_empty_categorical_to_pyarrow(self):
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
# GH#59934
df = pd.DataFrame(
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
{
"A": pd.Categorical([None] * 5),
"B": pd.Categorical([None] * 5, categories=["B1", "B2"]),
}
)
converted = df.convert_dtypes(dtype_backend="pyarrow")
expected = df
tm.assert_frame_equal(converted, expected)

assert converted.A.dtype == "category", "Dtype in column A is not 'category'"
assert converted.B.dtype == "category", "Dtype in column B is not 'category'"
assert converted.A.cat.categories.empty, "Categories in column A are not empty"
assert converted.B.cat.categories.isin(
["B1", "B2"]
).all(), "Categories in column B doesn't contain adequate categories"
Comment on lines +53 to +58
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are all covered by `tm.assert_frame_equal` above and can be removed.


def test_convert_dtypes_retain_column_names(self):
# GH#41435
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/series/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from pandas._libs import lib
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm
Expand Down Expand Up @@ -297,3 +298,20 @@ def test_convert_dtypes_pyarrow_null(self):
result = ser.convert_dtypes(dtype_backend="pyarrow")
expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null()))
tm.assert_series_equal(result, expected)

@td.skip_if_no("pyarrow")
def test_convert_empty_categorical_to_pyarrow(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def test_convert_empty_categorical_to_pyarrow(self):
def test_convert_empty_categorical_to_pyarrow(self):
pytest.importorskip("pyarrow")

# GH#59934
ser1 = pd.Series(pd.Categorical([None] * 5))
converted1 = ser1.convert_dtypes(dtype_backend="pyarrow")
expected = ser1

tm.assert_series_equal(converted1, expected)
assert converted1.dtype == "category", "Series dtype is not 'category'"
assert converted1.cat.categories.empty, "Series categories are not empty"
Comment on lines +310 to +311
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly, these are covered by `tm.assert_series_equal` above and can be removed.


ser2 = pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"]))
converted2 = ser2.convert_dtypes(dtype_backend="pyarrow")
assert converted2.cat.categories.isin(
["S1", "S2"]
).all(), "Categories in ser2 doesn't contain adequate categories"
Comment on lines +315 to +317
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly, this assertion can be removed.

Loading