Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/2.3.x' into backport-59433
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Dec 17, 2024
2 parents 712d19f + 3362822 commit 60c6f84
Show file tree
Hide file tree
Showing 145 changed files with 1,452 additions and 846 deletions.
1 change: 0 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ jobs:
- checkout
- run: .circleci/setup_env.sh
- run: |
sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \
LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \
ci/run_tests.sh
Expand Down
10 changes: 7 additions & 3 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ enhancement1
Other enhancements
^^^^^^^^^^^^^^^^^^

- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
updated to work correctly with NumPy >= 2 (:issue:`57739`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
-

Expand Down Expand Up @@ -104,10 +107,10 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-

Interval
^^^^^^^^
Expand All @@ -116,7 +119,7 @@ Interval

Indexing
^^^^^^^^
-
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
-

Missing
Expand Down Expand Up @@ -171,7 +174,8 @@ Styler

Other
^^^^^
-
- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2``
are not installed (:issue:`60196`)
-

.. ---------------------------------------------------------------------------
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
class MaskedUInt8Engine(MaskedIndexEngine): ...
class MaskedBoolEngine(MaskedUInt8Engine): ...

class StringObjectEngine(ObjectEngine):
    # Stub for the Cython StringObjectEngine (pandas/_libs/index.pyx): an
    # ObjectEngine for string-holding object arrays that also carries the
    # dtype's missing-value sentinel (``na_value``) so lookups of nulls can
    # be normalized to it.
    # NOTE(review): ``na_value`` is intentionally unannotated here — it may
    # be pd.NA or np.nan depending on the StringDtype variant; confirm
    # against the .pyx implementation before tightening the type.
    def __init__(self, values: object, na_value) -> None: ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
Expand Down
26 changes: 26 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,32 @@ cdef class ObjectEngine(IndexEngine):
return loc


cdef class StringObjectEngine(ObjectEngine):
    # Index engine for object-dtype ndarrays holding strings (string dtype
    # backed by object storage).  Extends ObjectEngine with knowledge of the
    # dtype's missing-value sentinel so that null lookups resolve to
    # ``na_value`` and non-string keys raise KeyError instead of matching.

    cdef:
        object na_value   # the dtype's missing-value sentinel (pd.NA or NaN)
        bint uses_na      # True when the sentinel is pd.NA (C_NA)

    def __init__(self, ndarray values, na_value):
        super().__init__(values)
        self.na_value = na_value
        # Precompute which null representation this engine recognizes: when
        # the sentinel is pd.NA, only pd.NA counts as null; otherwise NaN does.
        self.uses_na = na_value is C_NA

    cdef bint _checknull(self, object val):
        # Null check restricted to the representation this dtype actually
        # uses, so e.g. NaN is NOT treated as missing under a pd.NA dtype.
        if self.uses_na:
            return val is C_NA
        else:
            return util.is_nan(val)

    cdef _check_type(self, object val):
        # Validate/normalize a lookup key: strings pass through unchanged,
        # the dtype's null representation is mapped to ``na_value``, and any
        # other type is rejected as a key that cannot be in the index.
        if isinstance(val, str):
            return val
        elif self._checknull(val):
            return self.na_value
        else:
            raise KeyError(val)


cdef class DatetimeEngine(Int64Engine):

cdef:
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: Literal[False] = ...,
convert_string: Literal[False] = ...,
convert_to_nullable_dtype: Literal[False] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> npt.NDArray[np.object_ | np.number]: ...
Expand All @@ -97,6 +98,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: Literal[True] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
Expand All @@ -108,6 +110,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
Expand Down
13 changes: 12 additions & 1 deletion pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects,
bint convert_numeric=True, # NB: different default!
bint convert_to_nullable_dtype=False,
bint convert_non_numeric=False,
bint convert_string=True,
object dtype_if_all_nat=None) -> "ArrayLike":
"""
Type inference function-- convert object array to proper dtype
Expand Down Expand Up @@ -2741,7 +2742,17 @@ def maybe_convert_objects(ndarray[object] objects,
seen.object_ = True

elif seen.str_:
if using_string_dtype() and is_string_array(objects, skipna=True):
if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif (
convert_string
and using_string_dtype()
and is_string_array(objects, skipna=True)
):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(na_value=np.nan)
Expand Down
2 changes: 2 additions & 0 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,8 @@ def shares_memory(left, right) -> bool:
if isinstance(left, MultiIndex):
return shares_memory(left._codes, right)
if isinstance(left, (Index, Series)):
if isinstance(right, (Index, Series)):
return shares_memory(left._values, right._values)
return shares_memory(left._values, right)

if isinstance(left, NDArrayBackedExtensionArray):
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
pa_version_under14p1,
pa_version_under16p0,
pa_version_under17p0,
pa_version_under18p0,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -191,6 +192,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
"pa_version_under14p1",
"pa_version_under16p0",
"pa_version_under17p0",
"pa_version_under18p0",
"HAS_PYARROW",
"IS64",
"ISMUSL",
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
pa_version_under15p0 = _palv < Version("15.0.0")
pa_version_under16p0 = _palv < Version("16.0.0")
pa_version_under17p0 = _palv < Version("17.0.0")
pa_version_under18p0 = _palv < Version("18.0.0")
HAS_PYARROW = True
except ImportError:
pa_version_under10p1 = True
Expand All @@ -28,4 +29,5 @@
pa_version_under15p0 = True
pa_version_under16p0 = True
pa_version_under17p0 = True
pa_version_under18p0 = False
HAS_PYARROW = False
10 changes: 5 additions & 5 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ def multiindex_year_month_day_dataframe_random_data():
"""
tdf = DataFrame(
np.random.default_rng(2).standard_normal((100, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=100, freq="B"),
)
ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()
Expand Down Expand Up @@ -743,7 +743,7 @@ def string_series() -> Series:
"""
return Series(
np.arange(30, dtype=np.float64) * 1.1,
index=Index([f"i_{i}" for i in range(30)], dtype=object),
index=Index([f"i_{i}" for i in range(30)]),
name="series",
)

Expand All @@ -754,7 +754,7 @@ def object_series() -> Series:
Fixture for Series of dtype object with Index of unique strings
"""
data = [f"foo_{i}" for i in range(30)]
index = Index([f"bar_{i}" for i in range(30)], dtype=object)
index = Index([f"bar_{i}" for i in range(30)])
return Series(data, index=index, name="objects", dtype=object)


Expand Down Expand Up @@ -846,8 +846,8 @@ def int_frame() -> DataFrame:
"""
return DataFrame(
np.ones((30, 4), dtype=np.int64),
index=Index([f"foo_{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"foo_{i}" for i in range(30)]),
columns=Index(list("ABCD")),
)


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/arrow/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:

def _validate(self, data):
dtype = data.dtype
if not isinstance(dtype, ArrowDtype):
if pa_version_under10p1 or not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))

Expand Down
37 changes: 32 additions & 5 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,16 @@ def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
return self.to_numpy(dtype=dtype)
if copy is False:
# TODO: By using `zero_copy_only` it may be possible to implement this
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
elif copy is None:
# `to_numpy(copy=False)` has the meaning of NumPy `copy=None`.
copy = False

return self.to_numpy(dtype=dtype, copy=copy)

def __invert__(self) -> Self:
# This is a bit wise op for integer types
Expand Down Expand Up @@ -728,7 +737,7 @@ def _cmp_method(self, other, op):
try:
result[valid] = op(np_array[valid], other)
except TypeError:
result = ops.invalid_comparison(np_array, other, op)
result = ops.invalid_comparison(self, other, op)
result = pa.array(result, type=pa.bool_())
result = pc.if_else(valid, result, None)
else:
Expand Down Expand Up @@ -1125,7 +1134,7 @@ def fillna(
try:
fill_value = self._box_pa(value, pa_type=self._pa_array.type)
except pa.ArrowTypeError as err:
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'"
raise TypeError(msg) from err

try:
Expand Down Expand Up @@ -1624,7 +1633,11 @@ def _accumulate(
else:
data_to_accum = data_to_accum.cast(pa.int64())

result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
try:
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
except pa.ArrowNotImplementedError as err:
msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
raise TypeError(msg) from err

if convert_to_int:
result = result.cast(pa_dtype)
Expand Down Expand Up @@ -2117,7 +2130,7 @@ def _maybe_convert_setitem_value(self, value):
try:
value = self._box_pa(value, self._pa_array.type)
except pa.ArrowTypeError as err:
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'"
raise TypeError(msg) from err
return value

Expand Down Expand Up @@ -2276,6 +2289,20 @@ def _groupby_op(
**kwargs,
):
if isinstance(self.dtype, StringDtype):
if how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
return super()._groupby_op(
how=how,
has_dropped_na=has_dropped_na,
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2369,6 +2369,20 @@ def _groupby_op(
# GH#43682
if isinstance(self.dtype, StringDtype):
# StringArray
if op.how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
if op.how not in ["any", "all"]:
# Fail early to avoid conversion to object
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
Expand Down
33 changes: 25 additions & 8 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,11 +577,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
raise ValueError("Cannot convert float NaN to integer")

elif len(self.codes) == 0 or len(self.categories) == 0:
result = np.array(
self,
dtype=dtype,
copy=copy,
)
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
result = np.asarray(self, dtype=dtype)
else:
result = np.array(self, dtype=dtype)

else:
# GH8628 (PERF): astype category codes instead of astyping array
Expand Down Expand Up @@ -1642,6 +1643,17 @@ def __array__(
"""
The numpy array interface.
Users should not call this directly. Rather, it is invoked by
:func:`numpy.array` and :func:`numpy.asarray`.
Parameters
----------
dtype : np.dtype or None
Specifies the dtype for the array.
copy : bool or None, optional
See :func:`numpy.asarray`.
Returns
-------
numpy.array
Expand All @@ -1659,13 +1671,18 @@ def __array__(
>>> np.asarray(cat)
array(['a', 'b'], dtype=object)
"""
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

ret = take_nd(self.categories._values, self._codes)
if dtype and np.dtype(dtype) != self.categories.dtype:
return np.asarray(ret, dtype)
# When we're a Categorical[ExtensionArray], like Interval,
# we need to ensure __array__ gets all the way to an
# ndarray.
return np.asarray(ret)

# `take_nd` should already make a copy, so don't force again.
return np.asarray(ret, dtype=dtype)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
Expand Down
Loading

0 comments on commit 60c6f84

Please sign in to comment.