Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/2.3.x' into backport-59433
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Dec 17, 2024
2 parents 712d19f + 3362822 commit 60c6f84
Show file tree
Hide file tree
Showing 145 changed files with 1,452 additions and 846 deletions.
1 change: 0 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ jobs:
- checkout
- run: .circleci/setup_env.sh
- run: |
sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \
LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \
ci/run_tests.sh
Expand Down
10 changes: 7 additions & 3 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ enhancement1
Other enhancements
^^^^^^^^^^^^^^^^^^

- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
updated to work correctly with NumPy >= 2 (:issue:`57739`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
-

Expand Down Expand Up @@ -104,10 +107,10 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-

Interval
^^^^^^^^
Expand All @@ -116,7 +119,7 @@ Interval

Indexing
^^^^^^^^
-
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
-

Missing
Expand Down Expand Up @@ -171,7 +174,8 @@ Styler

Other
^^^^^
-
- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2``
are not installed (:issue:`60196`)
-

.. ---------------------------------------------------------------------------
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
class MaskedUInt8Engine(MaskedIndexEngine): ...
class MaskedBoolEngine(MaskedUInt8Engine): ...

class StringObjectEngine(ObjectEngine):
    # Stub for the Cython StringObjectEngine (pandas/_libs/index.pyx): an
    # ObjectEngine for string-holding object arrays that also carries the
    # dtype's missing-value sentinel (``na_value``) so lookups of nulls can
    # be normalized to it.
    # NOTE(review): ``na_value`` is intentionally unannotated here — it may
    # be pd.NA or np.nan depending on the StringDtype variant; confirm
    # against the .pyx implementation before tightening the type.
    def __init__(self, values: object, na_value) -> None: ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
Expand Down
26 changes: 26 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,32 @@ cdef class ObjectEngine(IndexEngine):
return loc


cdef class StringObjectEngine(ObjectEngine):
    # Index engine for object-dtype ndarrays holding strings (string dtype
    # backed by object storage).  Extends ObjectEngine with knowledge of the
    # dtype's missing-value sentinel so that null lookups resolve to
    # ``na_value`` and non-string keys raise KeyError instead of matching.

    cdef:
        object na_value   # the dtype's missing-value sentinel (pd.NA or NaN)
        bint uses_na      # True when the sentinel is pd.NA (C_NA)

    def __init__(self, ndarray values, na_value):
        super().__init__(values)
        self.na_value = na_value
        # Precompute which null representation this engine recognizes: when
        # the sentinel is pd.NA, only pd.NA counts as null; otherwise NaN does.
        self.uses_na = na_value is C_NA

    cdef bint _checknull(self, object val):
        # Null check restricted to the representation this dtype actually
        # uses, so e.g. NaN is NOT treated as missing under a pd.NA dtype.
        if self.uses_na:
            return val is C_NA
        else:
            return util.is_nan(val)

    cdef _check_type(self, object val):
        # Validate/normalize a lookup key: strings pass through unchanged,
        # the dtype's null representation is mapped to ``na_value``, and any
        # other type is rejected as a key that cannot be in the index.
        if isinstance(val, str):
            return val
        elif self._checknull(val):
            return self.na_value
        else:
            raise KeyError(val)


cdef class DatetimeEngine(Int64Engine):

cdef:
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: Literal[False] = ...,
convert_string: Literal[False] = ...,
convert_to_nullable_dtype: Literal[False] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> npt.NDArray[np.object_ | np.number]: ...
Expand All @@ -97,6 +98,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: Literal[True] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
Expand All @@ -108,6 +110,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
Expand Down
13 changes: 12 additions & 1 deletion pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects,
bint convert_numeric=True, # NB: different default!
bint convert_to_nullable_dtype=False,
bint convert_non_numeric=False,
bint convert_string=True,
object dtype_if_all_nat=None) -> "ArrayLike":
"""
Type inference function-- convert object array to proper dtype
Expand Down Expand Up @@ -2741,7 +2742,17 @@ def maybe_convert_objects(ndarray[object] objects,
seen.object_ = True

elif seen.str_:
if using_string_dtype() and is_string_array(objects, skipna=True):
if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif (
convert_string
and using_string_dtype()
and is_string_array(objects, skipna=True)
):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(na_value=np.nan)
Expand Down
2 changes: 2 additions & 0 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,8 @@ def shares_memory(left, right) -> bool:
if isinstance(left, MultiIndex):
return shares_memory(left._codes, right)
if isinstance(left, (Index, Series)):
if isinstance(right, (Index, Series)):
return shares_memory(left._values, right._values)
return shares_memory(left._values, right)

if isinstance(left, NDArrayBackedExtensionArray):
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
pa_version_under14p1,
pa_version_under16p0,
pa_version_under17p0,
pa_version_under18p0,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -191,6 +192,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
"pa_version_under14p1",
"pa_version_under16p0",
"pa_version_under17p0",
"pa_version_under18p0",
"HAS_PYARROW",
"IS64",
"ISMUSL",
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
pa_version_under15p0 = _palv < Version("15.0.0")
pa_version_under16p0 = _palv < Version("16.0.0")
pa_version_under17p0 = _palv < Version("17.0.0")
pa_version_under18p0 = _palv < Version("18.0.0")
HAS_PYARROW = True
except ImportError:
pa_version_under10p1 = True
Expand All @@ -28,4 +29,5 @@
pa_version_under15p0 = True
pa_version_under16p0 = True
pa_version_under17p0 = True
pa_version_under18p0 = False
HAS_PYARROW = False
10 changes: 5 additions & 5 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ def multiindex_year_month_day_dataframe_random_data():
"""
tdf = DataFrame(
np.random.default_rng(2).standard_normal((100, 4)),
columns=Index(list("ABCD"), dtype=object),
columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=100, freq="B"),
)
ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()
Expand Down Expand Up @@ -743,7 +743,7 @@ def string_series() -> Series:
"""
return Series(
np.arange(30, dtype=np.float64) * 1.1,
index=Index([f"i_{i}" for i in range(30)], dtype=object),
index=Index([f"i_{i}" for i in range(30)]),
name="series",
)

Expand All @@ -754,7 +754,7 @@ def object_series() -> Series:
Fixture for Series of dtype object with Index of unique strings
"""
data = [f"foo_{i}" for i in range(30)]
index = Index([f"bar_{i}" for i in range(30)], dtype=object)
index = Index([f"bar_{i}" for i in range(30)])
return Series(data, index=index, name="objects", dtype=object)


Expand Down Expand Up @@ -846,8 +846,8 @@ def int_frame() -> DataFrame:
"""
return DataFrame(
np.ones((30, 4), dtype=np.int64),
index=Index([f"foo_{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"foo_{i}" for i in range(30)]),
columns=Index(list("ABCD")),
)


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/arrow/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:

def _validate(self, data):
dtype = data.dtype
if not isinstance(dtype, ArrowDtype):
if pa_version_under10p1 or not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))

Expand Down
37 changes: 32 additions & 5 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,16 @@ def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
return self.to_numpy(dtype=dtype)
if copy is False:
# TODO: By using `zero_copy_only` it may be possible to implement this
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
elif copy is None:
# `to_numpy(copy=False)` has the meaning of NumPy `copy=None`.
copy = False

return self.to_numpy(dtype=dtype, copy=copy)

def __invert__(self) -> Self:
# This is a bit wise op for integer types
Expand Down Expand Up @@ -728,7 +737,7 @@ def _cmp_method(self, other, op):
try:
result[valid] = op(np_array[valid], other)
except TypeError:
result = ops.invalid_comparison(np_array, other, op)
result = ops.invalid_comparison(self, other, op)
result = pa.array(result, type=pa.bool_())
result = pc.if_else(valid, result, None)
else:
Expand Down Expand Up @@ -1125,7 +1134,7 @@ def fillna(
try:
fill_value = self._box_pa(value, pa_type=self._pa_array.type)
except pa.ArrowTypeError as err:
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'"
raise TypeError(msg) from err

try:
Expand Down Expand Up @@ -1624,7 +1633,11 @@ def _accumulate(
else:
data_to_accum = data_to_accum.cast(pa.int64())

result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
try:
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
except pa.ArrowNotImplementedError as err:
msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
raise TypeError(msg) from err

if convert_to_int:
result = result.cast(pa_dtype)
Expand Down Expand Up @@ -2117,7 +2130,7 @@ def _maybe_convert_setitem_value(self, value):
try:
value = self._box_pa(value, self._pa_array.type)
except pa.ArrowTypeError as err:
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'"
raise TypeError(msg) from err
return value

Expand Down Expand Up @@ -2276,6 +2289,20 @@ def _groupby_op(
**kwargs,
):
if isinstance(self.dtype, StringDtype):
if how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
return super()._groupby_op(
how=how,
has_dropped_na=has_dropped_na,
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2369,6 +2369,20 @@ def _groupby_op(
# GH#43682
if isinstance(self.dtype, StringDtype):
# StringArray
if op.how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
if op.how not in ["any", "all"]:
# Fail early to avoid conversion to object
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
Expand Down
33 changes: 25 additions & 8 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,11 +577,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
raise ValueError("Cannot convert float NaN to integer")

elif len(self.codes) == 0 or len(self.categories) == 0:
result = np.array(
self,
dtype=dtype,
copy=copy,
)
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
result = np.asarray(self, dtype=dtype)
else:
result = np.array(self, dtype=dtype)

else:
# GH8628 (PERF): astype category codes instead of astyping array
Expand Down Expand Up @@ -1642,6 +1643,17 @@ def __array__(
"""
The numpy array interface.
Users should not call this directly. Rather, it is invoked by
:func:`numpy.array` and :func:`numpy.asarray`.
Parameters
----------
dtype : np.dtype or None
Specifies the dtype for the array.
copy : bool or None, optional
See :func:`numpy.asarray`.
Returns
-------
numpy.array
Expand All @@ -1659,13 +1671,18 @@ def __array__(
>>> np.asarray(cat)
array(['a', 'b'], dtype=object)
"""
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

ret = take_nd(self.categories._values, self._codes)
if dtype and np.dtype(dtype) != self.categories.dtype:
return np.asarray(ret, dtype)
# When we're a Categorical[ExtensionArray], like Interval,
# we need to ensure __array__ gets all the way to an
# ndarray.
return np.asarray(ret)

# `take_nd` should already make a copy, so don't force again.
return np.asarray(ret, dtype=dtype)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
Expand Down
Loading

0 comments on commit 60c6f84

Please sign in to comment.