diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c986e5b1b054..139ea9d220453 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -34,7 +34,6 @@ jobs: fi python -m pip install --no-build-isolation -ve . -Csetup-args="--werror" PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH - sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ci/run_tests.sh test-linux-musl: docker: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 07fb0c19262a1..899b49cc4eff5 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -385,10 +385,12 @@ jobs: nogil: true - name: Build Environment + # TODO: Once numpy 2.2.1 is out, don't install nightly version + # Tests segfault with numpy 2.2.0: https://github.com/numpy/numpy/pull/27955 run: | python --version - python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1 - python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython numpy python -m pip install versioneer[toml] python -m pip install python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror" diff --git a/ci/code_checks.sh b/ci/code_checks.sh index adc5bc9a01bdd..39cea0c361a72 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -81,10 +81,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.IntervalArray.length SA01" \ -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \ @@ -95,14 +93,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.std SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.errors.NullFrequencyError SA01" \ - -i "pandas.errors.NumbaUtilError SA01" \ - -i "pandas.errors.PerformanceWarning SA01" \ - -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ - -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 1e4a851d0e72d..c572559dcc3e0 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -488,7 +488,7 @@ Post-Release for reference): - The pandas-dev and pydata mailing lists - - Twitter, Mastodon, Telegram and LinkedIn + - X, Mastodon, Telegram and LinkedIn 7. 
Update this release instructions to fix anything incorrect and to update about any change since the last release. diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 7680c8b434866..e701d48a89db7 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -185,7 +185,6 @@ Reindexing / selection / label manipulation DataFrame.duplicated DataFrame.equals DataFrame.filter - DataFrame.head DataFrame.idxmax DataFrame.idxmin DataFrame.reindex @@ -196,7 +195,6 @@ Reindexing / selection / label manipulation DataFrame.sample DataFrame.set_axis DataFrame.set_index - DataFrame.tail DataFrame.take DataFrame.truncate diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 1525afcac87f7..b2b5c5cc1014e 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -459,7 +459,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df # List the size of the animals with the highest weight. - df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()], include_groups=False) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) `Using get_group `__ @@ -482,7 +482,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) - expected_df = gb.apply(GrowUp, include_groups=False) + expected_df = gb.apply(GrowUp) expected_df `Expanding apply diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index acb5a2b7919ac..4a32381a7de47 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1074,7 +1074,7 @@ missing values with the ``ffill()`` method. ).set_index("date") df_re - df_re.groupby("group").resample("1D", include_groups=False).ffill() + df_re.groupby("group").resample("1D").ffill() .. _groupby.filter: @@ -1252,13 +1252,13 @@ the argument ``group_keys`` which defaults to ``True``. Compare .. ipython:: python - df.groupby("A", group_keys=True).apply(lambda x: x, include_groups=False) + df.groupby("A", group_keys=True).apply(lambda x: x) with .. ipython:: python - df.groupby("A", group_keys=False).apply(lambda x: x, include_groups=False) + df.groupby("A", group_keys=False).apply(lambda x: x) Numba accelerated routines @@ -1742,7 +1742,7 @@ column index name will be used as the name of the inserted column: result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} return pd.Series(result, name="metrics") - result = df.groupby("a").apply(compute_metrics, include_groups=False) + result = df.groupby("a").apply(compute_metrics) result diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 66eeb74b363a3..4b5cdca23103c 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1210,11 +1210,6 @@ You may set the ``xlabel`` and ``ylabel`` arguments to give the plot custom labe for x and y axis. By default, pandas will pick up index name as xlabel, while leaving it empty for ylabel. -.. ipython:: python - :suppress: - - plt.figure(); - .. 
ipython:: python df.plot(); diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ab5746eca1b18..92c67865ae88f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -56,6 +56,7 @@ Other enhancements - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) +- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) @@ -553,6 +554,7 @@ Other Removals - Removed the ``method`` keyword in ``ExtensionArray.fillna``, implement ``ExtensionArray._pad_or_backfill`` instead (:issue:`53621`) - Removed the attribute ``dtypes`` from :class:`.DataFrameGroupBy` (:issue:`51997`) - Enforced deprecation of ``argmin``, ``argmax``, ``idxmin``, and ``idxmax`` returning a result when ``skipna=False`` and an NA value is encountered or all values are NA values; these operations will now raise in such cases (:issue:`33941`, :issue:`51276`) +- Removed specifying ``include_groups=True`` in :class:`.DataFrameGroupBy.apply` and :class:`.Resampler.apply` (:issue:`7155`) .. --------------------------------------------------------------------------- .. _whatsnew_300.performance: @@ -626,6 +628,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`) - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`) - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`) +- Bug in :meth:`to_datetime` on float32 df with year, month, day etc. columns leads to precision issues and incorrect result. (:issue:`60506`) - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`) - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. 
(:issue:`60341`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) @@ -798,7 +801,9 @@ Other - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) +- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`) - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) +- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) .. ***DO NOT USE THIS SECTION*** diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 1c0a99eb1ea25..2657b1b9d197b 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -704,7 +704,7 @@ class NaTType(_NaT): difference between the current timezone and UTC. Returns - -------- + ------- timedelta The difference between UTC and the local time as a `timedelta` object. diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index a3429fc840347..6b4b90167e625 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -2217,7 +2217,7 @@ class Timestamp(_Timestamp): difference between the current timezone and UTC. Returns - -------- + ------- timedelta The difference between UTC and the local time as a `timedelta` object. 
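# Illustrative sketch (editor's addition, not part of the patch): the whatsnew
# entries above describe two user-facing changes in this PR. Assuming pandas
# 3.0 semantics, minimal usage might look like the following; the data and the
# names ``per_group_sum``/``b_sum`` are hypothetical.
import pandas as pd

df = pd.DataFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})

# 1. DataFrameGroupBy.apply no longer operates on the grouping columns, and
#    include_groups=True is no longer accepted, so the keyword is simply dropped.
per_group_sum = df.groupby("a").apply(lambda g: g["b"].sum())

# 2. Rolling.agg (and Expanding/ExponentialMovingWindow.agg) now accept
#    NamedAgg aggregations through **kwargs, mirroring groupby named aggregation.
rolled = df[["b"]].rolling(2).agg(b_sum=pd.NamedAgg(column="b", aggfunc="sum"))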
diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 230522846d377..b220a94d032b5 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -117,7 +117,10 @@ def len(self) -> Series: value_lengths = pc.list_value_length(self._pa_array) return Series( - value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index + value_lengths, + dtype=ArrowDtype(value_lengths.type), + index=self._data.index, + name=self._data.name, ) def __getitem__(self, key: int | slice) -> Series: @@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series: # key = pc.add(key, pc.list_value_length(self._pa_array)) element = pc.list_element(self._pa_array, key) return Series( - element, dtype=ArrowDtype(element.type), index=self._data.index + element, + dtype=ArrowDtype(element.type), + index=self._data.index, + name=self._data.name, ) elif isinstance(key, slice): if pa_version_under11p0: @@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series: if step is None: step = 1 sliced = pc.list_slice(self._pa_array, start, stop, step) - return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index) + return Series( + sliced, + dtype=ArrowDtype(sliced.type), + index=self._data.index, + name=self._data.name, + ) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") @@ -223,7 +234,12 @@ def flatten(self) -> Series: counts = pa.compute.list_value_length(self._pa_array) flattened = pa.compute.list_flatten(self._pa_array) index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type))) - return Series(flattened, dtype=ArrowDtype(flattened.type), index=index) + return Series( + flattened, + dtype=ArrowDtype(flattened.type), + index=index, + name=self._data.name, + ) class StructAccessor(ArrowAccessor): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index bbbf1d9ca60bd..0bf2089df5f85 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1306,6 +1306,20 @@ def length(self) -> Index: """ Return an Index with entries denoting the length of each Interval. + The length of an interval is calculated as the difference between + its `right` and `left` bounds. This property is particularly useful + when working with intervals where the size of the interval is an important + attribute, such as in time-series analysis or spatial data analysis. + + See Also + -------- + arrays.IntervalArray.left : Return the left endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.right : Return the right endpoints of each Interval in + the IntervalArray as an Index. + arrays.IntervalArray.mid : Return the midpoint of each Interval in the + IntervalArray as an Index. + Examples -------- diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 86f83489e71ae..9d844e590582a 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -190,8 +190,8 @@ def eval( .. warning:: - ``eval`` can run arbitrary code which can make you vulnerable to code - injection and untrusted data. + This function can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. 
Parameters ---------- diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index e2acd9a2c97c2..5a5fad0d83d7a 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -65,23 +65,23 @@ def set_numexpr_threads(n=None) -> None: ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b): +def _evaluate_standard(op, op_str, left_op, right_op): """ Standard evaluation. """ if _TEST_MODE: _store_test_result(False) - return op(a, b) + return op(left_op, right_op) -def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: - """return a boolean if we WILL be using numexpr""" +def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool: + """return left_op boolean if we WILL be using numexpr""" if op_str is not None: # required min elements (otherwise we are adding overhead) - if a.size > _MIN_ELEMENTS: + if left_op.size > _MIN_ELEMENTS: # check for dtype compatibility dtypes: set[str] = set() - for o in [a, b]: + for o in [left_op, right_op]: # ndarray and Series Case if hasattr(o, "dtype"): dtypes |= {o.dtype.name} @@ -93,22 +93,22 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool: return False -def _evaluate_numexpr(op, op_str, a, b): +def _evaluate_numexpr(op, op_str, left_op, right_op): result = None - if _can_use_numexpr(op, op_str, a, b, "evaluate"): + if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"): is_reversed = op.__name__.strip("_").startswith("r") if is_reversed: # we were originally called by a reversed op method - a, b = b, a + left_op, right_op = right_op, left_op - a_value = a - b_value = b + left_value = left_op + right_value = right_op try: result = ne.evaluate( - f"a_value {op_str} b_value", - local_dict={"a_value": a_value, "b_value": b_value}, + f"left_value {op_str} right_value", + local_dict={"left_value": left_value, "right_value": right_value}, casting="safe", ) except TypeError: @@ -116,20 +116,20 @@ def _evaluate_numexpr(op, op_str, a, b): # (https://github.com/pydata/numexpr/issues/379) pass except NotImplementedError: - if _bool_arith_fallback(op_str, a, b): + if _bool_arith_fallback(op_str, left_op, right_op): pass else: raise if is_reversed: # reverse order to original for fallback - a, b = b, a + left_op, right_op = right_op, left_op if _TEST_MODE: _store_test_result(result is not None) if result is None: - result = _evaluate_standard(op, op_str, a, b) + result = _evaluate_standard(op, op_str, left_op, right_op) return result @@ -170,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b): } -def _where_standard(cond, a, b): +def _where_standard(cond, left_op, right_op): # Caller is responsible for extracting ndarray if necessary - return np.where(cond, a, b) + return np.where(cond, left_op, right_op) -def _where_numexpr(cond, a, b): +def _where_numexpr(cond, left_op, right_op): # Caller is responsible for extracting ndarray if necessary result = None - if _can_use_numexpr(None, "where", a, b, "where"): + if _can_use_numexpr(None, "where", left_op, right_op, "where"): result = ne.evaluate( "where(cond_value, a_value, b_value)", - local_dict={"cond_value": cond, "a_value": a, "b_value": b}, + local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op}, casting="safe", ) if result is None: - result = _where_standard(cond, a, b) + result = _where_standard(cond, left_op, right_op) return result @@ -206,13 +206,13 @@ def _has_bool_dtype(x): _BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"} -def _bool_arith_fallback(op_str, a, b) 
-> bool: +def _bool_arith_fallback(op_str, left_op, right_op) -> bool: """ Check if we should fallback to the python `_evaluate_standard` in case of an unsupported operation by numexpr, which is the case for some boolean ops. """ - if _has_bool_dtype(a) and _has_bool_dtype(b): + if _has_bool_dtype(left_op) and _has_bool_dtype(right_op): if op_str in _BOOL_OP_UNSUPPORTED: warnings.warn( f"evaluating in Python space because the {op_str!r} " @@ -224,15 +224,15 @@ def _bool_arith_fallback(op_str, a, b) -> bool: return False -def evaluate(op, a, b, use_numexpr: bool = True): +def evaluate(op, left_op, right_op, use_numexpr: bool = True): """ - Evaluate and return the expression of the op on a and b. + Evaluate and return the expression of the op on left_op and right_op. Parameters ---------- op : the actual operand - a : left operand - b : right operand + left_op : left operand + right_op : right operand use_numexpr : bool, default True Whether to try to use numexpr. """ @@ -240,24 +240,27 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: if use_numexpr: # error: "None" not callable - return _evaluate(op, op_str, a, b) # type: ignore[misc] - return _evaluate_standard(op, op_str, a, b) + return _evaluate(op, op_str, left_op, right_op) # type: ignore[misc] + return _evaluate_standard(op, op_str, left_op, right_op) -def where(cond, a, b, use_numexpr: bool = True): +def where(cond, left_op, right_op, use_numexpr: bool = True): """ - Evaluate the where condition cond on a and b. + Evaluate the where condition cond on left_op and right_op. Parameters ---------- cond : np.ndarray[bool] - a : return if cond is True - b : return if cond is False + left_op : return if cond is True + right_op : return if cond is False use_numexpr : bool, default True Whether to try to use numexpr. 
""" assert _where is not None - return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) + if use_numexpr: + return _where(cond, left_op, right_op) + else: + return _where_standard(cond, left_op, right_op) def set_test_mode(v: bool = True) -> None: diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index fe7e27f537b01..166c9d47294cd 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -205,7 +205,7 @@ def generate(self, v) -> str: val = v.tostring(self.encoding) return f"({self.lhs} {self.op} {val})" - def convert_value(self, v) -> TermValue: + def convert_value(self, conv_val) -> TermValue: """ convert the expression that is in the term to something that is accepted by pytables @@ -219,44 +219,44 @@ def stringify(value): kind = ensure_decoded(self.kind) meta = ensure_decoded(self.meta) if kind == "datetime" or (kind and kind.startswith("datetime64")): - if isinstance(v, (int, float)): - v = stringify(v) - v = ensure_decoded(v) - v = Timestamp(v).as_unit("ns") - if v.tz is not None: - v = v.tz_convert("UTC") - return TermValue(v, v._value, kind) + if isinstance(conv_val, (int, float)): + conv_val = stringify(conv_val) + conv_val = ensure_decoded(conv_val) + conv_val = Timestamp(conv_val).as_unit("ns") + if conv_val.tz is not None: + conv_val = conv_val.tz_convert("UTC") + return TermValue(conv_val, conv_val._value, kind) elif kind in ("timedelta64", "timedelta"): - if isinstance(v, str): - v = Timedelta(v) + if isinstance(conv_val, str): + conv_val = Timedelta(conv_val) else: - v = Timedelta(v, unit="s") - v = v.as_unit("ns")._value - return TermValue(int(v), v, kind) + conv_val = Timedelta(conv_val, unit="s") + conv_val = conv_val.as_unit("ns")._value + return TermValue(int(conv_val), conv_val, kind) elif meta == "category": metadata = extract_array(self.metadata, extract_numpy=True) result: npt.NDArray[np.intp] | np.intp | int - if v not in metadata: + if conv_val not in metadata: result = -1 else: - result = metadata.searchsorted(v, side="left") + result = metadata.searchsorted(conv_val, side="left") return TermValue(result, result, "integer") elif kind == "integer": try: - v_dec = Decimal(v) + v_dec = Decimal(conv_val) except InvalidOperation: # GH 54186 # convert v to float to raise float's ValueError - float(v) + float(conv_val) else: - v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) - return TermValue(v, v, kind) + conv_val = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN")) + return TermValue(conv_val, conv_val, kind) elif kind == "float": - v = float(v) - return TermValue(v, v, kind) + conv_val = float(conv_val) + return TermValue(conv_val, conv_val, kind) elif kind == "bool": - if isinstance(v, str): - v = v.strip().lower() not in [ + if isinstance(conv_val, str): + conv_val = conv_val.strip().lower() not in [ "false", "f", "no", @@ -268,13 +268,15 @@ def stringify(value): "", ] else: - v = bool(v) - return TermValue(v, v, kind) - elif isinstance(v, str): + conv_val = bool(conv_val) + return TermValue(conv_val, conv_val, kind) + elif isinstance(conv_val, str): # string quoting - return TermValue(v, stringify(v), "string") + return TermValue(conv_val, stringify(conv_val), "string") else: - raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column") + raise TypeError( + f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column" + ) def convert_values(self) -> None: pass diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py 
index 6fa21d9410187..b0c8ec1ffc083 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -430,7 +430,7 @@ def is_period_dtype(arr_or_dtype) -> bool: Check whether an array-like or dtype is of the Period dtype. .. deprecated:: 2.2.0 - Use isinstance(dtype, pd.Period) instead. + Use isinstance(dtype, pd.PeriodDtype) instead. Parameters ---------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 33a419925f70c..851bc1ce4075c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1018,7 +1018,7 @@ def shape(self) -> tuple[int, int]: See Also -------- - ndarray.shape : Tuple of array dimensions. + numpy.ndarray.shape : Tuple of array dimensions. Examples -------- @@ -4476,8 +4476,10 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No """ Query the columns of a DataFrame with a boolean expression. - This method can run arbitrary code which can make you vulnerable to code - injection if you pass user input to this function. + .. warning:: + + This method can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. Parameters ---------- @@ -4634,6 +4636,11 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: """ Evaluate a string describing operations on DataFrame columns. + .. warning:: + + This method can run arbitrary code which can make you vulnerable to code + injection if you pass user input to this function. + Operates on columns only, not specific rows or elements. This allows `eval` to run arbitrary code, which can make you vulnerable to code injection if you pass user input to this function. @@ -5009,7 +5016,7 @@ def assign(self, **kwargs) -> DataFrame: Parameters ---------- - **kwargs : dict of {str: callable or Series} + **kwargs : callable or Series The column names are keywords. If the values are callable, they are computed on the DataFrame and assigned to the new columns. The callable must not diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3a48cc8a66076..de7fb3682fb4f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -640,7 +640,7 @@ def ndim(self) -> int: See Also -------- - ndarray.ndim : Number of array dimensions. + numpy.ndarray.ndim : Number of array dimensions. Examples -------- @@ -665,7 +665,7 @@ def size(self) -> int: See Also -------- - ndarray.size : Number of elements in the array. + numpy.ndarray.size : Number of elements in the array. Examples -------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f0513be3498d1..f4ba40e275a8d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1393,7 +1393,7 @@ def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs): # ----------------------------------------------------------------- # apply/agg/transform - def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: + def apply(self, func, *args, include_groups: bool = False, **kwargs) -> NDFrameT: """ Apply function ``func`` group-wise and combine the results together. @@ -1419,7 +1419,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: *args : tuple Optional positional arguments to pass to ``func``. - include_groups : bool, default True + include_groups : bool, default False When True, will attempt to apply ``func`` to the groupings in the case that they are columns of the DataFrame. 
If this raises a TypeError, the result will be computed with the groupings excluded. @@ -1427,10 +1427,9 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: .. versionadded:: 2.2.0 - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. + The default changed from True to False, and True is no longer allowed. **kwargs : dict Optional keyword arguments to pass to ``func``. @@ -1520,7 +1519,7 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: each group together into a Series, including setting the index as appropriate: - >>> g1.apply(lambda x: x.C.max() - x.B.min(), include_groups=False) + >>> g1.apply(lambda x: x.C.max() - x.B.min()) A a 5 b 2 @@ -1529,11 +1528,13 @@ def apply(self, func, *args, include_groups: bool = True, **kwargs) -> NDFrameT: Example 4: The function passed to ``apply`` returns ``None`` for one of the group. This group is filtered from the result: - >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x, include_groups=False) + >>> g1.apply(lambda x: None if x.iloc[0, 0] == 3 else x) B C 0 1 4 1 2 6 """ + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") if isinstance(func, str): if hasattr(self, func): res = getattr(self, func) @@ -1560,33 +1561,7 @@ def f(g): else: f = func - if not include_groups: - return self._python_apply_general(f, self._obj_with_exclusions) - - try: - result = self._python_apply_general(f, self._selected_obj) - if ( - not isinstance(self.obj, Series) - and self._selection is None - and self._selected_obj.shape != self._obj_with_exclusions.shape - ): - warnings.warn( - message=_apply_groupings_depr.format(type(self).__name__, "apply"), - category=DeprecationWarning, - stacklevel=find_stack_level(), - ) - except TypeError: - # gh-20949 - # try again, with .apply acting as a filtering - # operation, by excluding the grouping column - # This would normally not be triggered - # except if the udf is trying an operation that - # fails on *some* columns, e.g. a numeric operation - # on a string grouper column - - return self._python_apply_general(f, self._obj_with_exclusions) - - return result + return self._python_apply_general(f, self._obj_with_exclusions) @final def _python_apply_general( @@ -3424,7 +3399,9 @@ def describe( return result @final - def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resampler: + def resample( + self, rule, *args, include_groups: bool = False, **kwargs + ) -> Resampler: """ Provide resampling when using a TimeGrouper. @@ -3449,10 +3426,9 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp .. versionadded:: 2.2.0 - .. deprecated:: 2.2.0 + .. versionchanged:: 3.0 - Setting include_groups to True is deprecated. Only the value - False will be allowed in a future version of pandas. + The default was changed to False, and True is no longer allowed. **kwargs Possible arguments are `how`, `fill_method`, `limit`, `kind` and @@ -3485,7 +3461,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the DataFrame into 3 minute bins and sum the values of the timestamps falling into a bin. 
- >>> df.groupby("a").resample("3min", include_groups=False).sum() + >>> df.groupby("a").resample("3min").sum() b a 0 2000-01-01 00:00:00 2 @@ -3494,7 +3470,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Upsample the series into 30 second bins. - >>> df.groupby("a").resample("30s", include_groups=False).sum() + >>> df.groupby("a").resample("30s").sum() b a 0 2000-01-01 00:00:00 1 @@ -3508,7 +3484,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Resample by month. Values are assigned to the month of the period. - >>> df.groupby("a").resample("ME", include_groups=False).sum() + >>> df.groupby("a").resample("ME").sum() b a 0 2000-01-31 3 @@ -3517,11 +3493,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> ( - ... df.groupby("a") - ... .resample("3min", closed="right", include_groups=False) - ... .sum() - ... ) + >>> (df.groupby("a").resample("3min", closed="right").sum()) b a 0 1999-12-31 23:57:00 1 @@ -3532,11 +3504,7 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp the bin interval, but label each bin using the right edge instead of the left. - >>> ( - ... df.groupby("a") - ... .resample("3min", closed="right", label="right", include_groups=False) - ... .sum() - ... ) + >>> (df.groupby("a").resample("3min", closed="right", label="right").sum()) b a 0 2000-01-01 00:00:00 1 @@ -3545,11 +3513,10 @@ def resample(self, rule, *args, include_groups: bool = True, **kwargs) -> Resamp """ from pandas.core.resample import get_resampler_for_grouping - # mypy flags that include_groups could be specified via `*args` or `**kwargs` - # GH#54961 would resolve. - return get_resampler_for_grouping( # type: ignore[misc] - self, rule, *args, include_groups=include_groups, **kwargs - ) + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") + + return get_resampler_for_grouping(self, rule, *args, **kwargs) @final def rolling( @@ -5561,13 +5528,3 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde mi = MultiIndex(levels=levels, codes=codes, names=[idx.name, None]) return mi - - -# GH#7155 -_apply_groupings_depr = ( - "{}.{} operated on the grouping columns. This behavior is deprecated, " - "and in a future version of pandas the grouping columns will be excluded " - "from the operation. Either pass `include_groups=False` to exclude the " - "groupings or explicitly select the grouping columns after groupby to silence " - "this warning." 
-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fdfb9f21bdb9f..27e498683bf8f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -31,10 +31,7 @@ Substitution, doc, ) -from pandas.util._exceptions import ( - find_stack_level, - rewrite_warning, -) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -59,7 +56,6 @@ from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, - _apply_groupings_depr, _pipe_template, get_groupby, ) @@ -167,14 +163,15 @@ def __init__( gpr_index: Index, group_keys: bool = False, selection=None, - include_groups: bool = True, + include_groups: bool = False, ) -> None: + if include_groups: + raise ValueError("include_groups=True is no longer allowed.") self._timegrouper = timegrouper self.keys = None self.sort = True self.group_keys = group_keys self.as_index = True - self.include_groups = include_groups self.obj, self.ax, self._indexer = self._timegrouper._set_grouper( self._convert_obj(obj), sort=True, gpr_index=gpr_index @@ -465,9 +462,7 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # a DataFrame column, but aggregate_item_by_item operates column-wise # on Series, raising AttributeError or KeyError # (depending on whether the column lookup uses getattr/__getitem__) - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -479,21 +474,23 @@ def _groupby_and_aggregate(self, how, *args, **kwargs): # we have a non-reducing function # try to evaluate - result = _apply( - grouped, how, *args, include_groups=self.include_groups, **kwargs - ) + result = grouped.apply(how, *args, **kwargs) return self._wrap_result(result) @final def _get_resampler_for_grouping( - self, groupby: GroupBy, key, include_groups: bool = True + self, + groupby: GroupBy, + key, ): """ Return the correct class for resampling with groupby. """ return self._resampler_for_grouping( - groupby=groupby, key=key, parent=self, include_groups=include_groups + groupby=groupby, + key=key, + parent=self, ) def _wrap_result(self, result): @@ -694,7 +691,7 @@ def bfill(self, limit: int | None = None): References ---------- - .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics) + .. [1] https://en.wikipedia.org/wiki/Imputation_%28statistics%29 Examples -------- @@ -935,7 +932,7 @@ def interpolate( "supported. If you tried to resample and interpolate on a " "grouped data frame, please use:\n" "`df.groupby(...).apply(lambda x: x.resample(...)." - "interpolate(...), include_groups=False)`" + "interpolate(...))`" "\ninstead, as resampling and interpolation has to be " "performed for each group independently." 
) @@ -1541,7 +1538,6 @@ def __init__( groupby: GroupBy, key=None, selection: IndexLabel | None = None, - include_groups: bool = False, ) -> None: # reached via ._gotitem and _get_resampler_for_grouping @@ -1564,7 +1560,6 @@ def __init__( self.ax = parent.ax self.obj = parent.obj - self.include_groups = include_groups @no_type_check def _apply(self, f, *args, **kwargs): @@ -1581,7 +1576,7 @@ def func(x): return x.apply(f, *args, **kwargs) - result = _apply(self._groupby, func, include_groups=self.include_groups) + result = self._groupby.apply(func) return self._wrap_result(result) _upsample = _apply @@ -1937,7 +1932,6 @@ def get_resampler_for_grouping( fill_method=None, limit: int | None = None, on=None, - include_groups: bool = True, **kwargs, ) -> Resampler: """ @@ -1946,9 +1940,7 @@ def get_resampler_for_grouping( # .resample uses 'on' similar to how .groupby uses 'key' tg = TimeGrouper(freq=rule, key=on, **kwargs) resampler = tg._get_resampler(groupby.obj) - return resampler._get_resampler_for_grouping( - groupby=groupby, include_groups=include_groups, key=tg.key - ) + return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key) class TimeGrouper(Grouper): @@ -2727,18 +2719,3 @@ def _asfreq_compat(index: FreqIndexT, freq) -> FreqIndexT: else: # pragma: no cover raise TypeError(type(index)) return new_index - - -def _apply( - grouped: GroupBy, how: Callable, *args, include_groups: bool, **kwargs -) -> DataFrame: - # GH#7155 - rewrite warning to appear as if it came from `.resample` - target_message = "DataFrameGroupBy.apply operated on the grouping columns" - new_message = _apply_groupings_depr.format("DataFrameGroupBy", "resample") - with rewrite_warning( - target_message=target_message, - target_category=DeprecationWarning, - new_message=new_message, - ): - result = grouped.apply(how, *args, include_groups=include_groups, **kwargs) - return result diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 05e1a36877e06..e5b434edacc59 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1374,6 +1374,11 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. + Determines whether each string in the Series or Index starts with a + match to a specified regular expression. This function is especially + useful for validating prefixes, such as ensuring that codes, tags, or + identifiers begin with a specific pattern. + Parameters ---------- pat : str @@ -1419,6 +1424,11 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. + Checks if each string in the Series or Index fully matches the + specified regular expression pattern. This function is useful when the + requirement is for an entire string to conform to a pattern, such as + validating formats like phone numbers or email addresses. + Parameters ---------- pat : str @@ -1647,6 +1657,10 @@ def repeat(self, repeats): """ Duplicate each string in the Series or Index. + Duplicates each string in the Series or Index, either by applying the + same repeat count to all elements or by using different repeat values + for each element. + Parameters ---------- repeats : int or sequence of int @@ -1710,6 +1724,12 @@ def pad( """ Pad strings in the Series/Index up to width. 
+ This function pads strings in a Series or Index to a specified width, + filling the extra space with a character of your choice. It provides + flexibility in positioning the padding, allowing it to be added to the + left, right, or both sides. This is useful for formatting strings to + align text or ensure consistent string lengths in data processing. + Parameters ---------- width : int @@ -1920,6 +1940,11 @@ def slice(self, start=None, stop=None, step=None): """ Slice substrings from each element in the Series or Index. + Slicing substrings from strings in a Series or Index helps extract + specific portions of data, making it easier to analyze or manipulate + text. This is useful for tasks like parsing structured text fields or + isolating parts of strings with a consistent format. + Parameters ---------- start : int, optional @@ -1996,6 +2021,11 @@ def slice_replace(self, start=None, stop=None, repl=None): """ Replace a positional slice of a string with another value. + This function allows replacing specific parts of a string in a Series + or Index by specifying start and stop positions. It is useful for + modifying substrings in a controlled way, such as updating sections of + text based on their positions or patterns. + Parameters ---------- start : int, optional @@ -3670,7 +3700,7 @@ def casefold(self): Series.str.isupper : Check whether all characters are uppercase. Examples - ------------ + -------- The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). Words are assumed to be as any sequence of non-numeric characters separated by diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4680a63bf57a1..30487de7bafd5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -44,6 +44,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_float, + is_float_dtype, is_integer, is_integer_dtype, is_list_like, @@ -1153,6 +1154,10 @@ def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) + # prevent prevision issues in case of float32 # GH#60506 + if is_float_dtype(values.dtype): + values = values.astype("float64") + # prevent overflow in case of int8 or int16 if is_integer_dtype(values.dtype): values = values.astype("int64") diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 43a3c03b6cef9..73e4de6ea6208 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -490,7 +490,7 @@ def online( klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -981,7 +981,7 @@ def reset(self) -> None: """ self._mean.reset() - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): raise NotImplementedError("aggregate is not implemented.") def std(self, bias: bool = False, *args, **kwargs): diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 4bf77b3d38689..bff3a1660eba9 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -167,7 +167,7 @@ def _get_window_indexer(self) -> BaseIndexer: klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/window/rolling.py 
b/pandas/core/window/rolling.py index 4446b21976069..385ffb901acf0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -44,7 +44,10 @@ from pandas.core._numba import executor from pandas.core.algorithms import factorize -from pandas.core.apply import ResamplerWindowApply +from pandas.core.apply import ( + ResamplerWindowApply, + reconstruct_func, +) from pandas.core.arrays import ExtensionArray from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -646,8 +649,12 @@ def _numba_apply( out = obj._constructor(result, index=index, columns=columns) return self._resolve_output(out, obj) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): + relabeling, func, columns, order = reconstruct_func(func, **kwargs) result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() + if isinstance(result, ABCDataFrame) and relabeling: + result = result.iloc[:, order] + result.columns = columns # type: ignore[union-attr] if result is None: return self.apply(func, raw=False, args=args, kwargs=kwargs) return result @@ -1239,7 +1246,7 @@ def calc(x): klass="Series/DataFrame", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: # these must apply directly @@ -1951,7 +1958,7 @@ def _raise_monotonic_error(self, msg: str): klass="Series/Dataframe", axis="", ) - def aggregate(self, func, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 1de6f06ef316c..f150de3d217f2 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -45,6 +45,11 @@ class NullFrequencyError(ValueError): Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``, ``PeriodIndex.shift``. + See Also + -------- + Index.shift : Shift values of Index. + Series.shift : Shift values of Series. + Examples -------- >>> df = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) @@ -58,6 +63,12 @@ class PerformanceWarning(Warning): """ Warning raised when there is a possible performance impact. + See Also + -------- + DataFrame.set_index : Set the DataFrame index using existing columns. + DataFrame.loc : Access a group of rows and columns by label(s) \ + or a boolean array. + Examples -------- >>> df = pd.DataFrame( @@ -385,6 +396,13 @@ class NumbaUtilError(Exception): """ Error raised for unsupported Numba engine routines. + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + Series.groupby : Group Series using a mapper or by a Series of columns. + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- >>> df = pd.DataFrame( @@ -570,6 +588,20 @@ class UndefinedVariableError(NameError): It will also specify whether the undefined variable is local or not. + Parameters + ---------- + name : str + The name of the undefined variable. + is_local : bool or None, optional + Indicates whether the undefined variable is considered a local variable. + If ``True``, the error message specifies it as a local variable. + If ``False`` or ``None``, the variable is treated as a non-local name. 
+ + See Also + -------- + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + DataFrame.eval : Evaluate a string describing operations on DataFrame columns. + Examples -------- >>> df = pd.DataFrame({"A": [1, 1, 1]}) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 17460eae8c049..46ecb2b9a8f12 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -78,7 +78,6 @@ ) from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.reshape.concat import concat from pandas.io.common import ( check_parent_directory, @@ -245,7 +244,11 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], series.iloc[-row_num:])) + _len = len(series) + _slice = np.hstack( + [np.arange(row_num), np.arange(_len - row_num, _len)] + ) + series = series.iloc[_slice] self.tr_row_num = row_num else: self.tr_row_num = None diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index eb6773310da69..6f164c4b97514 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1644,7 +1644,7 @@ def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None: for j in attrs.columns: ser = attrs[j] for i, c in ser.items(): - if not c: + if not c or pd.isna(c): continue css_list = maybe_convert_css_to_tuples(c) if axis == 0: diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 9d250ee5c08ce..7879be18b52c9 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -239,9 +239,16 @@ def build_table_schema( """ Create a Table schema from ``data``. + This method is a utility to generate a JSON-serializable schema + representation of a pandas Series or DataFrame, compatible with the + Table Schema specification. It enables structured data to be shared + and validated in various applications, ensuring consistency and + interoperability. + Parameters ---------- - data : Series, DataFrame + data : Series or DataFrame + The input data for which the table schema is to be created. index : bool, default True Whether to include ``data.index`` in the schema. primary_key : bool or None, default True @@ -256,6 +263,12 @@ def build_table_schema( Returns ------- dict + A dictionary representing the Table schema. + + See Also + -------- + DataFrame.to_json : Convert the object to a JSON string. + read_json : Convert a JSON string to pandas object. 
Notes ----- diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7d265bc430125..b75dc6c3a43b4 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5297,6 +5297,8 @@ def _dtype_to_kind(dtype_str: str) -> str: kind = "integer" elif dtype_str == "object": kind = "object" + elif dtype_str == "str": + kind = "str" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3c0c5cc64c24c..5652d7fab0c7c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -241,7 +241,7 @@ def read_sql_table( # pyright: ignore[reportOverlappingOverload] schema=..., index_col: str | list[str] | None = ..., coerce_float=..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., columns: list[str] | None = ..., chunksize: None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -255,7 +255,7 @@ def read_sql_table( schema=..., index_col: str | list[str] | None = ..., coerce_float=..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., columns: list[str] | None = ..., chunksize: int = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -268,7 +268,7 @@ def read_sql_table( schema: str | None = None, index_col: str | list[str] | None = None, coerce_float: bool = True, - parse_dates: list[str] | dict[str, str] | None = None, + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None, columns: list[str] | None = None, chunksize: int | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, @@ -372,7 +372,7 @@ def read_sql_query( # pyright: ignore[reportOverlappingOverload] index_col: str | list[str] | None = ..., coerce_float=..., params: list[Any] | Mapping[str, Any] | None = ..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., chunksize: None = ..., dtype: DtypeArg | None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -386,7 +386,7 @@ def read_sql_query( index_col: str | list[str] | None = ..., coerce_float=..., params: list[Any] | Mapping[str, Any] | None = ..., - parse_dates: list[str] | dict[str, str] | None = ..., + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ..., chunksize: int = ..., dtype: DtypeArg | None = ..., dtype_backend: DtypeBackend | lib.NoDefault = ..., @@ -399,7 +399,7 @@ def read_sql_query( index_col: str | list[str] | None = None, coerce_float: bool = True, params: list[Any] | Mapping[str, Any] | None = None, - parse_dates: list[str] | dict[str, str] | None = None, + parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None, chunksize: int | None = None, dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index fbf9009cedc40..aee872f9ae50a 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -570,18 +570,23 @@ def boxplot_frame_groupby( Parameters ---------- - grouped : Grouped DataFrame + grouped : DataFrameGroupBy + The grouped DataFrame object over which to create the box plots. subplots : bool * ``False`` - no subplots will be used * ``True`` - create a subplot for each group. - column : column name or list of names, or vector Can be any valid input to groupby. 
fontsize : float or str - rot : label rotation angle - grid : Setting this to True will show the grid + Font size for the labels. + rot : float + Rotation angle of labels (in degrees) on the x-axis. + grid : bool + Whether to show grid lines on the plot. ax : Matplotlib axis object, default None - figsize : A tuple (width, height) in inches + The axes on which to draw the plots. If None, uses the current axes. + figsize : tuple of (float, float) + The figure size in inches (width, height). layout : tuple (optional) The layout of the plot: (rows, columns). sharex : bool, default False @@ -599,8 +604,15 @@ def boxplot_frame_groupby( Returns ------- - dict of key/value = group key/DataFrame.boxplot return value - or DataFrame.boxplot return value in case subplots=figures=False + dict or DataFrame.boxplot return value + If ``subplots=True``, returns a dictionary of group keys to the boxplot + return values. If ``subplots=False``, returns the boxplot return value + of a single DataFrame. + + See Also + -------- + DataFrame.boxplot : Create a box plot from a DataFrame. + Series.plot : Plot a Series. Examples -------- diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 68682344f98ca..5ad30a68ae3c9 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -20,6 +20,7 @@ import pandas as pd import pandas.core.common as com +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import ( @@ -54,7 +55,8 @@ def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> ticks = ax.get_xticks() if is_vertical else ax.get_yticks() if len(ticks) != len(labels): i, remainder = divmod(len(ticks), len(labels)) - assert remainder == 0, remainder + if Version(mpl.__version__) < Version("3.10"): + assert remainder == 0, remainder labels *= i if is_vertical: ax.set_xticklabels(labels, **kwargs) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index d5624aecd1215..8ee75e7fe553e 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -56,7 +56,7 @@ def format_date_labels(ax: Axes, rot) -> None: fig = ax.get_figure() if fig is not None: # should always be a Figure but can technically be None - maybe_adjust_figure(fig, bottom=0.2) + maybe_adjust_figure(fig, bottom=0.2) # type: ignore[arg-type] def table( diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 7face74dcbc89..b20f8ac5f4796 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -178,14 +178,21 @@ def scatter_matrix( """ Draw a matrix of scatter plots. + Each pair of numeric columns in the DataFrame is plotted against each other, + resulting in a matrix of scatter plots. The diagonal plots can display either + histograms or Kernel Density Estimation (KDE) plots for each variable. + Parameters ---------- frame : DataFrame + The data to be plotted. alpha : float, optional Amount of transparency applied. figsize : (float,float), optional A tuple (width, height) in inches. ax : Matplotlib axis object, optional + An existing Matplotlib axis object for the plots. If None, a new axis is + created. grid : bool, optional Setting this to True will show the grid. diagonal : {'hist', 'kde'} @@ -208,6 +215,14 @@ def scatter_matrix( numpy.ndarray A matrix of scatter plots. + See Also + -------- + plotting.parallel_coordinates : Plots parallel coordinates for multivariate data. 
+ plotting.andrews_curves : Generates Andrews curves for visualizing clusters of + multivariate data. + plotting.radviz : Creates a RadViz visualization. + plotting.bootstrap_plot : Visualizes uncertainty in data via bootstrap sampling. + Examples -------- diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index bab8566a06dc2..60cade97ab528 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -113,13 +113,9 @@ def test_groupby_extension_transform(self, data_for_grouping): def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) + df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op) df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c6ac6368f2770..6dd1f3f15bc15 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1647,7 +1647,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index ec979ac6d22dc..011bf0b2016b2 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -101,6 +101,31 @@ def test_fillna_limit_series(self, data_missing): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. 
def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index c15952339ef18..d5e94382b8314 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -320,7 +318,6 @@ def test_corrwith_non_timeseries_data(self): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -334,9 +331,8 @@ def test_corrwith_with_objects(self, using_infer_string): df2["obj"] = "bar" if using_infer_string: - import pyarrow as pa - - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + msg = "Cannot perform reduction 'mean' with string dtype" + with pytest.raises(TypeError, match=msg): df1.corrwith(df2) else: with pytest.raises(TypeError, match="Could not convert"): diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 1685f9ee331f5..bf01ec73cf72b 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -135,13 +133,9 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - if using_infer_string: - expected = Series([np.array(["bar"])]) - else: - expected = Series(["bar"]) + expected = Series(np.array("bar")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index d38bc06260a0e..36088cceb13f1 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -31,6 +32,9 @@ def test_to_numpy_copy(self): # and that can be respected because we are already numpy-float assert df.to_numpy(copy=False).base is df.values.base + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3d8213cb3d11a..9b6080603f0c9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2404,6 +2404,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 57c803c23b001..dae7fe2575c22 
100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1858,10 +1858,7 @@ def test_unstack_bug(self, future_stack): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) - + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack(future_stack=future_stack) tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) diff --git a/pandas/tests/groupby/__init__.py b/pandas/tests/groupby/__init__.py index 446d9da437771..79046cd7ed415 100644 --- a/pandas/tests/groupby/__init__.py +++ b/pandas/tests/groupby/__init__.py @@ -2,7 +2,7 @@ def get_groupby_method_args(name, obj): """ Get required arguments for a groupby method. - When parametrizing a test over groupby methods (e.g. "sum", "mean", "fillna"), + When parametrizing a test over groupby methods (e.g. "sum", "mean"), it is often the case that arguments are required for certain methods. Parameters @@ -16,7 +16,7 @@ def get_groupby_method_args(name, obj): ------- A tuple of required arguments for the method. """ - if name in ("nth", "fillna", "take"): + if name in ("nth", "take"): return (0,) if name == "quantile": return (0.5,) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index ce78b58e5d8f4..1c016143d50c3 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -499,17 +499,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[0])["B"].iloc[0] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] + assert ts == grouped.apply(lambda x: x.iloc[-1])["B"].iloc[0] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8ca6593a19f20..1050f8154572a 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -324,12 +324,9 @@ def test_against_frame_and_seriesgroupby( ) if frame: # compare against apply with DataFrame value_counts - warn = DeprecationWarning if groupby == "column" else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gp.apply( - _frame_value_counts, ["gender", "education"], normalize, sort, ascending - ) + expected = gp.apply( + _frame_value_counts, ["gender", "education"], normalize, sort, ascending + ) if as_index: tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1a4127ab49b0e..62d4a0ddcc0f5 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -27,12 +27,9 @@ def 
test_apply_func_that_appends_group_to_list_without_copy(): def store(group): groups.append(group) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("index").apply(store) - expected_value = DataFrame( - {"index": [0] * 10, 0: [1] * 10}, index=pd.RangeIndex(0, 100, 10) - ) + df.groupby("index").apply(store) + expected_value = DataFrame({0: [1] * 10}, index=pd.RangeIndex(0, 100, 10)) + expected_value.columns = expected_value.columns.astype(object) tm.assert_frame_equal(groups[0], expected_value) @@ -111,11 +108,7 @@ def test_apply_index_date_object(): ] exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date") expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("date", group_keys=False).apply( - lambda x: x["time"][x["value"].idxmax()] - ) + result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) tm.assert_series_equal(result, expected) @@ -189,9 +182,7 @@ def f_constant_df(group): for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: del names[:] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("a", group_keys=False).apply(func) + df.groupby("a").apply(func) assert names == group_names @@ -209,11 +200,9 @@ def test_group_apply_once_per_group2(capsys): index=["0", "2", "4", "6", "8", "10", "12", "14"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df.groupby("group_by_column", group_keys=False).apply( - lambda df: print("function_called") - ) + df.groupby("group_by_column", group_keys=False).apply( + lambda df: print("function_called") + ) result = capsys.readouterr().out.count("function_called") # If `groupby` behaves unexpectedly, this test will break @@ -233,15 +222,27 @@ def slow(group): def fast(group): return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - fast_df = df.groupby("A", group_keys=False).apply(fast) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - slow_df = df.groupby("A", group_keys=False).apply(slow) - + fast_df = df.groupby("A", group_keys=False).apply(fast) + slow_df = df.groupby("A", group_keys=False).apply(slow) tm.assert_frame_equal(fast_df, slow_df) +def test_apply_fast_slow_identical_index(): + # GH#44803 + df = DataFrame( + { + "name": ["Alice", "Bob", "Carl"], + "age": [20, 21, 20], + } + ).set_index("name") + + grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group) + grp_by_copy = df.groupby(["age"], group_keys=False).apply( + lambda group: group.copy() + ) + tm.assert_frame_equal(grp_by_same_value, grp_by_copy) + + @pytest.mark.parametrize( "func", [ @@ -258,11 +259,8 @@ def test_groupby_apply_identity_maybecopy_index_identical(func): # transparent to the user df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("g", group_keys=False).apply(func) - tm.assert_frame_equal(result, df) + result = df.groupby("g", group_keys=False).apply(func) + 
tm.assert_frame_equal(result, df[["a", "b"]]) def test_apply_with_mixed_dtype(): @@ -273,19 +271,19 @@ def test_apply_with_mixed_dtype(): "foo2": ["one", "two", "two", "three", "one", "two"], } ) - result = df.apply(lambda x: x, axis=1).dtypes - expected = df.dtypes - tm.assert_series_equal(result, expected) + result = df.apply(lambda x: x, axis=1) + expected = df + tm.assert_frame_equal(result, expected) # GH 3610 incorrect dtype conversion with as_index=False df = DataFrame({"c1": [1, 2, 6, 6, 8]}) df["c2"] = df.c1 / 2.0 - result1 = df.groupby("c2").mean().reset_index().c2 - result2 = df.groupby("c2", as_index=False).mean().c2 - tm.assert_series_equal(result1, result2) + result1 = df.groupby("c2").mean().reset_index() + result2 = df.groupby("c2", as_index=False).mean() + tm.assert_frame_equal(result1, result2) -def test_groupby_as_index_apply(): +def test_groupby_as_index_apply(as_index): # GH #4648 and #3417 df = DataFrame( { @@ -294,38 +292,41 @@ def test_groupby_as_index_apply(): "time": range(6), } ) + gb = df.groupby("user_id", as_index=as_index) - g_as = df.groupby("user_id", as_index=True) - g_not_as = df.groupby("user_id", as_index=False) - - res_as = g_as.head(2).index - res_not_as = g_not_as.head(2).index - exp = Index([0, 1, 2, 4]) - tm.assert_index_equal(res_as, exp) - tm.assert_index_equal(res_not_as, exp) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res_as_apply = g_as.apply(lambda x: x.head(2)).index - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index + expected = DataFrame( + { + "item_id": ["b", "b", "a", "a"], + "user_id": [1, 2, 1, 3], + "time": [0, 1, 2, 4], + }, + index=[0, 1, 2, 4], + ) + result = gb.head(2) + tm.assert_frame_equal(result, expected) # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = Index([0, 2, 1, 4]) - tp = [(1, 0), (1, 2), (2, 1), (3, 4)] - exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) - - tm.assert_index_equal(res_as_apply, exp_as_apply) - tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) + if as_index: + tp = [(1, 0), (1, 2), (2, 1), (3, 4)] + index = MultiIndex.from_tuples(tp, names=["user_id", None]) + else: + index = Index([0, 2, 1, 4]) + expected = DataFrame( + { + "item_id": list("baba"), + "time": [0, 2, 1, 4], + }, + index=index, + ) + result = gb.apply(lambda x: x.head(2)) + tm.assert_frame_equal(result, expected) def test_groupby_as_index_apply_str(): ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index + res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) @@ -354,19 +355,13 @@ def desc3(group): # weirdo return result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(desc) + result = grouped.apply(desc) assert result.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = grouped.apply(desc2) + result2 = grouped.apply(desc2) assert 
result2.index.names == ("A", "B", "stat") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result3 = grouped.apply(desc3) + result3 = grouped.apply(desc3) assert result3.index.names == ("A", "B", None) @@ -396,9 +391,7 @@ def test_apply_series_yield_constant(df): def test_apply_frame_yield_constant(df): # GH13568 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["A", "B"]).apply(len) + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None @@ -409,9 +402,7 @@ def test_apply_frame_yield_constant(df): def test_apply_frame_to_series(df): grouped = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) @@ -420,9 +411,7 @@ def test_apply_frame_to_series(df): def test_apply_frame_not_as_index_column_name(df): # GH 35964 - path within _wrap_applied_output not hit by a test grouped = df.groupby(["A", "B"], as_index=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(len) + result = grouped.apply(len) expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan tm.assert_index_equal(result.index, expected.index) @@ -445,9 +434,7 @@ def trans2(group): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(trans) + result = df.groupby("A").apply(trans) exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) assert result.name == "C" @@ -476,10 +463,8 @@ def test_apply_chunk_view(group_keys): # Low level tinkering could be unsafe, make sure not df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) - expected = df.take([0, 1, 3, 4, 6, 7]) + result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2]) + expected = df[["value"]].take([0, 1, 3, 4, 6, 7]) if group_keys: expected.index = MultiIndex.from_arrays( [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None] @@ -499,9 +484,7 @@ def test_apply_no_name_column_conflict(): # it works! 
#2605 grouped = df.groupby(["name", "name2"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grouped.apply(lambda x: x.sort_values("value", inplace=True)) + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): @@ -518,11 +501,9 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) @@ -544,13 +525,10 @@ def f(group): group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("d", group_keys=False).apply(f) + result = df.groupby("d", group_keys=False).apply(f) - expected = df.copy() + expected = df[["c", "v"]] expected["v2"] = np.tile([0.0, 0.5, 1], 2) - tm.assert_frame_equal(result, expected) @@ -584,11 +562,8 @@ def filt2(x): else: return x[x.category == "c"] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = data.groupby("id_field").apply(filt1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = data.groupby("id_field").apply(filt2) + expected = data.groupby("id_field").apply(filt1) + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -601,19 +576,11 @@ def test_apply_with_duplicated_non_sorted_axis(test_series): if test_series: ser = df.set_index("Y")["X"] result = ser.groupby(level=0, group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_index() - expected = ser.sort_index() + expected = ser tm.assert_series_equal(result, expected) else: - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("Y", group_keys=False).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_values("Y") - expected = df.sort_values("Y") + result = df.groupby("Y", group_keys=False).apply(lambda x: x) + expected = df[["X"]] tm.assert_frame_equal(result, expected) @@ -654,9 +621,7 @@ def f(g): g["value3"] = g["value1"] * 2 return g - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert "value3" in result @@ -670,13 +635,9 @@ def test_apply_numeric_coercion_when_datetime(): df = DataFrame( {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["Number"]).apply(lambda 
x: x.iloc[0]) + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) tm.assert_series_equal(result["Str"], expected["Str"]) @@ -689,9 +650,7 @@ def test_apply_numeric_coercion_when_datetime_getitem(): def get_B(g): return g.iloc[0][["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(get_B)["B"] + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) @@ -718,11 +677,8 @@ def predictions(tool): ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df1.groupby("Key").apply(predictions).p1 - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df2.groupby("Key").apply(predictions).p1 + expected = df1.groupby("Key").apply(predictions).p1 + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -737,13 +693,11 @@ def test_apply_aggregating_timedelta_and_datetime(): } ) df["time_delta_zero"] = df.datetime - df.datetime - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("clientid").apply( - lambda ddf: Series( - {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} - ) + result = df.groupby("clientid").apply( + lambda ddf: Series( + {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()} ) + ) expected = DataFrame( { "clientid": ["A", "B", "C"], @@ -786,15 +740,11 @@ def func_with_no_date(batch): def func_with_date(batch): return Series({"b": datetime(2015, 1, 1), "c": 2}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1]) dfg_no_conversion_expected.index.name = "a" - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = DataFrame( {"b": pd.Timestamp(2015, 1, 1), "c": 2}, index=[1] ) @@ -838,11 +788,8 @@ def test_groupby_apply_all_none(): def test_func(x): pass - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = test_df.groupby("groups").apply(test_func) - expected = DataFrame(columns=test_df.columns) - expected = expected.astype(test_df.dtypes) + result = test_df.groupby("groups").apply(test_func) + expected = DataFrame(columns=["random_vars"], dtype="int64") tm.assert_frame_equal(result, expected) @@ -852,12 +799,12 @@ def test_func(x): [ {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}, [[1, 1], [0, 2]], - {"groups": [1, 1], "vars": [0, 2]}, + {"vars": [0, 2]}, ], [ {"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}, [[2, 2], [1, 3]], - {"groups": [2, 2], "vars": [1, 3]}, + {"vars": [1, 3]}, ], ], ) @@ -870,9 +817,7 @@ def test_func(x): return None return x.iloc[[0, -1]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, 
match=msg): - result1 = test_df1.groupby("groups").apply(test_func) + result1 = test_df1.groupby("groups").apply(test_func) index1 = MultiIndex.from_arrays(out_idx, names=["groups", None]) expected1 = DataFrame(out_data, index=index1) tm.assert_frame_equal(result1, expected1) @@ -882,9 +827,7 @@ def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]}) groups = df.groupby("group") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = groups.apply(lambda group: group[group.value != 1]["value"]) + result = groups.apply(lambda group: group[group.value != 1]["value"]) expected = Series( [0], name="value", @@ -909,9 +852,7 @@ def test_apply_with_mixed_types(meth): def test_func_returns_object(): # GH 28652 df = DataFrame({"a": [1, 2]}, index=Index([1, 2])) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("a").apply(lambda g: g.index) + result = df.groupby("a").apply(lambda g: g.index) expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a")) tm.assert_series_equal(result, expected) @@ -928,9 +869,7 @@ def test_apply_datetime_issue(group_column_dtlike): # standard int values in range(len(num_columns)) df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) + result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -967,9 +906,7 @@ def test_apply_series_return_dataframe_groups(): def most_common_values(df): return Series({c: s.value_counts().index[0] for c, s in df.items()}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = tdf.groupby("day").apply(most_common_values)["userId"] + result = tdf.groupby("day").apply(most_common_values)["userId"] expected = Series( ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) @@ -1010,13 +947,11 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): ], columns=["observation", "color", "mood", "intensity", "score"], ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object expected = Series( - [np.dtype("datetime64[us]"), dtype, dtype, np.int64, dtype], - index=["observation", "color", "mood", "intensity", "score"], + [np.dtype("datetime64[us]"), dtype, np.int64, dtype], + index=["observation", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) @@ -1033,10 +968,8 @@ def test_groupby_apply_datetime_result_dtypes(using_infer_string): def test_apply_index_has_complex_internals(index): # GH 31248 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("group", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["value"]]) @pytest.mark.parametrize( @@ -1058,9 +991,7 @@ def test_apply_index_has_complex_internals(index): def test_apply_function_returns_non_pandas_non_scalar(function, expected_values): # GH 31441 df = DataFrame(["A", "A", "B", "B"], columns=["groups"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("groups").apply(function) + result = df.groupby("groups").apply(function) expected = Series(expected_values, index=Index(["A", "B"], name="groups")) tm.assert_series_equal(result, expected) @@ -1072,9 +1003,7 @@ def fct(group): df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(fct) + result = df.groupby("A").apply(fct) expected = Series( [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A") ) @@ -1085,9 +1014,7 @@ def fct(group): def test_apply_function_index_return(function): # GH: 22541 df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("id").apply(function) + result = df.groupby("id").apply(function) expected = Series( [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])], index=Index([1, 2, 3], name="id"), @@ -1123,9 +1050,7 @@ def test_apply_result_type(group_keys, udf): # We'd like to control whether the group keys end up in the index # regardless of whether the UDF happens to be a transform. 
df = DataFrame({"A": ["a", "b"], "B": [1, 2]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - df_result = df.groupby("A", group_keys=group_keys).apply(udf) + df_result = df.groupby("A", group_keys=group_keys).apply(udf) series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf) if group_keys: @@ -1140,11 +1065,8 @@ def test_result_order_group_keys_false(): # GH 34998 # apply result order should not depend on whether index is the same or just equal df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A", group_keys=False).apply(lambda x: x) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) + result = df.groupby("A", group_keys=False).apply(lambda x: x) + expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy()) tm.assert_frame_equal(result, expected) @@ -1156,15 +1078,8 @@ def test_apply_with_timezones_aware(): df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df1.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df2.groupby("x", group_keys=False).apply( - lambda df: df[["x", "y"]].copy() - ) + result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) + result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy()) tm.assert_frame_equal(result1, result2) @@ -1187,7 +1102,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): # Check output when no other methods are called before .apply() grp = df.groupby(by="a") - result = grp.apply(np.sum, axis=0, include_groups=False) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) # Check output when another method is called before .apply() @@ -1201,7 +1116,7 @@ def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): msg = "" with tm.assert_produces_warning(warn, match=msg): _ = getattr(grp, reduction_func)(*args) - result = grp.apply(np.sum, axis=0, include_groups=False) + result = grp.apply(np.sum, axis=0) tm.assert_frame_equal(result, expected) @@ -1223,14 +1138,12 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): ) grp = df.groupby(["A", "B"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grp.apply(lambda x: x.head(1)) + result = grp.apply(lambda x: x.head(1)) expected = df.iloc[[0, 2, 3]] expected = expected.reset_index() expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]]) - expected = expected.drop(columns=["idx"]) + expected = expected.drop(columns=["A", "B", "idx"]) tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: @@ -1247,10 +1160,8 @@ def test_apply_dropna_with_indexed_same(dropna): }, index=list("xxyxz"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group", 
dropna=dropna, group_keys=False).apply(lambda x: x) - expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] + result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x) + expected = df.dropna()[["col"]] if dropna else df[["col"]].iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected) @@ -1274,9 +1185,7 @@ def test_apply_dropna_with_indexed_same(dropna): def test_apply_as_index_constant_lambda(as_index, expected): # GH 13217 df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]}) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) + result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1) tm.assert_equal(result, expected) @@ -1286,9 +1195,7 @@ def test_sort_index_groups(): {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]}, index=range(5), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("C").apply(lambda x: x.A.sort_index()) + result = df.groupby("C").apply(lambda x: x.A.sort_index()) expected = Series( range(1, 6), index=MultiIndex.from_tuples( @@ -1308,12 +1215,10 @@ def test_positional_slice_groups_datetimelike(): "let": list("abcde"), } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = expected.groupby( - [expected.let, expected.date.dt.date], group_keys=False - ).apply(lambda x: x.iloc[0:]) - tm.assert_frame_equal(result, expected) + result = expected.groupby( + [expected.let, expected.date.dt.date], group_keys=False + ).apply(lambda x: x.iloc[0:]) + tm.assert_frame_equal(result, expected[["date", "vals"]]) def test_groupby_apply_shape_cache_safety(): @@ -1354,32 +1259,27 @@ def test_apply_na(dropna): {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} ) dfgrp = df.groupby("grp", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) tm.assert_frame_equal(result, expected) def test_apply_empty_string_nan_coerce_bug(): # GH#24903 - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = ( - DataFrame( - { - "a": [1, 1, 2, 2], - "b": ["", "", "", ""], - "c": pd.to_datetime([1, 2, 3, 4], unit="s"), - } - ) - .groupby(["a", "b"]) - .apply(lambda df: df.iloc[-1]) + result = ( + DataFrame( + { + "a": [1, 1, 2, 2], + "b": ["", "", "", ""], + "c": pd.to_datetime([1, 2, 3, 4], unit="s"), + } ) + .groupby(["a", "b"]) + .apply(lambda df: df.iloc[-1]) + ) expected = DataFrame( - [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]], - columns=["a", "b", "c"], + [[pd.to_datetime(2, unit="s")], [pd.to_datetime(4, unit="s")]], + columns=["c"], index=MultiIndex.from_tuples([(1, ""), (2, "")], names=["a", "b"]), ) tm.assert_frame_equal(result, expected) @@ -1401,11 +1301,9 @@ def 
test_apply_index_key_error_bug(index_values): }, index=Index(["a2", "a3", "aa"], name="a"), ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = result.groupby("a").apply( - lambda df: Series([df["b"].mean()], index=["b_mean"]) - ) + result = result.groupby("a").apply( + lambda df: Series([df["b"].mean()], index=["b_mean"]) + ) tm.assert_frame_equal(result, expected) @@ -1452,10 +1350,9 @@ def test_apply_index_key_error_bug(index_values): ) def test_apply_nonmonotonic_float_index(arg, idx): # GH 34455 - expected = DataFrame({"col": arg}, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = expected.groupby("col", group_keys=False).apply(lambda x: x) + df = DataFrame({"grp": arg, "col": arg}, index=idx) + result = df.groupby("grp", group_keys=False).apply(lambda x: x) + expected = df[["col"]] tm.assert_frame_equal(result, expected) @@ -1502,19 +1399,12 @@ def test_empty_df(method, op): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("include_groups", [True, False]) -def test_include_groups(include_groups): +def test_include_groups(): # GH#7155 df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) gb = df.groupby("a") - warn = DeprecationWarning if include_groups else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - result = gb.apply(lambda x: x.sum(), include_groups=include_groups) - expected = DataFrame({"a": [2, 2], "b": [7, 5]}, index=Index([1, 2], name="a")) - if not include_groups: - expected = expected[["b"]] - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match="include_groups=True is no longer allowed"): + gb.apply(lambda x: x.sum(), include_groups=True) @pytest.mark.parametrize("func, value", [(max, 2), (min, 1), (sum, 3)]) @@ -1523,7 +1413,7 @@ def test_builtins_apply(func, value): # Builtins act as e.g. 
sum(group), which sums the column labels of group df = DataFrame({0: [1, 1, 2], 1: [3, 4, 5], 2: [3, 4, 5]}) gb = df.groupby(0) - result = gb.apply(func, include_groups=False) + result = gb.apply(func) expected = Series([value, value], index=Index([1, 2], name=0)) tm.assert_series_equal(result, expected) @@ -1544,9 +1434,7 @@ def f_0(grp): return grp.iloc[0] expected = df.groupby("A").first()[["B"]] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_0)[["B"]] + result = df.groupby("A").apply(f_0)[["B"]] tm.assert_frame_equal(result, expected) def f_1(grp): @@ -1554,9 +1442,7 @@ def f_1(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_1)[["B"]] + result = df.groupby("A").apply(f_1)[["B"]] e = expected.copy() e.loc["Tiger"] = np.nan tm.assert_frame_equal(result, e) @@ -1566,9 +1452,7 @@ def f_2(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_2)[["B"]] + result = df.groupby("A").apply(f_2)[["B"]] e = expected.copy() e.loc["Pony"] = np.nan tm.assert_frame_equal(result, e) @@ -1579,9 +1463,7 @@ def f_3(grp): return None return grp.iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_3)[["C"]] + result = df.groupby("A").apply(f_3)[["C"]] e = df.groupby("A").first()[["C"]] e.loc["Pony"] = pd.NaT tm.assert_frame_equal(result, e) @@ -1592,10 +1474,42 @@ def f_4(grp): return None return grp.iloc[0].loc["C"] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").apply(f_4) + result = df.groupby("A").apply(f_4) e = df.groupby("A").first()["C"].copy() e.loc["Pony"] = np.nan e.name = None tm.assert_series_equal(result, e) + + +def test_nonreducer_nonstransform(): + # GH3380, GH60619 + # Was originally testing mutating in a UDF; now kept as an example + # of using apply with a nonreducer and nontransformer. 
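The rewritten apply tests in this patch encode the new behavior directly: the grouping columns are no longer passed to the user function, and ``include_groups=True`` is rejected. A small sketch of both effects, assuming a pandas build that already contains this change; the frame mirrors the three-column example used in the tests above:

    import pandas as pd

    df = pd.DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    gb = df.groupby("g", group_keys=False)

    # The UDF now sees only the non-grouping columns, so an identity apply
    # returns df[["a", "b"]] rather than the full frame.
    result = gb.apply(lambda x: x)
    print(result.columns.tolist())  # ['a', 'b']

    # Passing include_groups=True is rejected outright.
    try:
        gb.apply(lambda x: x.sum(), include_groups=True)
    except ValueError as err:
        print(err)  # mentions that include_groups=True is no longer allowed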
+ df = DataFrame( + { + "cat1": ["a"] * 8 + ["b"] * 6, + "cat2": ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2 + + ["f"] * 2 + + ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2, + "val": np.random.default_rng(2).integers(100, size=14), + } + ) + + def f(x): + x = x.copy() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() + + expected = DataFrame( + { + "cat1": list("aaaabbb"), + "cat2": list("cdefcde"), + "rank": [3.0, 2.0, 5.0, 1.0, 2.0, 4.0, 1.0], + } + ).set_index(["cat1", "cat2"])["rank"] + result = df.groupby("cat1").apply(f) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py deleted file mode 100644 index fa20efad4da77..0000000000000 --- a/pandas/tests/groupby/test_apply_mutate.py +++ /dev/null @@ -1,96 +0,0 @@ -import numpy as np - -import pandas as pd -import pandas._testing as tm - - -def test_group_by_copy(): - # GH#44803 - df = pd.DataFrame( - { - "name": ["Alice", "Bob", "Carl"], - "age": [20, 21, 20], - } - ).set_index("name") - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grp_by_same_value = df.groupby(["age"], group_keys=False).apply( - lambda group: group - ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grp_by_copy = df.groupby(["age"], group_keys=False).apply( - lambda group: group.copy() - ) - tm.assert_frame_equal(grp_by_same_value, grp_by_copy) - - -def test_mutate_groups(): - # GH3380 - - df = pd.DataFrame( - { - "cat1": ["a"] * 8 + ["b"] * 6, - "cat2": ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2 - + ["f"] * 2 - + ["c"] * 2 - + ["d"] * 2 - + ["e"] * 2, - "cat3": [f"g{x}" for x in range(1, 15)], - "val": np.random.default_rng(2).integers(100, size=14), - } - ) - - def f_copy(x): - x = x.copy() - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - def f_no_copy(x): - x["rank"] = x.val.rank(method="min") - return x.groupby("cat2")["rank"].min() - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grpby_copy = df.groupby("cat1").apply(f_copy) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grpby_no_copy = df.groupby("cat1").apply(f_no_copy) - tm.assert_series_equal(grpby_copy, grpby_no_copy) - - -def test_no_mutate_but_looks_like(): - # GH 8467 - # first show's mutation indicator - # second does not, but should yield the same results - df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) - tm.assert_series_equal(result1, result2) - - -def test_apply_function_with_indexing(): - # GH: 33058 - df = pd.DataFrame( - {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]} - ) - - def fn(x): - x.loc[x.index[-1], "col2"] = 0 - return x.col2 - - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["col1"], as_index=False).apply(fn) - expected = pd.Series( - [1, 2, 0, 4, 5, 0], - index=range(6), - name="col2", - ) 
- tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 6d84dae1d25d8..656a61de5d105 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -127,10 +127,8 @@ def test_basic_string(using_infer_string): def f(x): return x.drop_duplicates("person_name").iloc[0] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(f) - expected = x.iloc[[0, 1]].copy() + result = g.apply(f) + expected = x[["person_name"]].iloc[[0, 1]] expected.index = Index([1, 2], name="person_id") dtype = "str" if using_infer_string else object expected["person_name"] = expected["person_name"].astype(dtype) @@ -314,9 +312,7 @@ def test_apply(ordered): # but for transform we should still get back the original index idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(lambda x: 1) + result = grouped.apply(lambda x: 1) tm.assert_series_equal(result, expected) @@ -1357,11 +1353,7 @@ def test_get_nonexistent_category(): # Accessing a Category that is not in the dataframe df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) with pytest.raises(KeyError, match="'vau'"): - df.groupby("var").apply( - lambda rows: DataFrame( - {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} - ) - ) + df.groupby("var").apply(lambda rows: DataFrame({"val": [rows.iloc[-1]["vau"]]})) def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed): @@ -1963,10 +1955,7 @@ def test_category_order_transformer( df = df.set_index(keys) args = get_groupby_method_args(transformation_func, df) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = FutureWarning if transformation_func == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, transformation_func)(*args) + op_result = getattr(gb, transformation_func)(*args) result = op_result.index.get_level_values("a").categories expected = Index([1, 4, 3, 2]) tm.assert_index_equal(result, expected) @@ -2037,10 +2026,7 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde df["a2"] = df["a"] df = df.set_index(keys) gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed) - warn = DeprecationWarning if method == "apply" and index_kind == "range" else None - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) + op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True)) if (method == "transform" or not as_index) and index_kind == "range": result = op_result["a"].cat.categories else: diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 47ad18c9ad2c8..679f7eb7f7f11 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -289,9 +289,7 @@ def test_count(): for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - right 
= df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + right = df.groupby(key).apply(DataFrame.count) tm.assert_frame_equal(left, right) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 702bbfef2be3b..c4c1e7bd9ac4f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -66,11 +66,9 @@ def test_groupby_nonobject_dtype_mixed(): def max_value(group): return group.loc[group["value"].idxmax()] - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - applied = df.groupby("A").apply(max_value) + applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = df.dtypes + expected = df.drop(columns="A").dtypes tm.assert_series_equal(result, expected) @@ -229,11 +227,8 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) # correct result - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result1 = df.groupby("a").apply(f1) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result2 = df2.groupby("a").apply(f1) + result1 = df.groupby("a").apply(f1) + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -1055,17 +1050,13 @@ def summarize_random_name(df): # Provide a different name for each Series. In this case, groupby # should not attempt to propagate the Series name since they are # inconsistent. - return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) + return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["C"]) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize) + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize, "metrics") + metrics = df.groupby("A").apply(summarize, "metrics") assert metrics.columns.name == "metrics" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - metrics = df.groupby("A").apply(summarize_random_name) + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1361,10 +1352,8 @@ def test_dont_clobber_name_column(): {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("key", group_keys=False).apply(lambda x: x) - tm.assert_frame_equal(result, df) + result = df.groupby("key", group_keys=False).apply(lambda x: x) + tm.assert_frame_equal(result, df[["name"]]) def test_skip_group_keys(): @@ -1441,9 +1430,7 @@ def freducex(x): grouped = df.groupby(grouper, group_keys=False) # make sure all these work - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - grouped.apply(f) + grouped.apply(f) grouped.aggregate(freduce) grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) @@ -1464,10 +1451,7 @@ def f(group): names.append(group.name) return group.copy() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - 
df.groupby("a", sort=False, group_keys=False).apply(f) - + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1672,9 +1656,7 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): tm.assert_frame_equal(x, x.sort_values(by=sort_column)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - g.apply(test_sort) + g.apply(test_sort) def test_pivot_table_values_key_error(): @@ -1860,10 +1842,8 @@ def test_empty_groupby_apply_nonunique_columns(): df[3] = df[3].astype(np.int64) df.columns = [0, 1, 2, 0] gb = df.groupby(df[1], group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = gb.apply(lambda x: x) - assert (res.dtypes == df.dtypes).all() + res = gb.apply(lambda x: x) + assert (res.dtypes == df.drop(columns=1).dtypes).all() def test_tuple_as_grouping(): @@ -2098,36 +2078,14 @@ def test_group_on_empty_multiindex(transformation_func, request): df["col_3"] = df["col_3"].astype(int) df["col_4"] = df["col_4"].astype(int) df = df.set_index(["col_1", "col_2"]) - if transformation_func == "fillna": - args = ("ffill",) - else: - args = () - warn = FutureWarning if transformation_func == "fillna" else None - warn_msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args) - with tm.assert_produces_warning(warn, match=warn_msg): - expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0] + result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func) + expected = df.groupby(["col_1"]).transform(transformation_func).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) - warn_msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - result = ( - df["col_3"] - .iloc[:0] - .groupby(["col_1"]) - .transform(transformation_func, *args) - ) - warn_msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=warn_msg): - expected = ( - df["col_3"] - .groupby(["col_1"]) - .transform(transformation_func, *args) - .iloc[:0] - ) + result = df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func) + expected = df["col_3"].groupby(["col_1"]).transform(transformation_func).iloc[:0] if transformation_func in ("diff", "shift"): expected = expected.astype(int) tm.assert_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 060a8b7fd3824..8c4ab42b7be7a 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -323,9 +323,7 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, df = pd.DataFrame(data) gb = df.groupby("groups", dropna=dropna) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) mi_tuples = tuple(zip(data["groups"], selected_data["values"])) mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) diff --git 
a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index a1f4627475bab..3ee9c9ea0c7fd 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -36,11 +36,11 @@ def test_groupby_preserves_subclass(obj, groupby_func): args = get_groupby_method_args(groupby_func, obj) - warn = FutureWarning if groupby_func == "fillna" else None - msg = f"{type(grouped).__name__}.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + warn = FutureWarning if groupby_func == "corrwith" else None + msg = f"{type(grouped).__name__}.corrwith is deprecated" + with tm.assert_produces_warning(warn, match=msg): result1 = getattr(grouped, groupby_func)(*args) - with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False): + with tm.assert_produces_warning(warn, match=msg): result2 = grouped.agg(groupby_func, *args) # Reduction or transformation kernels should preserve type @@ -72,18 +72,11 @@ def func(group): assert group.testattr == "hello" return group.testattr - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - raise_on_extra_warnings=False, - check_stacklevel=False, - ): - result = custom_df.groupby("c").apply(func) + result = custom_df.groupby("c").apply(func) expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c")) tm.assert_series_equal(result, expected) - result = custom_df.groupby("c").apply(func, include_groups=False) + result = custom_df.groupby("c").apply(func) tm.assert_series_equal(result, expected) # https://github.com/pandas-dev/pandas/pull/56761 @@ -124,12 +117,5 @@ def test_groupby_resample_preserves_subclass(obj): df = df.set_index("Date") # Confirm groupby.resample() preserves dataframe type - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning( - DeprecationWarning, - match=msg, - raise_on_extra_warnings=False, - check_stacklevel=False, - ): - result = df.groupby("Buyer").resample("5D").sum() + result = df.groupby("Buyer").resample("5D").sum() assert isinstance(result, obj) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 4e7c0acb127ed..53e9c53efebf7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -233,11 +233,7 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(lambda x: x.sum()) - expected["A"] = [0, 2, 4] - expected = expected.loc[:, ["A", "B"]] + result = g.apply(lambda x: x.sum()) tm.assert_frame_equal(result, expected) def test_grouper_creation_bug2(self): @@ -788,7 +784,7 @@ def test_groupby_apply_empty_with_group_keys_false(self): # different index objects. 
df = DataFrame({"A": [], "B": [], "C": []}) g = df.groupby("A", group_keys=False) - result = g.apply(lambda x: x / x.sum(), include_groups=False) + result = g.apply(lambda x: x / x.sum()) expected = DataFrame({"B": [], "C": []}, index=None) tm.assert_frame_equal(result, expected) @@ -872,9 +868,7 @@ def test_groupby_tuple_keys_handle_multiindex(self): } ) expected = df.sort_values(by=["category_tuple", "num1"]) - result = df.groupby("category_tuple").apply( - lambda x: x.sort_values(by="num1"), include_groups=False - ) + result = df.groupby("category_tuple").apply(lambda x: x.sort_values(by="num1")) expected = expected[result.columns] tm.assert_frame_equal(result.reset_index(drop=True), expected) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index cb4569812f600..0779faa8d8975 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -278,14 +278,11 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): kernel in ("first", "last") or ( # kernels that work on any dtype and don't have numeric_only arg - kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique") + kernel in ("any", "all", "bfill", "ffill", "nth", "nunique") and numeric_only is lib.no_default ) ): - warn = FutureWarning if kernel == "fillna" else None - msg = "DataFrameGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = method(*args, **kwargs) + result = method(*args, **kwargs) assert "b" in result.columns elif has_arg: assert numeric_only is not True diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 1e0a15d0ba796..789105c275625 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -144,7 +144,6 @@ def test_groupby_raises_string( ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), - "fillna": (None, ""), "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -211,10 +210,7 @@ def test_groupby_raises_string( elif groupby_func == "corrwith": msg = "Cannot perform reduction 'mean' with string dtype" - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -301,7 +297,6 @@ def test_groupby_raises_datetime( "cumsum": (TypeError, "datetime64 type does not support operation 'cumsum'"), "diff": (None, ""), "ffill": (None, ""), - "fillna": (None, ""), "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -333,10 +328,7 @@ def test_groupby_raises_datetime( "var": (TypeError, "datetime64 type does not support operation 'var'"), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -457,7 +449,6 @@ def test_groupby_raises_category( r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'", ), "ffill": (None, ""), - "fillna": (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -532,10 +523,7 @@ def test_groupby_raises_category( ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = 
f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" @@ -650,7 +638,6 @@ def test_groupby_raises_category_on_category( ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), - "fillna": (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (ValueError, "empty group due to unobserved categories") if empty_groups @@ -710,10 +697,7 @@ def test_groupby_raises_category_on_category( ), }[groupby_func] - if groupby_func == "fillna": - kind = "Series" if groupby_series else "DataFrame" - warn_msg = f"{kind}GroupBy.fillna is deprecated" - elif groupby_func == "corrwith": + if groupby_func == "corrwith": warn_msg = "DataFrameGroupBy.corrwith is deprecated" else: warn_msg = "" diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a7712d9dc6586..550efe9187fe8 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -483,12 +483,8 @@ def test_timegrouper_apply_return_type_series(self): def sumfunc_series(x): return Series([x["value"].sum()], ("sum",)) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_series) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series) tm.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -504,11 +500,8 @@ def test_timegrouper_apply_return_type_value(self): def sumfunc_value(x): return x.value.sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) + expected = df.groupby(Grouper(key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value) tm.assert_series_equal( result.reset_index(drop=True), expected.reset_index(drop=True) ) @@ -934,9 +927,7 @@ def test_groupby_apply_timegrouper_with_nat_apply_squeeze( assert gb._selected_obj.index.nlevels == 1 # function that returns a Series - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - res = gb.apply(lambda x: x["Quantity"] * 2) + res = gb.apply(lambda x: x["Quantity"] * 2) dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date") expected = DataFrame( diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 022d3d51ded4e..888b97f2e0206 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -329,9 +329,6 @@ def test_transform_transformation_func(transformation_func): if transformation_func == "cumcount": test_op = lambda x: x.transform("cumcount") mock_op = lambda x: Series(range(len(x)), x.index) - elif transformation_func == "fillna": - test_op = lambda x: 
x.transform("fillna", value=0) - mock_op = lambda x: x.fillna(value=0) elif transformation_func == "ngroup": test_op = lambda x: x.transform("ngroup") counter = -1 @@ -534,15 +531,13 @@ def f(group): return group[:1] grouped = df.groupby("c") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = grouped.apply(f) + result = grouped.apply(f) assert result["d"].dtype == np.float64 # this is by definition a mutating operation! for key, group in grouped: - res = f(group) + res = f(group.drop(columns="c")) tm.assert_frame_equal(res, result.loc[key]) @@ -688,18 +683,14 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target): f = gb[["float", "float_missing"]].apply(targop) expected = concat([f, i], axis=1) else: - if op != "shift" or not isinstance(gb_target.get("by"), (str, list)): - warn = None - else: - warn = DeprecationWarning - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(warn, match=msg): - expected = gb.apply(targop) + expected = gb.apply(targop) expected = expected.sort_index(axis=1) if op == "shift": expected["string_missing"] = expected["string_missing"].fillna(np.nan) - expected["string"] = expected["string"].fillna(np.nan) + by = gb_target.get("by") + if not isinstance(by, (str, list)) or (by != "string" and "string" not in by): + expected["string"] = expected["string"].fillna(np.nan) result = gb[expected.columns].transform(op, *args).sort_index(axis=1) tm.assert_frame_equal(result, expected) @@ -1436,11 +1427,7 @@ def test_null_group_str_transformer_series(dropna, transformation_func): dtype = object if transformation_func in ("any", "all") else None buffer.append(Series([np.nan], index=[3], dtype=dtype)) expected = concat(buffer) - - warn = FutureWarning if transformation_func == "fillna" else None - msg = "SeriesGroupBy.fillna is deprecated" - with tm.assert_produces_warning(warn, match=msg): - result = gb.transform(transformation_func, *args) + result = gb.transform(transformation_func, *args) tm.assert_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f858ae137ca4e..73bbfc91028b3 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -59,6 +59,9 @@ def test_repr_floats(self): expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 787461b944bd0..5783a16e81d37 100644 --- 
a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -340,6 +340,9 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 5f934ca3e6e83..58b69d79c65ce 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -525,6 +525,7 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e0e9d4cfc5ccb..7aeded5a6cb7f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3297,3 +3297,23 @@ def test_loc_reindexing_of_empty_index(self): df.loc[Series([False] * 4, index=df.index, name=0), 0] = df[0] expected = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"]) tm.assert_frame_equal(df, expected) + + def test_loc_setitem_matching_index(self): + # GH 25548 + s = Series(0.0, index=list("abcd")) + s1 = Series(1.0, index=list("ab")) + s2 = Series(2.0, index=list("xy")) + + # Test matching indices + s.loc[["a", "b"]] = s1 + + result = s[["a", "b"]] + expected = s1 + tm.assert_series_equal(result, expected) + + # Test unmatched indices + s.loc[["a", "b"]] = s2 + + result = s[["a", "b"]] + expected = Series([np.nan, np.nan], index=["a", "b"]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 18948de72200a..ced4feb9e7eb9 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -800,6 +800,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # we need to use df_expected to check the result. 
tm.assert_frame_equal(rs2, df_expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_excel_interval_no_labels(self, tmp_excel, using_infer_string): # see gh-19242 # diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index 1abe6238d3922..eb221686dd165 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, MultiIndex, @@ -731,7 +729,6 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize( "columns, siunitx", diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index d7db3d5082135..86682e8160762 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -136,6 +136,13 @@ def test_repr_truncation_dataframe_attrs(self): with option_context("display.max_columns", 2, "display.show_dimensions", False): assert repr(df) == " 0 ... 9\n0 0 ... 0" + def test_repr_truncation_series_with_dataframe_attrs(self): + # GH#60568 + ser = Series([0] * 10) + ser.attrs["b"] = DataFrame([]) + with option_context("display.max_rows", 2, "display.show_dimensions", False): + assert repr(ser) == "0 0\n ..\n9 0\ndtype: int64" + def test_max_colwidth_negative_int_raises(self): # Deprecation enforced from: # https://github.com/pandas-dev/pandas/issues/31532 diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 606b19ac0ed75..16c3c6798ff76 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -37,12 +37,11 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) -def test_mode(setup_path, tmp_path, mode): +def test_mode(setup_path, tmp_path, mode, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -91,10 +90,12 @@ def test_mode(setup_path, tmp_path, mode): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) + if using_infer_string: + df.columns = df.columns.astype("str") tm.assert_frame_equal(result, df) -def test_default_mode(tmp_path, setup_path): +def test_default_mode(tmp_path, setup_path, using_infer_string): # read_hdf uses default mode df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -104,7 +105,10 @@ def test_default_mode(tmp_path, setup_path): path = tmp_path / setup_path df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) def test_reopen_handle(tmp_path, setup_path): @@ -163,7 +167,7 @@ def test_reopen_handle(tmp_path, setup_path): assert not store.is_open -def test_open_args(setup_path): +def test_open_args(setup_path, using_infer_string): with tm.ensure_clean(setup_path) as path: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -178,8 +182,13 @@ def 
test_open_args(setup_path): store["df"] = df store.append("df2", df) - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + + tm.assert_frame_equal(store["df"], expected) + tm.assert_frame_equal(store["df2"], expected) store.close() @@ -194,7 +203,7 @@ def test_flush(setup_path): store.flush(fsync=True) -def test_complibs_default_settings(tmp_path, setup_path): +def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): # GH15943 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -207,7 +216,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complevel=9) result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -218,7 +231,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complib="zlib") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -229,7 +246,11 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -308,6 +329,7 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -325,6 +347,7 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index bbe1cd77e0d9f..03622faa2b5a8 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -19,7 +17,6 @@ class TestHDFStoreSubclass: # GH 33748 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 70422a0ea6edc..7ff3d24336f00 100644 --- 
a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -19,8 +19,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( WASM, is_platform_windows, @@ -365,7 +363,6 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 845f369d3090f..d18f098267599 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -1070,28 +1070,43 @@ def test_boxplot_series_positions(self, hist_df): tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] # if horizontal, yticklabels are rotated - ax = df.plot.box(rot=50, fontsize=8, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(rot=50, fontsize=8, **kwargs) _check_ticks_props(ax, xrot=0, yrot=50, ylabelsize=8) _check_text_labels(ax.get_yticklabels(), labels) assert len(ax.lines) == 7 * len(numeric_cols) - @pytest.mark.filterwarnings("ignore:Attempt:UserWarning") + @pytest.mark.filterwarnings("ignore::UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib version 3.10", + ) def test_boxplot_vertical_subplots(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) axes = _check_plot_works( - df.plot.box, - default_axes=True, - subplots=True, - vert=False, - logx=True, + df.plot.box, default_axes=True, subplots=True, logx=True, **kwargs ) _check_axes_shape(axes, axes_num=3, layout=(1, 3)) _check_ax_scales(axes, xaxis="log") @@ -1099,12 +1114,22 @@ def test_boxplot_vertical_subplots(self, hist_df): _check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == 7 + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + @pytest.mark.xfail( + Version(mpl.__version__) >= Version("3.10"), + reason="Fails starting with matplotlib 3.10", + ) def test_boxplot_vertical_positions(self, hist_df): df = hist_df numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] positions = np.array([3, 2, 8]) - ax = df.plot.box(positions=positions, vert=False) + kwargs = ( + {"vert": False} + if Version(mpl.__version__) < Version("3.10") + else {"orientation": "horizontal"} + ) + ax = df.plot.box(positions=positions, **kwargs) _check_text_labels(ax.get_yticklabels(), labels) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), positions) assert len(ax.lines) == 7 * len(numeric_cols) diff --git a/pandas/tests/plotting/test_boxplot_method.py 
b/pandas/tests/plotting/test_boxplot_method.py index 4916963ab7c87..2267b6197cd80 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -1,5 +1,7 @@ """Test cases for .boxplot method""" +from __future__ import annotations + import itertools import string @@ -22,6 +24,7 @@ _check_ticks_props, _check_visible, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -35,6 +38,17 @@ def _check_ax_limits(col, ax): assert y_max >= col.max() +if Version(mpl.__version__) < Version("3.10"): + verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}] +else: + verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}] + + +@pytest.fixture(params=verts) +def vert(request): + return request.param + + class TestDataFramePlots: def test_stacked_boxplot_set_axis(self): # GH2980 @@ -312,7 +326,7 @@ def test_specified_props_kwd(self, props, expected): assert result[expected][0].get_color() == "C1" - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -322,11 +336,11 @@ def test_plot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.plot(kind="box", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_plot_box(self, vert): # GH 54941 rng = np.random.default_rng(2) @@ -335,13 +349,13 @@ def test_plot_box(self, vert): xlabel, ylabel = "x", "y" _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True) - df1.plot.box(ax=axs[0], vert=vert, xlabel=xlabel, ylabel=ylabel) - df2.plot.box(ax=axs[1], vert=vert, xlabel=xlabel, ylabel=ylabel) + df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert) + df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert) for ax in axs: assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_xlabel_ylabel(self, vert): df = DataFrame( { @@ -351,11 +365,11 @@ def test_boxplot_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert) assert ax.get_xlabel() == xlabel assert ax.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") def test_boxplot_group_xlabel_ylabel(self, vert): df = DataFrame( { @@ -365,13 +379,19 @@ def test_boxplot_group_xlabel_ylabel(self, vert): } ) xlabel, ylabel = "x", "y" - ax = df.boxplot(by="group", vert=vert, xlabel=xlabel, ylabel=ylabel) + ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert) for subplot in ax: assert subplot.get_xlabel() == xlabel assert subplot.get_ylabel() == ylabel - @pytest.mark.parametrize("vert", [True, False]) - def test_boxplot_group_no_xlabel_ylabel(self, vert): + @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning") + def test_boxplot_group_no_xlabel_ylabel(self, vert, request): + if Version(mpl.__version__) >= Version("3.10") and vert == { + "orientation": "horizontal" + }: + request.applymarker( + pytest.mark.xfail(reason=f"{vert} fails starting with 
matplotlib 3.10") + ) df = DataFrame( { "a": np.random.default_rng(2).standard_normal(10), @@ -379,9 +399,13 @@ def test_boxplot_group_no_xlabel_ylabel(self, vert): "group": np.random.default_rng(2).choice(["group1", "group2"], 10), } ) - ax = df.boxplot(by="group", vert=vert) + ax = df.boxplot(by="group", **vert) for subplot in ax: - target_label = subplot.get_xlabel() if vert else subplot.get_ylabel() + target_label = ( + subplot.get_xlabel() + if vert == {"vert": True} or vert == {"orientation": "vertical"} + else subplot.get_ylabel() + ) assert target_label == pprint_thing(["group"]) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 179f2c0e6cfa9..3a7fd548ca961 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1022,12 +1022,8 @@ def test_resample_segfault(unit): all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") ).set_index("timestamp") df.index = df.index.as_unit(unit) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("ID").resample("5min").sum() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) + result = df.groupby("ID").resample("5min").sum() + expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) tm.assert_frame_equal(result, expected) @@ -1046,9 +1042,7 @@ def test_resample_dtype_preservation(unit): result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").resample("1D").ffill() + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 @@ -1821,12 +1815,8 @@ def f(data, add_arg): multiplier = 10 df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10)) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("A").resample("D").mean().multiply(multiplier) + result = df.groupby("A").resample("D").agg(f, multiplier).astype(float) + expected = df.groupby("A").resample("D").mean().multiply(multiplier) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index b7b80b5e427ff..da1774cf22587 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -76,9 +76,7 @@ def test_groupby_resample_api(): ) index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index ff1b82210e20d..e7850f96b3b0f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -71,12 +71,8 @@ def test_deferred_with_groupby(): def f_0(x): return x.set_index("date").resample("D").asfreq() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("id").apply(f_0) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.set_index("date").groupby("id").resample("D").asfreq() + expected = df.groupby("id").apply(f_0) + result = df.set_index("date").groupby("id").resample("D").asfreq() tm.assert_frame_equal(result, expected) df = DataFrame( @@ -90,12 +86,8 @@ def f_0(x): def f_1(x): return x.resample("1D").ffill() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = df.groupby("group").apply(f_1) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("group").resample("1D").ffill() + expected = df.groupby("group").apply(f_1) + result = df.groupby("group").resample("1D").ffill() tm.assert_frame_equal(result, expected) @@ -110,9 +102,7 @@ def test_getitem(test_frame): result = g.B.resample("2s").mean() tm.assert_series_equal(result, expected) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.resample("2s").mean().B + result = g.resample("2s").mean().B tm.assert_series_equal(result, expected) @@ -236,12 +226,8 @@ def test_methods(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) tm.assert_equal(result, expected) @@ -258,12 +244,8 @@ def test_methods_nunique(test_frame): def test_methods_std_var(f, test_frame): g = test_frame.groupby("A") r = g.resample("2s") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) + result = getattr(r, f)(ddof=1) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -272,24 +254,18 @@ def test_apply(test_frame): r = g.resample("2s") # reduction - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.resample("2s").sum() + expected = g.resample("2s").sum() def f_0(x): return x.resample("2s").sum() - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - result = r.apply(f_0) + result = r.apply(f_0) tm.assert_frame_equal(result, expected) def f_1(x): return x.resample("2s").apply(lambda y: y.sum()) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(f_1) + result = g.apply(f_1) # y.sum() results in int64 instead of int32 on 32-bit architectures expected = expected.astype("int64") tm.assert_frame_equal(result, expected) @@ -357,9 +333,7 @@ def test_resample_groupby_with_label(unit): # GH 13235 index = date_range("2000-01-01", freq="2D", periods=5, unit=unit) df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("col0").resample("1W", label="left").sum() + result = df.groupby("col0").resample("1W", label="left").sum() mi = [ np.array([0, 0, 1, 2], dtype=np.int64), @@ -369,9 +343,7 @@ def test_resample_groupby_with_label(unit): ), ] mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) - expected = DataFrame( - data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex - ) + expected = DataFrame(data={"col1": [1, 1, 2, 1]}, index=mindex) tm.assert_frame_equal(result, expected) @@ -380,9 +352,7 @@ def test_consistency_with_window(test_frame): # consistent return values with window df = test_frame expected = Index([1, 2, 3], name="A") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("2s").mean() + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -479,13 +449,12 @@ def test_resample_groupby_agg_listlike(): def test_empty(keys): # GH 26411 df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected_columns = ["b"] if keys == ["a"] else [] expected = ( DataFrame(columns=["a", "b"]) .set_index(keys, drop=False) - .set_index(TimedeltaIndex([]), append=True) + .set_index(TimedeltaIndex([]), append=True)[expected_columns] ) if len(keys) == 1: expected.index.name = keys[0] @@ -505,9 +474,7 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): if consolidate: df = df._consolidate() - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(["key"]).resample("W", on="date").min() + result = df.groupby(["key"]).resample("W", on="date").min() idx = pd.MultiIndex.from_arrays( [ ["A"] * 3 + ["B"] * 3, @@ -519,7 +486,6 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate): ) expected = DataFrame( { - "key": ["A"] * 3 + ["B"] * 3, "col1": [0, 5, 12] * 2, "col_object": ["val"] * 3 + [np.nan] * 3, }, @@ -557,12 +523,11 @@ def test_resample_no_index(keys): df = DataFrame([], columns=["a", "b", "date"]) df["date"] = pd.to_datetime(df["date"]) df = df.set_index("date") - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with 
tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected_columns = ["b"] if keys == ["a"] else [] expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False) expected["date"] = pd.to_datetime(expected["date"]) - expected = expected.set_index("date", append=True, drop=True) + expected = expected.set_index("date", append=True, drop=True)[expected_columns] if len(keys) == 1: expected.index.name = keys[0] @@ -606,9 +571,7 @@ def test_groupby_resample_size_all_index_same(): {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, index=date_range("31/12/2000 18:00", freq="h", periods=12), ) - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = df.groupby("A").resample("D").size() + result = df.groupby("A").resample("D").size() mi_exp = pd.MultiIndex.from_arrays( [ diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index f694b90a707c7..30e2c9dfe3d30 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -351,14 +351,11 @@ def test_groupby_resample_interpolate_raises(groupy_test_df): dfs = [groupy_test_df, groupy_test_df_without_index_name] for df in dfs: - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - with pytest.raises( - NotImplementedError, - match="Direct interpolation of MultiIndex data frames is " - "not supported", - ): - df.groupby("volume").resample("1D").interpolate(method="linear") + with pytest.raises( + NotImplementedError, + match="Direct interpolation of MultiIndex data frames is " "not supported", + ): + df.groupby("volume").resample("1D").interpolate(method="linear") def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): @@ -373,7 +370,6 @@ def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df): for df in dfs: result = df.groupby("volume").apply( lambda x: x.resample("1D").interpolate(method="linear"), - include_groups=False, ) volume = [50] * 15 + [60] @@ -417,7 +413,7 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) See GH#21351.""" # GH#21351 result = groupy_test_df.groupby("volume").apply( - lambda x: x.resample("265h").interpolate(method="linear"), include_groups=False + lambda x: x.resample("265h").interpolate(method="linear") ) volume = [50, 50, 60] diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index d8bb4fba1e1fe..63332fe4658e5 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -733,6 +733,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424 diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py index c153e800cb534..bec8ca13a2f5f 100644 --- a/pandas/tests/series/accessors/test_list_accessor.py +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -25,9 +25,10 @@ def test_list_getitem(list_dtype): ser = Series( [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(list_dtype), + name="a", ) actual = 
ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]") + expected = Series([2, None, None], dtype="int64[pyarrow]", name="a") tm.assert_series_equal(actual, expected) @@ -37,9 +38,15 @@ def test_list_getitem_index(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) actual = ser.list[1] - expected = Series([2, None, None], dtype="int64[pyarrow]", index=[1, 3, 7]) + expected = Series( + [2, None, None], + dtype="int64[pyarrow]", + index=[1, 3, 7], + name="a", + ) tm.assert_series_equal(actual, expected) @@ -48,6 +55,7 @@ def test_list_getitem_slice(): [[1, 2, 3], [4, None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) if pa_version_under11p0: with pytest.raises( @@ -60,6 +68,7 @@ def test_list_getitem_slice(): [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())), index=[1, 3, 7], + name="a", ) tm.assert_series_equal(actual, expected) @@ -68,9 +77,10 @@ def test_list_len(): ser = Series( [[1, 2, 3], [4, None], None], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.len() - expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32())) + expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32()), name="a") tm.assert_series_equal(actual, expected) @@ -78,12 +88,14 @@ def test_list_flatten(): ser = Series( [[1, 2, 3], None, [4, None], [], [7, 8]], dtype=ArrowDtype(pa.list_(pa.int64())), + name="a", ) actual = ser.list.flatten() expected = Series( [1, 2, 3, 4, None, 7, 8], dtype=ArrowDtype(pa.int64()), index=[0, 0, 0, 2, 2, 4, 4], + name="a", ) tm.assert_series_equal(actual, expected) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b73839f406a29..74b051aec71a4 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2084,6 +2084,18 @@ def test_dataframe_str_dtype(self, df, cache): ) tm.assert_series_equal(result, expected) + def test_dataframe_float32_dtype(self, df, cache): + # GH#60506 + # coerce to float64 + result = to_datetime(df.astype(np.float32), cache=cache) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) + tm.assert_series_equal(result, expected) + def test_dataframe_coerce(self, cache): # passing coerce df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 4d37c6d57f788..f53250378e33c 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -6,6 +6,7 @@ DatetimeIndex, Index, MultiIndex, + NamedAgg, Series, Timestamp, date_range, @@ -100,11 +101,7 @@ def test_rolling(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -116,11 +113,7 @@ def test_rolling_ddof(self, f, roll_frame): r = g.rolling(window=4) result = getattr(r, f)(ddof=1) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - 
expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -134,13 +127,9 @@ def test_rolling_quantile(self, interpolation, roll_frame): r = g.rolling(window=4) result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -181,9 +170,7 @@ def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame): def func(x): return getattr(x.rolling(4), f)(roll_frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) # GH 39591: The grouped column should be all np.nan # (groupby.apply inserts 0s for cov) expected["A"] = np.nan @@ -199,9 +186,7 @@ def test_rolling_corr_cov_pairwise(self, f, roll_frame): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func) + expected = g.apply(func) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -246,11 +231,7 @@ def test_rolling_apply(self, raw, roll_frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)]) expected.index = expected_index @@ -489,6 +470,36 @@ def test_groupby_rolling_subset_with_closed(self): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_agg_namedagg(self): + # GH#28333 + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0], + "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0], + } + ) + result = ( + df.groupby("kind") + .rolling(2) + .agg( + total_weight=NamedAgg(column="weight", aggfunc=sum), + min_height=NamedAgg(column="height", aggfunc=min), + ) + ) + expected = DataFrame( + { + "total_weight": [np.nan, 17.8, 19.9, np.nan, 205.5, 240.0], + "min_height": [np.nan, 9.1, 9.5, np.nan, 6.0, 8.0], + }, + index=MultiIndex( + [["cat", "dog"], [0, 1, 2, 3, 4, 5]], + [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]], + names=["kind", None], + ), + ) + tm.assert_frame_equal(result, expected) + def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( @@ -795,13 +806,9 @@ def test_groupby_rolling_resulting_multiindex3(self): def 
test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame): # GH 39732 g = roll_frame.groupby("A", group_keys=False) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: x.rolling(4).sum()).index + expected = g.apply(lambda x: x.rolling(4).sum()).index _ = g.rolling(window=4) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = g.apply(lambda x: x.rolling(4).sum()).index + result = g.apply(lambda x: x.rolling(4).sum()).index tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -977,13 +984,11 @@ def test_groupby_monotonic(self): df["date"] = to_datetime(df["date"]) df = df.sort_values("date") - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) @@ -1002,13 +1007,9 @@ def test_datelike_on_monotonic_within_each_group(self): } ) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = ( - df.set_index("B") - .groupby("A") - .apply(lambda x: x.rolling("4s")["C"].mean()) - ) + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) result = df.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) @@ -1038,11 +1039,7 @@ def test_expanding(self, f, frame): r = g.expanding() result = getattr(r, f)() - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)()) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)()) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1054,11 +1051,7 @@ def test_expanding_ddof(self, f, frame): r = g.expanding() result = getattr(r, f)(ddof=0) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1072,13 +1065,9 @@ def test_expanding_quantile(self, interpolation, frame): r = g.expanding() result = r.quantile(0.4, interpolation=interpolation) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().quantile(0.4, interpolation=interpolation) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply( + lambda x: x.expanding().quantile(0.4, interpolation=interpolation) + ) # GH 39732 expected_index = 
MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index @@ -1094,9 +1083,7 @@ def test_expanding_corr_cov(self, f, frame): def func_0(x): return getattr(x.expanding(), f)(frame) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func_0) + expected = g.apply(func_0) # GH 39591: groupby.apply returns 1 instead of nan for windows # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) @@ -1111,9 +1098,7 @@ def func_0(x): def func_1(x): return getattr(x.B.expanding(), f)(pairwise=True) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply(func_1) + expected = g.apply(func_1) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw, frame): @@ -1122,18 +1107,42 @@ def test_expanding_apply(self, raw, frame): # reduction result = r.apply(lambda x: x.sum(), raw=raw) - msg = "DataFrameGroupBy.apply operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - expected = g.apply( - lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw) - ) - # groupby.apply doesn't drop the grouped-by column - expected = expected.drop("A", axis=1) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) # GH 39732 expected_index = MultiIndex.from_arrays([frame["A"], range(40)]) expected.index = expected_index tm.assert_frame_equal(result, expected) + def test_groupby_expanding_agg_namedagg(self): + # GH#28333 + df = DataFrame( + { + "kind": ["cat", "dog", "cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0], + "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0], + } + ) + result = ( + df.groupby("kind") + .expanding(1) + .agg( + total_weight=NamedAgg(column="weight", aggfunc=sum), + min_height=NamedAgg(column="height", aggfunc=min), + ) + ) + expected = DataFrame( + { + "total_weight": [7.9, 17.8, 27.8, 7.5, 205.5, 247.5], + "min_height": [9.1, 9.1, 9.1, 6.0, 6.0, 6.0], + }, + index=MultiIndex( + [["cat", "dog"], [0, 1, 2, 3, 4, 5]], + [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]], + names=["kind", None], + ), + ) + tm.assert_frame_equal(result, expected) + class TestEWM: @pytest.mark.parametrize( @@ -1162,6 +1171,41 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) + def test_groupby_ewm_agg_namedagg(self): + # GH#28333 + df = DataFrame({"A": ["a"] * 4, "B": range(4)}) + result = ( + df.groupby("A") + .ewm(com=1.0) + .agg( + B_mean=NamedAgg(column="B", aggfunc="mean"), + B_std=NamedAgg(column="B", aggfunc="std"), + B_var=NamedAgg(column="B", aggfunc="var"), + ) + ) + expected = DataFrame( + { + "B_mean": [ + 0.0, + 0.6666666666666666, + 1.4285714285714286, + 2.2666666666666666, + ], + "B_std": [np.nan, 0.707107, 0.963624, 1.177164], + "B_var": [np.nan, 0.5, 0.9285714285714286, 1.3857142857142857], + }, + index=MultiIndex.from_tuples( + [ + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ], + names=["A", None], + ), + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "method, expected_data", [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]], diff --git a/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md b/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md new file mode 100644 index 0000000000000..b8eba90f399c9 --- /dev/null +++ 
b/web/pandas/pdeps/0017-backwards-compatibility-and-deprecation-policy.md @@ -0,0 +1,74 @@ +# PDEP-17: Backwards compatibility and deprecation policy + +- Created: 27 June 2024 +- Status: Accepted +- Discussion: [#59125](https://github.com/pandas-dev/pandas/issues/59125) +- Author: [Abdulaziz Aloqeely](https://github.com/Aloqeely) +- Revision: 1 + +## Abstract + +This PDEP defines pandas' backwards compatibility and deprecation policy. + +The main additions to [pandas' current version policy](https://pandas.pydata.org/pandas-docs/version/2.2/development/policies.html) are: +- Deprecated functionality should remain unchanged in at least 2 minor releases before being changed or removed. +- Deprecations should initially use DeprecationWarning, and then be switched to FutureWarning in the last minor release before the major release they are planned to be removed in. + +## Motivation + +A clear backwards compatibility and deprecation policy is crucial to a healthy ecosystem. We want to ensure users can rely on pandas being stable while still allowing the library to evolve. + +This policy will ensure that users have enough time to deal with deprecations while also minimizing disruption to users of downstream packages. + +## Scope + +This PDEP covers pandas' approach to backwards compatibility and the deprecation and removal process. + +## Background + +pandas uses a loose variant of semantic versioning. +A pandas release number is written in the format of ``MAJOR.MINOR.PATCH``. + +## General policy + +This policy applies to the [public API][1]. Anything that is not part of the [public API][1] or that is marked as "Experimental" may be changed or removed at any time. + +- Breaking backwards compatibility should benefit more than it harms users. +- Breaking changes should go through a deprecation cycle before being implemented if possible. +- Breaking changes should only occur in major releases. +- No deprecations should be introduced in patch releases. +- Deprecated functionality should remain unchanged in at least 2 minor releases before being changed or removed. + +Some bug fixes may require breaking backwards compatibility. In these cases, a deprecation cycle is not necessary. However, bug fixes that have a large impact on users might be treated as breaking changes. Whether a change is a bug fix or an API-breaking change is a judgement call. + +## Deprecation process + +Deprecation provides a way to warn developers and give them time to adapt their code to the new functionality before the old behavior is eventually removed. + +A deprecation's warning message should: +- Provide information on what is changing. +- Mention how to achieve similar behavior if an alternative is available. +- For large-scale deprecations, it is recommended to include a reason for the deprecation, alongside a discussion link to get user feedback. + +Additionally, when one introduces a deprecation, they should: +- Use the appropriate warning class. More info on this can be found below. +- Add the GitHub issue/PR number as a comment above the warning line. +- Add an entry in the release notes. +- Mention that the functionality is deprecated in the documentation using the ``.. deprecated::`` directive. + +### Which warning class to use + +Deprecations should initially use ``DeprecationWarning``, and then be switched to ``FutureWarning`` for broader visibility in the last minor release before the major release they are planned to be removed in.
+This implementation detail can be ignored by using the appropriate ``PandasDeprecationWarning`` variable, which will be aliased to the proper warning class based on the pandas version. + +### Enforcement of deprecations + +When one enforces a deprecation, they should: +- Add an entry in the release notes. +- For API changes, replace the ``.. deprecated::`` directive in the documentation with a ``.. versionchanged::`` directive. + +### PDEP-17 History + +- 27 June 2024: Initial version. + +[1]: https://pandas.pydata.org/docs/reference/index.html
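As a concrete illustration of the process described in this PDEP, the sketch below shows one way a deprecation following the policy could be written. It is only a hedged example, not code from this PR: the class, method names, version number, and GH issue number are hypothetical placeholders, and the hard-coded ``DeprecationWarning`` stands in for whatever aliased warning variable the policy ultimately provides.

```python
import warnings


class Example:
    """Hypothetical object used only to illustrate the deprecation policy."""

    def new_method(self):
        return 42

    def old_method(self):
        """
        Return the answer (hypothetical method being deprecated).

        .. deprecated:: 3.0
            Use :meth:`new_method` instead.
        """
        # GH#00000  <- GitHub issue/PR number as a comment above the warning line
        warnings.warn(
            "Example.old_method is deprecated and will be removed in a future "
            "major release. Use Example.new_method instead.",
            DeprecationWarning,  # switched to FutureWarning in the last minor
            # release before removal, per this policy
            stacklevel=2,
        )
        return self.new_method()
```

When the deprecation is later enforced, the ``.. deprecated::`` directive above would be replaced with a ``.. versionchanged::`` directive and a release-note entry added, as described under "Enforcement of deprecations".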