Merge branch 'main' into adjust-parquet-xfails
phofl authored Jan 29, 2024
2 parents f7d5e2f + 3aa1f4e commit da04e43
Showing 37 changed files with 1,119 additions and 892 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-report.yaml
@@ -42,7 +42,7 @@ jobs:
       - name: mamba list
         run: mamba list
 
-      - uses: actions/cache@v3
+      - uses: actions/cache@v4
         id: cache
         with:
           # Suffix is depending on the backend / OS. Let's be agnostic here
4 changes: 2 additions & 2 deletions continuous_integration/environment-3.10.yaml
@@ -21,9 +21,9 @@ dependencies:
   - pytest-rerunfailures
   - pytest-timeout
   - pytest-xdist
-  - moto
+  - moto<5
   # Optional dependencies
-  - mimesis
+  - mimesis<13.1.0 # https://github.com/dask/dask/issues/10858
   - numpy=1.23
   - pandas=1.5
   - flask
4 changes: 2 additions & 2 deletions continuous_integration/environment-3.11.yaml
@@ -21,9 +21,9 @@ dependencies:
   - pytest-rerunfailures
   - pytest-timeout
   - pytest-xdist
-  - moto
+  - moto<5
   # Optional dependencies
-  - mimesis
+  - mimesis<13.1.0 # https://github.com/dask/dask/issues/10858
   - numpy
   - pandas
   - flask
4 changes: 2 additions & 2 deletions continuous_integration/environment-3.12.yaml
@@ -21,9 +21,9 @@ dependencies:
   - pytest-rerunfailures
   - pytest-timeout
   - pytest-xdist
-  - moto
+  - moto<5
   # Optional dependencies
-  - mimesis
+  - mimesis<13.1.0 # https://github.com/dask/dask/issues/10858
   - numpy
   - pandas
   - flask
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.9.yaml
@@ -21,7 +21,7 @@ dependencies:
   - pytest-rerunfailures
   - pytest-timeout
   - pytest-xdist
-  - moto
+  - moto<5
   # Optional dependencies
   - mimesis<12
   - numpy=1.22
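Reviewer context for the moto<5 pin: moto 5 collapsed its per-service decorators (mock_s3, mock_sts, ...) into a single mock_aws entry point, which breaks suites written against the old API; presumably the cap stays until the tests migrate. A minimal sketch of the incompatibility, not code from this repository:

from moto import mock_s3  # exists on moto<5; moto>=5 only ships the unified mock_aws

@mock_s3
def test_bucket_roundtrip():
    ...  # S3 calls in here hit moto's in-memory mock

# moto>=5 equivalent:
# from moto import mock_aws
# @mock_aws
# def test_bucket_roundtrip(): ...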
4 changes: 2 additions & 2 deletions dask/array/fft.py
@@ -15,7 +15,7 @@
 from dask.array.core import asarray
 from dask.array.core import concatenate as _concatenate
 from dask.array.creation import arange as _arange
-from dask.array.numpy_compat import _numpy_200
+from dask.array.numpy_compat import NUMPY_GE_200
 from dask.utils import derived_from, skip_doctest
 
 chunk_error = (
@@ -167,7 +167,7 @@ def func(a, s=None, axes=None):
     if s is None:
         axes = tuple(range(a.ndim))
     else:
-        if _numpy_200:
+        if NUMPY_GE_200:
             # Match deprecation in numpy
             warnings.warn(
                 "DeprecationWarning: `axes` should not be `None` "
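Context for the NUMPY_GE_200 branch above: NumPy 2.0 deprecates passing s to the n-dimensional FFTs while leaving axes as None, and dask.array.fft raises the same warning. A quick illustration, assuming NumPy >= 2.0:

import numpy as np

a = np.ones((8, 8))
# DeprecationWarning on NumPy >= 2.0: `axes` should not be `None`
# if `s` is not `None` (the exact message matched by the tests below).
np.fft.fftn(a, s=(4, 4))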
20 changes: 9 additions & 11 deletions dask/array/numpy_compat.py
@@ -8,16 +8,18 @@
 from dask.utils import derived_from
 
 _np_version = parse_version(np.__version__)
-_numpy_122 = _np_version >= parse_version("1.22.0")
-_numpy_123 = _np_version >= parse_version("1.23.0")
-_numpy_124 = _np_version >= parse_version("1.24.0")
-_numpy_125 = _np_version.release >= (1, 25, 0)
-_numpy_200 = _np_version.release >= (2, 0, 0)
+NUMPY_GE_122 = _np_version.release >= (1, 22)
+NUMPY_GE_123 = _np_version.release >= (1, 23)
+NUMPY_GE_124 = _np_version.release >= (1, 24)
+NUMPY_GE_125 = _np_version.release >= (1, 25)
+NUMPY_GE_200 = _np_version.release >= (2, 0)
 
 
-if _numpy_200:
+if NUMPY_GE_200:
+    from numpy.exceptions import AxisError, ComplexWarning  # noqa: F401
     from numpy.lib.array_utils import normalize_axis_index, normalize_axis_tuple
 else:
+    from numpy import AxisError, ComplexWarning  # noqa: F401
     from numpy.core.numeric import normalize_axis_index  # type: ignore[attr-defined]
     from numpy.core.numeric import normalize_axis_tuple  # type: ignore[attr-defined]
@@ -183,11 +185,7 @@ def rollaxis(a, axis, start=0):
 
 # kwarg is renamed in numpy 1.22.0
 def percentile(a, q, method="linear"):
-    if _numpy_122:
+    if NUMPY_GE_122:
         return np.percentile(a, q, method=method)
     else:
         return np.percentile(a, q, interpolation=method)
-
-
-ComplexWarning = np.exceptions.ComplexWarning if _numpy_200 else np.ComplexWarning
-AxisError = np.exceptions.AxisError if _numpy_200 else np.AxisError
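Note on the switch from parse_version(...) comparisons to _np_version.release tuples: the release tuple drops pre-release tags, so a dev or rc build of a version is treated as already providing that version's API, which plain Version ordering gets wrong. A small illustration, assuming the packaging library:

from packaging.version import parse as parse_version

v = parse_version("2.0.0.dev123")
print(v >= parse_version("2.0.0"))  # False: dev builds sort before the release
print(v.release >= (2, 0))          # True: .release is just the tuple (2, 0, 0)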
4 changes: 2 additions & 2 deletions dask/array/percentile.py
@@ -9,7 +9,7 @@
 from tlz import merge
 
 from dask.array.core import Array
-from dask.array.numpy_compat import _numpy_122
+from dask.array.numpy_compat import NUMPY_GE_122
 from dask.array.numpy_compat import percentile as np_percentile
 from dask.base import tokenize
 from dask.highlevelgraph import HighLevelGraph
@@ -126,7 +126,7 @@ def percentile(a, q, method="linear", internal_method="default", **kwargs):
         internal_method = method
 
     if "interpolation" in kwargs:
-        if _numpy_122:
+        if NUMPY_GE_122:
             warnings.warn(
                 "In Dask 2022.1.0, the `interpolation=` argument to percentile was renamed to "
                 "`method= ` ",
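The rename the warning refers to is NumPy's own: since 1.22, np.percentile accepts method= and deprecates the old interpolation= keyword, which is why the compat shim in numpy_compat.py dispatches on NUMPY_GE_122. For reference:

import numpy as np

a = np.arange(10)
np.percentile(a, 50, method="linear")           # NumPy >= 1.22 spelling
# np.percentile(a, 50, interpolation="linear")  # pre-1.22 spelling, deprecated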
4 changes: 2 additions & 2 deletions dask/array/routines.py
@@ -29,7 +29,7 @@
 )
 from dask.array.creation import arange, diag, empty, indices, tri
 from dask.array.einsumfuncs import einsum  # noqa
-from dask.array.numpy_compat import _numpy_200
+from dask.array.numpy_compat import NUMPY_GE_200
 from dask.array.reductions import reduction
 from dask.array.ufunc import multiply, sqrt
 from dask.array.utils import (
@@ -1801,7 +1801,7 @@ def unique(ar, return_index=False, return_inverse=False, return_counts=False):
         # mapping of the original values.
         matches = (ar[:, None] == out["values"][None, :]).astype(np.intp)
         inverse = (matches * out["inverse"]).sum(axis=1)
-        if _numpy_200:
+        if NUMPY_GE_200:
             inverse = inverse.reshape(orig_shape)
         result.append(inverse)
     if return_counts:
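The reshape branch tracks a NumPy 2.0 behavior change: np.unique(..., return_inverse=True) began returning inverse indices shaped like the input instead of flattened, so dask reshapes only on NumPy >= 2.0 to match. A sketch of the difference, assuming a 2-d input:

import numpy as np

a = np.array([[1, 2], [2, 3]])
_, inverse = np.unique(a, return_inverse=True)
# NumPy >= 2.0: inverse.shape == (2, 2), following the input's shape
# NumPy <  2.0: inverse.shape == (4,), always flat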
11 changes: 7 additions & 4 deletions dask/array/slicing.py
@@ -559,9 +559,14 @@ def slicing_plan(chunks, index):
 
     if not is_arraylike(index):
         index = np.asanyarray(index)
-    cum_chunks = cached_cumsum(chunks)
 
-    cum_chunks = asarray_safe(cum_chunks, like=index)
+    cum_chunks_tup = cached_cumsum(chunks)
+    cum_chunks = asarray_safe(cum_chunks_tup, like=index)
+    if cum_chunks.dtype.kind != "f":  # Don't cast NaN chunks to int
+        # This is important when index.dtype=uint64 (or uint32 on 32-bit hosts) to
+        # prevent accidental automatic casting during `index - cum_chunks` below
+        cum_chunks = cum_chunks.astype(index.dtype)
 
+    # this dispactches to the array library
     chunk_locations = np.searchsorted(cum_chunks, index, side="right")
 
@@ -634,8 +639,6 @@ def take(outname, inname, chunks, index, itemsize, axis=0):
             PerformanceWarning,
             stacklevel=6,
         )
-    if not is_arraylike(index):
-        index = np.asarray(index)
 
     # Check for chunks from the plan that would violate the user's
     # configured chunk size.
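The new casting guard matters because NumPy defines no common integer type for uint64 and int64, so mixing them in arithmetic silently promotes to float64, which is lossy for indices above 2**53. A standalone illustration:

import numpy as np

index = np.array([7], dtype=np.uint64)
cum_chunks = np.array([5], dtype=np.int64)
print((index - cum_chunks).dtype)  # float64: no common integer type exists
# Casting cum_chunks to the index dtype first keeps the arithmetic integral:
print((index - cum_chunks.astype(index.dtype)).dtype)  # uint64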
22 changes: 8 additions & 14 deletions dask/array/tests/test_array_core.py
@@ -51,7 +51,7 @@
     stack,
     store,
 )
-from dask.array.numpy_compat import _numpy_200
+from dask.array.numpy_compat import NUMPY_GE_200
 from dask.array.reshape import _not_implemented_message
 from dask.array.tests.test_dispatch import EncapsulateNDArray
 from dask.array.utils import assert_eq, same_keys
@@ -888,7 +888,7 @@ def test_elemwise_on_scalars():
     dy = from_array(ny, chunks=(5,))
     dz = dx.sum() * dy
 
-    if _numpy_200:
+    if NUMPY_GE_200:
         assert_eq(dz, nz)
     else:
         # Dask 0-d arrays do not behave like numpy scalars for type promotion
@@ -3112,25 +3112,19 @@ def test_slice_with_floats():
         d[[1, 1.5]]
 
 
-def test_slice_with_integer_types():
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.uint32, np.uint64])
+def test_slice_with_integer_types(dtype):
     x = np.arange(10)
     dx = da.from_array(x, chunks=5)
-    inds = np.array([0, 3, 6], dtype="u8")
+    inds = np.array([0, 3, 6], dtype=dtype)
     assert_eq(dx[inds], x[inds])
-    assert_eq(dx[inds.astype("u4")], x[inds.astype("u4")])
-
-    inds = np.array([0, 3, 6], dtype=np.int64)
-    assert_eq(dx[inds], x[inds])
-    assert_eq(dx[inds.astype("u4")], x[inds.astype("u4")])
 
 
-def test_index_with_integer_types():
+@pytest.mark.parametrize("cls", [int, np.int32, np.int64, np.uint32, np.uint64])
+def test_index_with_integer_types(cls):
     x = np.arange(10)
     dx = da.from_array(x, chunks=5)
-    inds = int(3)
-    assert_eq(dx[inds], x[inds])
-
-    inds = np.int64(3)
+    inds = cls(3)
     assert_eq(dx[inds], x[inds])
4 changes: 2 additions & 2 deletions dask/array/tests/test_fft.py
@@ -10,7 +10,7 @@
 import dask.array.fft
 from dask.array.core import normalize_chunks
 from dask.array.fft import fft_wrap
-from dask.array.numpy_compat import _numpy_200
+from dask.array.numpy_compat import NUMPY_GE_200
 from dask.array.utils import assert_eq, same_keys
 
 all_1d_funcnames = ["fft", "ifft", "rfft", "irfft", "hfft", "ihfft"]
@@ -60,7 +60,7 @@ def test_fft2n_shapes(funcname):
         da_fft(darr3, (12, 11), axes=(1, 0)), np_fft(nparr, (12, 11), axes=(1, 0))
     )
 
-    if _numpy_200 and funcname.endswith("fftn"):
+    if NUMPY_GE_200 and funcname.endswith("fftn"):
         ctx = pytest.warns(
             DeprecationWarning,
             match="`axes` should not be `None` if `s` is not `None`",
4 changes: 2 additions & 2 deletions dask/array/tests/test_masked.py
@@ -9,7 +9,7 @@
 import pytest
 
 import dask.array as da
-from dask.array.numpy_compat import ComplexWarning, _numpy_123
+from dask.array.numpy_compat import NUMPY_GE_123, ComplexWarning
 from dask.array.utils import assert_eq
 from dask.base import tokenize
 from dask.utils import typename
@@ -387,7 +387,7 @@ def test_average_weights_with_masked_array(keepdims):
 
     da_avg = da.ma.average(d_a, weights=d_weights, axis=1, keepdims=keepdims)
 
-    if _numpy_123:
+    if NUMPY_GE_123:
         assert_eq(da_avg, np.ma.average(a, weights=weights, axis=1, keepdims=keepdims))
     elif not keepdims:
         assert_eq(da_avg, np.ma.average(a, weights=weights, axis=1))
4 changes: 2 additions & 2 deletions dask/array/tests/test_reductions.py
@@ -13,7 +13,7 @@
 
 import dask.array as da
 import dask.config as config
-from dask.array.numpy_compat import ComplexWarning, _numpy_122
+from dask.array.numpy_compat import NUMPY_GE_122, ComplexWarning
 from dask.array.utils import assert_eq, same_keys
 from dask.core import get_deps
 
@@ -261,7 +261,7 @@ def test_arg_reductions(dfunc, func):
     assert_eq(dfunc(a, 0), func(x, 0))
     assert_eq(dfunc(a, 1), func(x, 1))
     assert_eq(dfunc(a, 2), func(x, 2))
-    if _numpy_122:
+    if NUMPY_GE_122:
         assert_eq(dfunc(a, keepdims=True), func(x, keepdims=True))
 
     pytest.raises(ValueError, lambda: dfunc(a, 3))
8 changes: 4 additions & 4 deletions dask/array/tests/test_routines.py
@@ -14,7 +14,7 @@
 np = pytest.importorskip("numpy")
 
 import dask.array as da
-from dask.array.numpy_compat import AxisError, _numpy_123, _numpy_200
+from dask.array.numpy_compat import NUMPY_GE_123, NUMPY_GE_200, AxisError
 from dask.array.utils import assert_eq, same_keys
 
@@ -2351,7 +2351,7 @@ def test_result_type():
     # Effect of scalars depends on their value
     assert da.result_type(1, b) == np.int16
     assert da.result_type(1.0, a) == np.float32
-    if _numpy_200:
+    if NUMPY_GE_200:
         assert da.result_type(np.int64(1), b) == np.int64
         assert da.result_type(np.ones((), np.int64), b) == np.int64
     assert da.result_type(1e200, a) == np.float32
@@ -2594,7 +2594,7 @@ def test_average_keepdims(a):
 
     da_avg = da.average(d_a, keepdims=True)
 
-    if _numpy_123:
+    if NUMPY_GE_123:
         np_avg = np.average(a, keepdims=True)
         assert_eq(np_avg, da_avg)
 
@@ -2609,7 +2609,7 @@ def test_average_weights(keepdims):
 
     da_avg = da.average(d_a, weights=d_weights, axis=1, keepdims=keepdims)
 
-    if _numpy_123:
+    if NUMPY_GE_123:
         assert_eq(da_avg, np.average(a, weights=weights, axis=1, keepdims=keepdims))
     elif not keepdims:
         assert_eq(da_avg, np.average(a, weights=weights, axis=1))
1 change: 0 additions & 1 deletion dask/bytes/tests/test_s3.py
@@ -137,7 +137,6 @@ def s3_context(bucket=test_bucket_name, files=files):
 
 
 @pytest.fixture()
-@pytest.mark.slow
 def s3_with_yellow_tripdata(s3):
     """
     Fixture with sample yellowtrip CSVs loaded into S3.
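A plausible reading of this deletion (the commit does not say): pytest ignores marks applied to fixtures, and newer pytest versions warn about the pattern, so the mark was inert. A sketch of the anti-pattern being removed:

import pytest

@pytest.fixture()
# @pytest.mark.slow   # would have no effect here; marks on fixtures are ignored
def my_fixture():
    return 42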
2 changes: 1 addition & 1 deletion dask/dataframe/__init__.py
@@ -10,7 +10,7 @@ def _dask_expr_enabled() -> bool:
         import dask_expr  # noqa: F401
     except ImportError:
         raise ValueError("Must install dask-expr to activate query planning.")
-    return use_dask_expr
+    return use_dask_expr if use_dask_expr is not None else False
 
 
 if _dask_expr_enabled():
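The one-liner makes the configuration tri-state explicit: an unset flag arrives as None and now resolves to False, instead of None leaking out of a function annotated -> bool. A minimal standalone sketch of the pattern (names simplified):

def _enabled(setting):
    # None means "not configured": treat it as disabled rather than returning
    # None, which is falsy but violates the bool return annotation.
    return setting if setting is not None else False

assert _enabled(None) is False
assert _enabled(True) is True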
8 changes: 3 additions & 5 deletions dask/dataframe/_testing.py
@@ -6,15 +6,13 @@
 PKG = os.path.dirname(os.path.dirname(__file__))
 
 
-def test_dataframe(query_planning="True") -> None:
-    import os
-
+def test_dataframe(query_planning: bool = True) -> None:
     import pytest
 
     cmd = PKG + "/dataframe"
     print(f"running: pytest {cmd}")
-    os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = query_planning
-    sys.exit(pytest.main(["-n 4"] + [cmd]))
+    os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = str(query_planning)
+    sys.exit(pytest.main(["-n 4", cmd]))
 
 
 __all__ = ["test_dataframe"]
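Two fixes in one here: the parameter becomes a real bool (the old default was the string "True"), and str() converts it for the environment variable, since os.environ only accepts text. For instance:

import os

os.environ["DASK_DATAFRAME__QUERY_PLANNING"] = str(False)
print(os.environ["DASK_DATAFRAME__QUERY_PLANNING"])  # "False"
# os.environ[...] = False would raise TypeError: str expected, not bool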
8 changes: 4 additions & 4 deletions dask/dataframe/backends.py
@@ -11,7 +11,7 @@
 from dask.array.dispatch import percentile_lookup
 from dask.array.percentile import _percentile
 from dask.backends import CreationDispatch, DaskBackendEntrypoint
-from dask.dataframe._compat import is_any_real_numeric_dtype
+from dask.dataframe._compat import PANDAS_GE_220, is_any_real_numeric_dtype
 from dask.dataframe.core import DataFrame, Index, Scalar, Series, _Frame
 from dask.dataframe.dispatch import (
     categorical_dtype_dispatch,
@@ -623,11 +623,11 @@ def concat_pandas(
         if uniform
         else any(isinstance(df, pd.DataFrame) for df in dfs2)
     ):
-        if uniform:
+        if uniform or PANDAS_GE_220:
             dfs3 = dfs2
             cat_mask = dfs2[0].dtypes == "category"
         else:
-            # When concatenating mixed dataframes and series on axis 1, Pandas
+            # When concatenating mixed dataframes and series on axis 1, Pandas <2.2
             # converts series to dataframes with a single column named 0, then
             # concatenates.
             dfs3 = [
@@ -647,7 +647,7 @@
             **kwargs,
         ).any()
 
-    if cat_mask.any():
+    if isinstance(cat_mask, pd.Series) and cat_mask.any():
         not_cat = cat_mask[~cat_mask].index
         # this should be aligned, so no need to filter warning
         out = pd.concat(
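For readers new to the idiom guarded in the last hunk: cat_mask is normally a boolean Series (one entry per column) produced by comparing dtypes, and the added isinstance check defends against code paths where that comparison collapses to a scalar. A standalone sketch of the normal case:

import pandas as pd

df = pd.DataFrame({"x": pd.Categorical(["a", "b"]), "y": [1, 2]})
cat_mask = df.dtypes == "category"      # boolean Series indexed by column name
print(cat_mask.any())                   # True: at least one categorical column
print(list(cat_mask[~cat_mask].index))  # ['y'], the non-categorical columns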
11 changes: 9 additions & 2 deletions dask/dataframe/io/tests/test_csv.py
@@ -1219,8 +1219,15 @@ def test_parse_dates_multi_column():
     """
     )
 
-    warn = FutureWarning if PANDAS_GE_220 else None
-    with pytest.warns(warn, match="nested"):
+    if PANDAS_GE_220:
+        with pytest.warns(FutureWarning, match="nested"):
+            with filetext(pdmc_text) as fn:
+                ddf = dd.read_csv(fn, parse_dates=[["date", "time"]])
+                df = pd.read_csv(fn, parse_dates=[["date", "time"]])
+
+                assert (df.columns == ddf.columns).all()
+                assert len(df) == len(ddf)
+    else:
         with filetext(pdmc_text) as fn:
             ddf = dd.read_csv(fn, parse_dates=[["date", "time"]])
             df = pd.read_csv(fn, parse_dates=[["date", "time"]])
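The unrolled if/else also retires an implicit pytest.warns(None): the old code passed warn=None on pandas < 2.2, a form pytest 7 deprecated and later versions removed, so "maybe warns" checks are now spelled as explicit branches. A compact sketch of the replacement pattern:

import warnings
import pytest

expect_warning = True
if expect_warning:
    with pytest.warns(FutureWarning, match="nested"):
        warnings.warn("nested columns", FutureWarning)
else:
    warnings.warn("nested columns", FutureWarning)  # no assertion on warnings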