Skip to content

Commit

Permalink
Delay setting MultiIndex.levels/codes until needed (#17728)
Browse files Browse the repository at this point in the history
Follow up to #17644

This PR changes `MultiIndex` to delay computing `self._levels` and `self._codes` (via factorization of `self._columns`) until certain methods actually need them. Previously, those attributes were always computed eagerly to keep the object's state consistent. As discussed offline, the performance benefit of not computing them eagerly is more desirable than that consistency guarantee.

```python
import cudf
df_train = cudf.datasets.randomdata(nrows=50_000_000, dtypes={"label": int, "weekday": int, "cat_2": int, "brand": int})
target = "label"
col = ['weekday', 'cat_2', 'brand']
df_gb= df_train[col + [target]].groupby(col)
%%time
df_gb.agg(['mean', 'count'])


# PR
CPU times: user 144 ms, sys: 23.9 ms, total: 168 ms
Wall time: 166 ms

  _     ._   __/__   _ _  _  _ _/_   Recorded: 12:12:46  Samples:  4
 /_//_/// /_\ / //_// / //_'/ //     Duration: 0.159     CPU time: 0.160
/   _/                      v5.0.0

Cell [3]

0.158 <module>  <ipython-input-3-ee51998a643a>:1
`- 0.158 wrapper  cudf/utils/performance_tracking.py:30
   `- 0.158 DataFrameGroupBy.agg  cudf/core/groupby/groupby.py:879
      |- 0.129 DataFrameGroupBy._aggregate  cudf/core/groupby/groupby.py:789
      `- 0.029 NumericalColumn.astype  cudf/core/column/column.py:1126
         `- 0.029 NumericalColumn.as_numerical_column  cudf/core/column/numerical.py:428
            `- 0.029 inner  contextlib.py:78
               `- 0.029 cast  cudf/core/_internals/unary.py:40


# branch 25.02 head
CPU times: user 369 ms, sys: 105 ms, total: 474 ms
Wall time: 478 ms


  _     ._   __/__   _ _  _  _ _/_   Recorded: 12:11:20  Samples:  79
 /_//_/// /_\ / //_// / //_'/ //     Duration: 0.480     CPU time: 0.478
/   _/                      v5.0.0

Cell [3]

0.479 <module>  <ipython-input-3-ee51998a643a>:1
`- 0.478 wrapper  cudf/utils/performance_tracking.py:30
   `- 0.478 DataFrameGroupBy.agg  cudf/core/groupby/groupby.py:879
      |- 0.267 cached_property.__get__  functools.py:979
      |  `- 0.267 _Grouping.keys  cudf/core/groupby/groupby.py:3534
      |     `- 0.267 wrapper  cudf/utils/performance_tracking.py:30
      |        `- 0.267 MultiIndex._from_data  cudf/core/multiindex.py:344
      |           `- 0.265 _compute_levels_and_codes  cudf/core/multiindex.py:67
      |              |- 0.230 factorize  cudf/core/algorithms.py:22
      |              |  |- 0.168 NumericalColumn._label_encoding  cudf/core/column/column.py:1516
      |              |  |  |- 0.097 [self]  cudf/core/column/column.py
      |              |  |  |- 0.031 as_column  cudf/core/column/column.py:1948
      |              |  |  |  `- 0.028 NumericalColumn.astype  cudf/core/column/column.py:1126
      |              |  |  |     `- 0.028 NumericalColumn.as_numerical_column  cudf/core/column/numerical.py:428
      |              |  |  |        `- 0.028 inner  contextlib.py:78
      |              |  |  |           `- 0.028 cast  cudf/core/_internals/unary.py:40
      |              |  |  |- 0.030 inner  contextlib.py:78
      |              |  |  |  `- 0.030 sort_by_key  cudf/core/_internals/sorting.py:160
      |              |  |  `- 0.007 NumericalColumn.take  cudf/core/column/column.py:943
      |              |  |     `- 0.007 inner  contextlib.py:78
      |              |  |        `- 0.007 gather  cudf/core/_internals/copying.py:18
      |              |  |- 0.051 NumericalColumn.unique  cudf/core/column/column.py:1342
      |              |  |  |- 0.036 inner  contextlib.py:78
      |              |  |  |  `- 0.036 drop_duplicates  cudf/core/_internals/stream_compaction.py:82
      |              |  |  `- 0.015 NumericalColumn.is_unique  cudf/core/column/column.py:1056
      |              |  |     `- 0.015 NumericalColumn.distinct_count  cudf/core/column/column.py:1108
      |              |  `- 0.006 NumericalColumn.dropna  cudf/core/column/column.py:294
      |              |     `- 0.006 NumericalColumn.copy  cudf/core/column/column.py:481
      |              `- 0.028 _compile_module_with_cache  cupy/cuda/compiler.py:473
      |                    [4 frames hidden]  cupy, <built-in>
      |- 0.133 DataFrameGroupBy._aggregate  cudf/core/groupby/groupby.py:789
      `- 0.079 wrapper  cudf/utils/performance_tracking.py:30
         `- 0.079 MultiIndex._from_columns_like_self  cudf/core/frame.py:194
            `- 0.079 wrapper  cudf/utils/performance_tracking.py:30
               |- 0.040 MultiIndex._copy_type_metadata  cudf/core/multiindex.py:2101
               |  `- 0.038 _compute_levels_and_codes  cudf/core/multiindex.py:67
               |     `- 0.038 factorize  cudf/core/algorithms.py:22
               |        |- 0.021 NumericalColumn._label_encoding  cudf/core/column/column.py:1516
               |        |  |- 0.010 [self]  cudf/core/column/column.py
               |        |  `- 0.007 inner  contextlib.py:78
               |        |     `- 0.007 sort_by_key  cudf/core/_internals/sorting.py:160
               |        `- 0.011 NumericalColumn.unique  cudf/core/column/column.py:1342
               |           `- 0.010 inner  contextlib.py:78
               |              `- 0.010 drop_duplicates  cudf/core/_internals/stream_compaction.py:82
               `- 0.039 MultiIndex._from_data  cudf/core/multiindex.py:344
                  `- 0.038 _compute_levels_and_codes  cudf/core/multiindex.py:67
                     `- 0.038 factorize  cudf/core/algorithms.py:22
                        |- 0.021 NumericalColumn._label_encoding  cudf/core/column/column.py:1516
                        |  |- 0.010 [self]  cudf/core/column/column.py
                        |  `- 0.007 inner  contextlib.py:78
                        |     `- 0.007 sort_by_key  cudf/core/_internals/sorting.py:160
                        `- 0.010 NumericalColumn.unique  cudf/core/column/column.py:1342
                           `- 0.010 inner  contextlib.py:78
                              `- 0.010 drop_duplicates  cudf/core/_internals/stream_compaction.py:82
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17728
  • Loading branch information
mroeschke authored Jan 15, 2025
1 parent 8538ec8 commit 960c723
Showing 1 changed file with 74 additions and 39 deletions.
113 changes: 74 additions & 39 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,6 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray:
return indices


def _compute_levels_and_codes(
    data: MutableMapping,
) -> tuple[list[cudf.Index], list[column.ColumnBase]]:
    """Factorize every column of a ColumnAccessor-like mapping.

    Returns a ``(levels, codes)`` pair of lists with one entry per column of
    ``data``, in the mapping's iteration order. Each codes entry is the
    factorization codes cast to int64 and wrapped with ``column.as_column``;
    each levels entry is the matching categories returned by ``factorize``.
    """
    # Factorize each column exactly once, then split the pairs apart.
    factorized = [factorize(values) for values in data.values()]
    levels = [uniques for _, uniques in factorized]
    codes = [
        column.as_column(labels.astype(np.int64)) for labels, _ in factorized
    ]
    return levels, codes


class MultiIndex(Frame, BaseIndex, NotIterable):
"""A multi-level or hierarchical index.
Expand Down Expand Up @@ -203,8 +189,8 @@ def __init__(
source_data[i] = result_col._with_type_metadata(level.dtype)

super().__init__(ColumnAccessor(source_data))
self._levels = new_levels
self._codes = new_codes
self._levels: None | list[cudf.Index] = new_levels
self._codes: None | list[column.ColumnBase] = new_codes
self._name = None
self.names = names

Expand Down Expand Up @@ -341,6 +327,26 @@ def set_names(

return self._set_names(names=names, inplace=inplace)

def _maybe_materialize_codes_and_levels(self: Self) -> Self:
    """
    Populate self._levels and self._codes on first use and return self.

    Factorizing the columns in self._data is expensive and many operations
    never need the result, so it is deferred until a MultiIndex method
    explicitly calls this helper. Nothing is recomputed unless both
    self._levels and self._codes are still None.
    """
    # Guard clause: anything already set is left untouched.
    if self._levels is not None or self._codes is not None:
        return self

    # Factorize each column exactly once, then split the pairs apart.
    factorized = [factorize(values) for values in self._data.values()]
    self._levels = [uniques for _, uniques in factorized]
    self._codes = [
        column.as_column(labels.astype(np.int64)) for labels, _ in factorized
    ]
    return self

@classmethod
@_performance_tracking
def _from_data(
Expand All @@ -350,12 +356,13 @@ def _from_data(
) -> Self:
"""
Use when you have a ColumnAccessor-like mapping but no codes and levels.
Preferable to use _simple_new if you have codes and levels.
"""
levels, codes = _compute_levels_and_codes(data)
return cls._simple_new(
data=ColumnAccessor(data),
levels=levels,
codes=codes,
levels=None,
codes=None,
names=pd.core.indexes.frozen.FrozenList(data.keys()),
name=name,
)
Expand All @@ -371,8 +378,8 @@ def _from_data_like_self(self, data: MutableMapping) -> Self:
def _simple_new(
cls,
data: ColumnAccessor,
levels: list[cudf.Index],
codes: list[column.ColumnBase],
levels: None | list[cudf.Index],
codes: None | list[column.ColumnBase],
names: pd.core.indexes.frozen.FrozenList,
name: Any = None,
) -> Self:
Expand Down Expand Up @@ -457,10 +464,22 @@ def copy(
names = pd.core.indexes.frozen.FrozenList(names)
else:
names = self.names
if self._levels is not None:
levels: None | list[cudf.Index] = [
idx.copy(deep=deep) for idx in self._levels
]
else:
levels = self._levels
if self._codes is not None:
codes: None | list[column.ColumnBase] = [
code.copy(deep=deep) for code in self._codes
]
else:
codes = self._codes
return type(self)._simple_new(
data=self._data.copy(deep=deep),
levels=[idx.copy(deep=deep) for idx in self._levels],
codes=[code.copy(deep=deep) for code in self._codes],
levels=levels,
codes=codes,
names=names,
name=name,
)
Expand Down Expand Up @@ -529,8 +548,10 @@ def codes(self) -> pd.core.indexes.frozen.FrozenList:
>>> midx.codes
FrozenList([[0, 1, 2], [0, 1, 2]])
"""
self._maybe_materialize_codes_and_levels()
return pd.core.indexes.frozen.FrozenList(
col.values for col in self._codes
col.values
for col in self._codes # type: ignore[union-attr]
)

def get_slice_bound(self, label, side):
Expand Down Expand Up @@ -572,8 +593,10 @@ def levels(self) -> list[cudf.Index]:
>>> midx.levels
[Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')]
"""
self._maybe_materialize_codes_and_levels()
return [
idx.rename(name) for idx, name in zip(self._levels, self.names)
idx.rename(name)
for idx, name in zip(self._levels, self.names) # type: ignore[arg-type]
]

@property # type: ignore
Expand Down Expand Up @@ -953,7 +976,10 @@ def __getitem__(self, index):
ca = self._data._from_columns_like_self(
(col.take(indexer) for col in self._columns), verify=False
)
codes = [code.take(indexer) for code in self._codes]
if self._codes is not None:
codes = [code.take(indexer) for code in self._codes]
else:
codes = self._codes
result = type(self)._simple_new(
data=ca, codes=codes, levels=self._levels, names=self.names
)
Expand Down Expand Up @@ -1124,6 +1150,8 @@ def _is_interval(self) -> bool:
@classmethod
@_performance_tracking
def _concat(cls, objs) -> Self:
# TODO: This will discard previously computed self._codes and self._levels.
# Try preserving them if defined.
source_data = [o.to_frame(index=False) for o in objs]

# TODO: Verify if this is really necessary or if we can rely on
Expand Down Expand Up @@ -1484,15 +1512,17 @@ def swaplevel(self, i=-2, j=-1) -> Self:
"""
name_i = self._column_names[i] if isinstance(i, int) else i
name_j = self._column_names[j] if isinstance(j, int) else j
to_swap = {name_i, name_j}
new_data = {}
# TODO: Preserve self._codes and self._levels if set
for k, v in self._column_labels_and_values:
if k not in (name_i, name_j):
if k not in to_swap:
new_data[k] = v
elif k == name_i:
new_data[name_j] = self._data[name_j]
elif k == name_j:
new_data[name_i] = self._data[name_i]
midx = MultiIndex._from_data(new_data)
midx = type(self)._from_data(new_data)
if all(n is None for n in self.names):
midx = midx.set_names(self.names)
return midx
Expand Down Expand Up @@ -1569,7 +1599,7 @@ def droplevel(self, level=-1) -> Self | cudf.Index:
if len(new_data) == 1:
return cudf.core.index._index_from_data(new_data)
else:
mi = MultiIndex._from_data(new_data)
mi = type(self)._from_data(new_data)
mi.names = new_names
return mi

Expand All @@ -1579,12 +1609,13 @@ def to_pandas(
) -> pd.MultiIndex:
# cudf uses np.iinfo(SIZE_TYPE_DTYPE).min as missing code
# pandas uses -1 as missing code
self._maybe_materialize_codes_and_levels()
pd_codes = (
code.find_and_replace(
column.as_column(np.iinfo(SIZE_TYPE_DTYPE).min, length=1),
column.as_column(-1, length=1),
)
for code in self._codes
for code in self._codes # type: ignore[union-attr]
)
return pd.MultiIndex(
levels=[
Expand Down Expand Up @@ -1683,7 +1714,9 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool:
f"{type(null_position)}"
)
return sorting.is_sorted(
[*self._columns], ascending=ascending, null_position=null_position
self._columns, # type: ignore[arg-type]
ascending=ascending,
null_position=null_position,
)

@cached_property # type: ignore
Expand Down Expand Up @@ -1762,8 +1795,12 @@ def nunique(self, dropna: bool = True) -> int:
@_performance_tracking
def memory_usage(self, deep: bool = False) -> int:
usage = sum(col.memory_usage for col in self._columns)
usage += sum(level.memory_usage(deep=deep) for level in self._levels)
usage += sum(code.memory_usage for code in self._codes)
if self._levels is not None:
usage += sum(
level.memory_usage(deep=deep) for level in self._levels
)
if self._codes is not None:
usage += sum(code.memory_usage for code in self._codes)
return usage

@_performance_tracking
Expand Down Expand Up @@ -1820,8 +1857,7 @@ def append(self, other) -> Self:
)
"""
if isinstance(other, (list, tuple)):
to_concat = [self]
to_concat.extend(other)
to_concat = [self, *other]
else:
to_concat = [self, other]

Expand All @@ -1833,7 +1869,7 @@ def append(self, other) -> Self:
f"found object of type: {type(obj)}"
)

return MultiIndex._concat(to_concat)
return type(self)._concat(to_concat)

@_performance_tracking
def __array_function__(self, func, types, args, kwargs):
Expand Down Expand Up @@ -2010,7 +2046,7 @@ def _get_reconciled_name_object(self, other) -> Self:
return self.rename(names)
return self

def _maybe_match_names(self, other):
def _maybe_match_names(self, other) -> list[Hashable]:
"""
Try to find common names to attach to the result of an operation
between a and b. Return a consensus list of names if they match
Expand Down Expand Up @@ -2103,7 +2139,6 @@ def _copy_type_metadata(self: Self, other: Self) -> Self:
res = super()._copy_type_metadata(other)
if isinstance(other, MultiIndex):
res._names = other._names
self._levels, self._codes = _compute_levels_and_codes(res._data)
return res

@_performance_tracking
Expand Down Expand Up @@ -2167,6 +2202,6 @@ def _columns_for_reset_index(
def repeat(self, repeats, axis=None) -> Self:
return self._from_data(
self._data._from_columns_like_self(
super()._repeat([*self._columns], repeats, axis)
super()._repeat(self._columns, repeats, axis)
)
)

0 comments on commit 960c723

Please sign in to comment.