-
Notifications
You must be signed in to change notification settings - Fork 61
Series combine #821
base: master
Are you sure you want to change the base?
Series combine #821
Changes from 10 commits
7ec05f3
b576e03
95d233e
2b95e9d
2104aae
aeef026
f09b792
fafcaed
b1ed1b3
4fdb369
6d930d8
c85feff
18771f2
8628e31
6f37f53
f66ace3
9fc55a7
e7dc1f5
6c78b8e
d623b89
ecd3dba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -45,6 +45,7 @@ | |||||
from numba.typed import List, Dict | ||||||
from numba import prange | ||||||
from numba.np.arraymath import get_isnan | ||||||
from numba.core.registry import cpu_target | ||||||
from pandas.core.indexing import IndexingError | ||||||
|
||||||
import sdc | ||||||
|
@@ -69,6 +70,7 @@ | |||||
from sdc.functions import numpy_like | ||||||
from sdc.hiframes.api import isna | ||||||
from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby | ||||||
from sdc.utilities.prange_utils import parallel_chunks | ||||||
|
||||||
from .pandas_series_functions import apply | ||||||
from .pandas_series_functions import map as _map | ||||||
|
@@ -4899,7 +4901,8 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None): | |||||
|
||||||
Limitations | ||||||
----------- | ||||||
- Only supports the case when data in series of the same type | ||||||
- Only supports the case when data in series of the same type. | ||||||
- With the default fill_value parameter value, the type of the resulting series will be float. | ||||||
|
||||||
Examples | ||||||
-------- | ||||||
|
@@ -4917,10 +4920,9 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None): | |||||
Pandas Series method :meth:`pandas.Series.combine` implementation. | ||||||
|
||||||
.. only:: developer | ||||||
|
||||||
Tests: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_combine* | ||||||
Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_combine* | ||||||
""" | ||||||
_func_name = 'Method Series.combine().' | ||||||
_func_name = 'Method Series.combine()' | ||||||
|
||||||
ty_checker = TypeChecker(_func_name) | ||||||
ty_checker.check(self, SeriesType) | ||||||
|
@@ -4930,22 +4932,43 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None): | |||||
if not isinstance(fill_value, (types.Omitted, types.NoneType, types.Number)) and fill_value is not None: | ||||||
ty_checker.raise_exc(fill_value, 'number', 'fill_value') | ||||||
|
||||||
fill_is_default = isinstance(fill_value, (types.Omitted, types.NoneType)) or fill_value is None | ||||||
|
||||||
sig = func.get_call_type(cpu_target.typing_context, [self.dtype, other.dtype], {}) | ||||||
ret_type = sig.return_type | ||||||
|
||||||
fill_dtype = types.float64 if fill_is_default else fill_value | ||||||
res_dtype = find_common_dtype_from_numpy_dtypes([], [ret_type, fill_dtype]) | ||||||
|
||||||
def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): | ||||||
|
||||||
if fill_value is None: | ||||||
fill_value = numpy.nan | ||||||
if fill_value is not None: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
_fill_value = fill_value | ||||||
else: | ||||||
_fill_value = numpy.nan | ||||||
|
||||||
len_val = max(len(self), len(other)) | ||||||
result = numpy.empty(len_val, self._data.dtype) | ||||||
for ind in range(len_val): | ||||||
val_self = self._data[ind] | ||||||
val_other = other._data[ind] | ||||||
if len(self) < ind + 1: | ||||||
val_self = fill_value | ||||||
if len(other) < ind + 1: | ||||||
val_other = fill_value | ||||||
result[ind] = func(val_self, val_other) | ||||||
indexes, self_indexes, other_indexes = sdc_join_series_indexes(self.index, other.index) | ||||||
len_val = len(indexes) | ||||||
|
||||||
return pandas.Series(result) | ||||||
result = numpy.empty(len_val, res_dtype) | ||||||
|
||||||
chunks = parallel_chunks(len_val) | ||||||
for i in prange(len(chunks)): | ||||||
chunk = chunks[i] | ||||||
for j in range(chunk.start, chunk.stop): | ||||||
self_idx = self_indexes[j] | ||||||
if self_idx == -1: | ||||||
val_self = _fill_value | ||||||
else: | ||||||
val_self = self[self_idx]._data[0] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. self_idx (and other_idx) is position in the Series, not the index, so instead of using getitem on a Series, that performs index lookup and returns a Series, so that you have to take _data[0] from it, you can just write:
Suggested change
|
||||||
|
||||||
other_idx = other_indexes[j] | ||||||
if other_idx == -1: | ||||||
val_other = _fill_value | ||||||
else: | ||||||
val_other = other[other_idx]._data[0] | ||||||
|
||||||
result[j] = func(val_self, val_other) | ||||||
return pandas.Series(result, index=indexes) | ||||||
|
||||||
return sdc_pandas_series_combine_impl |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2781,27 +2781,23 @@ def test_impl(S1, S2): | |
np.float32(3), np.float32(4), np.float32(5)]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine_assert1(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
hpat_func = self.jit(test_impl) | ||
|
||
S1 = pd.Series([1, 2, 3]) | ||
S2 = pd.Series([6., 21., 3., 5.]) | ||
with self.assertRaises(AssertionError): | ||
hpat_func(S1, S2) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. General comment for tests: not all combinations of input series dtypes and fill_value are tested e.g. the one I mentioned before - where float fill_value is assigned to otherwise int series. There are no tests with series with non-default indexes (we refer to samelen, but it's not fully correct - series may have same len, but not same indexes), and no tests for checking func impact on result dtype, so it's hard to see from such tests what's really tested and what is not. So the suggestion is to organize tests in a different manner:
New test: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For example, test 1 can look like this (it can also be split into two: one when we use check_dtype=False and one when we don't):
|
||
|
||
@skip_numba_jit | ||
def test_series_combine_assert2(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
hpat_func = self.jit(test_impl) | ||
|
||
S1 = pd.Series([6., 21., 3., 5.]) | ||
S2 = pd.Series([1, 2, 3]) | ||
with self.assertRaises(AssertionError): | ||
hpat_func(S1, S2) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
def test_series_combine_integer(self): | ||
def test_impl(S1, S2): | ||
|
@@ -2821,7 +2817,10 @@ def test_impl(S1, S2): | |
S2 = pd.Series([1, 2, 3, 4, 5]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@unittest.expectedFailure | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add comment why the test is skipped. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Rubtsowa No need to skip the test if that's how impl is intended to work. Use check_dtype=False in assert_series_equal and add a comment just before this check to refer to SDC Limitation. |
||
def test_series_combine_integer_samelen(self): | ||
"""Result series type `int` is expected, | ||
`float` is returned since this is the default fill_value type""" | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
hpat_func = self.jit(test_impl) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This line is not correct - impl handles all cases. For the next line we need exact definition of difference to pandas, e.g: