Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Series combine #821

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 14 additions & 22 deletions sdc/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4913,7 +4913,7 @@ def sdc_pandas_series_skew_impl(self, axis=None, skipna=None, level=None, numeri
return sdc_pandas_series_skew_impl


@sdc_overload_method(SeriesType, 'combine')
@sdc_overload_method(SeriesType, 'combine', jit_options={'error_model': 'numpy'})
def sdc_pandas_series_combine(self, other, func, fill_value=None):
"""
Intel Scalable Dataframe Compiler User Guide
Expand All @@ -4923,8 +4923,12 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None):

Limitations
-----------
- Only supports the case when data in series of the same type.
- With the default fill_value parameter value, the type of the resulting series will be float.
- The resulting series dtype may be wider than in pandas due to
type-stability requirements; it depends on the fill_value dtype
and on the result of aligning the series indexes.
- Indexes should be strictly ascending: inside the function
they are sorted in ascending order, so for unsorted indexes the
result differs from the result of pandas.
Comment on lines +4929 to +4931
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Rubtsowa What? This sounds like a bug...

Copy link
Contributor Author

@Rubtsowa Rubtsowa Jun 16, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kozlov-alexey This 'bug' is in the function sdc_join_series_indexes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Rubtsowa Then it should be fixed (please create a JIRA case with a reproducer) before this is merged. Adapting to a bug is not an option. @AlexanderKalistratov, correct?


Examples
--------
Expand Down Expand Up @@ -4964,33 +4968,21 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None):

def sdc_pandas_series_combine_impl(self, other, func, fill_value=None):

if fill_value is not None:
_fill_value = fill_value
else:
_fill_value = numpy.nan
_fill_value = numpy.nan if fill_value is None else fill_value

indexes, self_indexes, other_indexes = sdc_join_series_indexes(self.index, other.index)
len_val = len(indexes)

result = numpy.empty(len_val, res_dtype)

chunks = parallel_chunks(len_val)
for i in prange(len(chunks)):
chunk = chunks[i]
for j in range(chunk.start, chunk.stop):
self_idx = self_indexes[j]
if self_idx == -1:
val_self = _fill_value
else:
val_self = self[self_idx]._data[0]
for i in prange(len_val):
self_idx, other_idx = self_indexes[i], other_indexes[i]
val_self = _fill_value if self_idx == -1 else self._data[self_idx]

other_idx = other_indexes[j]
if other_idx == -1:
val_other = _fill_value
else:
val_other = other[other_idx]._data[0]
val_other = _fill_value if other_idx == -1 else other._data[other_idx]

result[i] = func(val_self, val_other)

result[j] = func(val_self, val_other)
return pandas.Series(result, index=indexes)

return sdc_pandas_series_combine_impl
170 changes: 105 additions & 65 deletions sdc/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2770,91 +2770,131 @@ def test_impl(S1, S2):
S2 = pd.Series([6.0, 21., 3.6, 5.])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))

def test_series_combine_float3264(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([np.float64(1), np.float64(2),
np.float64(3), np.float64(4), np.float64(5)])
S2 = pd.Series([np.float32(1), np.float32(2),
np.float32(3), np.float32(4), np.float32(5)])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))

def test_series_combine_assert1(self):
@unittest.expectedFailure
# https://github.com/numba/numba/issues/5792
def test_series_combine_div(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)
return S1.combine(S2, lambda a, b: a/b, 0)

S1 = pd.Series([1, 2, 3])
S2 = pd.Series([6., 21., 3., 5.])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))

def test_series_combine_assert2(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([6., 21., 3., 5.])
S2 = pd.Series([1, 2, 3])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))

def test_series_combine_integer(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b, 16)
hpat_func = self.jit(test_impl)
sizes1 = [2, 4, 5, 6, 8]
sizes2 = [1, 3, 5, 7, 9]
series_dtypes = [None, np.int64, np.float64]

S1 = pd.Series([1, 2, 3, 4, 5])
S2 = pd.Series([6, 21, 3, 5])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
for n in sizes1:
for k in sizes2:
for dtype1, dtype2 in product(series_dtypes, series_dtypes):
A = np.random.randint(-100, 100, n)
B = np.arange(k) * 2 + 1
S1 = pd.Series(A, dtype=dtype1)
S2 = pd.Series(B, dtype=dtype2)
with self.subTest(S1=S1, S2=S2):
result = hpat_func(S1, S2)
result_ref = test_impl(S1, S2)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

def test_series_combine_different_types(self):
def test_series_combine_value(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([6.1, 21.2, 3.3, 5.4, 6.7])
S2 = pd.Series([1, 2, 3, 4, 5])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
series_indexes = [[1, 2, 3, 4, 5],
[4, 5, 7, 8, 9],
[0, 1, 7, 13, 25]]
# Only ascending indexes are tested, because results differ from pandas in some cases otherwise

@unittest.expectedFailure
def test_series_combine_integer_samelen(self):
"""Result series type `int` is expected,
`float` is returned since this is the default fill_value type"""
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([1, 2, 3, 4, 5])
S2 = pd.Series([6, 21, 17, -5, 4])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
n = 5
np.random.seed(0)
A = np.random.randint(-100, 100, n)
B = np.arange(n) * 2 + 1

series_dtypes = [None, np.int64, np.float64]
fill_values = [None, np.nan, 4, 4.2]
for dtype1, dtype2 in product(series_dtypes, series_dtypes):
for series_index1 in series_indexes:
for series_index2 in series_indexes:
S1 = pd.Series(A, index=series_index1, dtype=dtype1)
S2 = pd.Series(B, index=series_index2, dtype=dtype2)
with self.subTest(S1=S1, S2=S2):
result = hpat_func(S1, S2)
result_ref = test_impl(S1, S2)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

def test_series_combine_value_with_fill_value(self):
def test_impl(S1, S2, fill_value):
return S1.combine(S2, lambda a, b: 2 * a + b, fill_value)
hpat_func = self.jit(test_impl)

series_indexes = [[1, 2, 3, 4, 5],
[4, 5, 7, 8, 9],
[0, 1, 7, 13, 25]]
# Only ascending indexes are tested, because results differ from pandas in some cases otherwise

n = 5
np.random.seed(0)
A = np.random.randint(-100, 100, n)
B = np.arange(n) * 2 + 1

series_dtypes = [None, np.int64, np.float64]
fill_values = [None, np.nan, 4, 4.2]
for dtype1, dtype2, fill_value in product(series_dtypes, series_dtypes, fill_values):
for series_index1 in series_indexes:
for series_index2 in series_indexes:
S1 = pd.Series(A, index=series_index1, dtype=dtype1)
S2 = pd.Series(B, index=series_index2, dtype=dtype2)
with self.subTest(S1=S1, S2=S2, fill_value=fill_value):
result = hpat_func(S1, S2, fill_value)
result_ref = test_impl(S1, S2, fill_value)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

def test_series_combine_samelen(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
def test_series_combine_value_samelen(self):
def test_impl(S1, S2, fill_value):
return S1.combine(S2, lambda a, b: 2 * a + b, fill_value=fill_value)
hpat_func = self.jit(test_impl)

S1 = pd.Series([1.0, 2., 3., 4., 5.])
S2 = pd.Series([6.0, 21., 3.6, 5., 0.0])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
n = 11
np.random.seed(0)
A = np.random.randint(-100, 100, n)
B = np.arange(n) * 2 + 1
series_index = 1 + np.arange(n)

series_dtypes = [None, np.int64, np.float64]
fill_values = [None, np.nan, 4, 4.2]
for dtype1, dtype2, fill_value in product(series_dtypes, series_dtypes, fill_values):
S1 = pd.Series(A, index=series_index, dtype=dtype1)
S2 = pd.Series(B, index=series_index, dtype=dtype2)
with self.subTest(S1=S1, S2=S2, fill_value=fill_value):
result = hpat_func(S1, S2, fill_value)
result_ref = test_impl(S1, S2, fill_value)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

def test_series_combine_value(self):
def test_series_combine_different_types(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56)
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([1.0, 2., 3., 4., 5.])
S2 = pd.Series([6.0, 21., 3.6, 5.])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
sizes1 = [2, 4, 5, 6, 8]
sizes2 = [1, 3, 5, 7, 9]
series_dtypes = [None, np.int64, np.float64]

def test_series_combine_value_samelen(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56)
hpat_func = self.jit(test_impl)
for n in sizes1:
for k in sizes2:
for dtype1, dtype2 in product(series_dtypes, series_dtypes):
A = np.random.randint(-100, 100, n)
B = np.arange(k) * 2 + 1
S1 = pd.Series(A, dtype=dtype1)
S2 = pd.Series(B, dtype=dtype2)
with self.subTest(S1=S1, S2=S2):
result = hpat_func(S1, S2)
result_ref = test_impl(S1, S2)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

S1 = pd.Series([1.0, 2., 3., 4., 5.])
S2 = pd.Series([6.0, 21., 3.6, 5., 0.0])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))

def test_series_abs1(self):
def test_impl(S):
Expand Down