Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Series combine #821

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions examples/series/series_combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# *****************************************************************************
# Copyright (c) 2020, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

import pandas as pd
from numba import njit


@njit
def series_combine():
    """Combine two Series of different lengths element-wise using ``max``.

    Positions that exist in only one of the Series are filled with 0
    (``fill_value``) before ``max`` is applied.
    """
    shorter = pd.Series([1, 5, 2])
    longer = pd.Series([0, 3, 7, 8, 0])
    combined = shorter.combine(longer, max, fill_value=0)

    return combined  # Expect series of 1, 5, 7, 8, 0


# Run the jitted example and print the combined Series.
print(series_combine())
76 changes: 76 additions & 0 deletions sdc/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
from numba.typed import List, Dict
from numba import prange
from numba.np.arraymath import get_isnan
from numba.core.registry import cpu_target
from pandas.core.indexing import IndexingError

import sdc
Expand Down Expand Up @@ -4910,3 +4911,78 @@ def sdc_pandas_series_skew_impl(self, axis=None, skipna=None, level=None, numeri
return numpy_like.skew(self._data)

return sdc_pandas_series_skew_impl


@sdc_overload_method(SeriesType, 'combine', jit_options={'error_model': 'numpy'})
def sdc_pandas_series_combine(self, other, func, fill_value=None):
    """
    Intel Scalable Dataframe Compiler User Guide
    ********************************************

    Pandas API: pandas.Series.combine

    Limitations
    -----------
    - Resulting series dtype may be wider than in pandas due to
      type-stability requirements and depends on fill_value dtype
      and result of series indexes alignment.
    - Indexes should be strictly ascending: inside the function
      they are sorted in ascending order, so for other inputs the
      answer differs from the result of pandas.

    Examples
    --------
    .. literalinclude:: ../../../examples/series/series_combine.py
       :language: python
       :lines: 27-
       :caption: Combine the Series with another Series according to func.
       :name: ex_series_combine

    .. command-output:: python ./series/series_combine.py
       :cwd: ../../../examples

    Intel Scalable Dataframe Compiler Developer Guide
    *************************************************
    Pandas Series method :meth:`pandas.Series.combine` implementation.

    Typing-time arguments: ``self`` and ``other`` must be ``SeriesType``;
    ``func`` must expose ``get_call_type`` so its return type can be resolved
    for the element dtypes of both series; ``fill_value`` must be omitted,
    None or a number type, otherwise it is rejected via
    ``ty_checker.raise_exc``. Returns the implementation function that is
    compiled for the matched call.

    .. only:: developer
        Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_combine*
    """
    _func_name = 'Method Series.combine()'

    # Validate argument types at typing time; _func_name labels the checker.
    ty_checker = TypeChecker(_func_name)
    ty_checker.check(self, SeriesType)

    ty_checker.check(other, SeriesType)

    # fill_value may be omitted, None (as a type or as a Python value) or a number type.
    if not isinstance(fill_value, (types.Omitted, types.NoneType, types.Number)) and fill_value is not None:
        ty_checker.raise_exc(fill_value, 'number', 'fill_value')

    fill_is_default = isinstance(fill_value, (types.Omitted, types.NoneType)) or fill_value is None

    # Resolve the return type of func when called with the element dtypes of both series.
    sig = func.get_call_type(cpu_target.typing_context, [self.dtype, other.dtype], {})
    ret_type = sig.return_type

    # The default fill is NaN in the implementation below, hence float64 here;
    # otherwise fill_value is already the numba type of the supplied number.
    fill_dtype = types.float64 if fill_is_default else fill_value
    # Result dtype is derived from both func's return type and the fill dtype
    # (presumably their common dtype - helper is defined elsewhere, confirm).
    res_dtype = find_common_dtype_from_numpy_dtypes([], [ret_type, fill_dtype])

    def sdc_pandas_series_combine_impl(self, other, func, fill_value=None):

        _fill_value = numpy.nan if fill_value is None else fill_value

        # NOTE(review): the "strictly ascending indexes" limitation originates in
        # sdc_join_series_indexes (reported as a bug in the PR review thread);
        # reviewers asked for it to be fixed there rather than adapted to here.
        indexes, self_indexes, other_indexes = sdc_join_series_indexes(self.index, other.index)
        len_val = len(indexes)

        result = numpy.empty(len_val, res_dtype)

        for i in prange(len_val):
            self_idx, other_idx = self_indexes[i], other_indexes[i]
            # A position of -1 is treated as "label missing in that series":
            # the fill value is passed to func instead of a data element.
            val_self = _fill_value if self_idx == -1 else self._data[self_idx]

            val_other = _fill_value if other_idx == -1 else other._data[other_idx]

            result[i] = func(val_self, val_other)

        return pandas.Series(result, index=indexes)

    return sdc_pandas_series_combine_impl
179 changes: 105 additions & 74 deletions sdc/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2761,7 +2761,6 @@ def test_impl(S1, S2):
S2 = pd.Series([6., 7.])
np.testing.assert_array_equal(hpat_func(S1, S2), test_impl(S1, S2))

@skip_numba_jit
def test_series_combine(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
Expand All @@ -2771,99 +2770,131 @@ def test_impl(S1, S2):
S2 = pd.Series([6.0, 21., 3.6, 5.])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))

@skip_numba_jit
def test_series_combine_float3264(self):
    # Combining a float64 Series with a float32 Series must match pandas.
    def test_impl(S1, S2):
        return S1.combine(S2, lambda a, b: 2 * a + b)

    hpat_func = self.jit(test_impl)

    values = range(1, 6)
    S1 = pd.Series([np.float64(v) for v in values])
    S2 = pd.Series([np.float32(v) for v in values])

    result = hpat_func(S1, S2)
    result_ref = test_impl(S1, S2)
    pd.testing.assert_series_equal(result, result_ref)

@skip_numba_jit
def test_series_combine_assert1(self):
@unittest.expectedFailure
# https://github.com/numba/numba/issues/5792
def test_series_combine_div(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([1, 2, 3])
S2 = pd.Series([6., 21., 3., 5.])
with self.assertRaises(AssertionError):
hpat_func(S1, S2)
return S1.combine(S2, lambda a, b: a/b, 0)

@skip_numba_jit
def test_series_combine_assert2(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([6., 21., 3., 5.])
S2 = pd.Series([1, 2, 3])
with self.assertRaises(AssertionError):
hpat_func(S1, S2)

@skip_numba_jit
def test_series_combine_integer(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b, 16)
hpat_func = self.jit(test_impl)
sizes1 = [2, 4, 5, 6, 8]
sizes2 = [1, 3, 5, 7, 9]
series_dtypes = [None, np.int64, np.float64]

S1 = pd.Series([1, 2, 3, 4, 5])
S2 = pd.Series([6, 21, 3, 5])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
for n in sizes1:
for k in sizes2:
for dtype1, dtype2 in product(series_dtypes, series_dtypes):
A = np.random.randint(-100, 100, n)
B = np.arange(k) * 2 + 1
S1 = pd.Series(A, dtype=dtype1)
S2 = pd.Series(B, dtype=dtype2)
with self.subTest(S1=S1, S2=S2):
result = hpat_func(S1, S2)
result_ref = test_impl(S1, S2)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

@skip_numba_jit
def test_series_combine_different_types(self):
def test_series_combine_value(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([6.1, 21.2, 3.3, 5.4, 6.7])
S2 = pd.Series([1, 2, 3, 4, 5])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))

@skip_numba_jit
def test_series_combine_integer_samelen(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)
series_indexes = [[1, 2, 3, 4, 5],
[4, 5, 7, 8, 9],
[0, 1, 7, 13, 25]]
# Only ascending indexes are used, since results differ from pandas in some other cases

S1 = pd.Series([1, 2, 3, 4, 5])
S2 = pd.Series([6, 21, 17, -5, 4])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
n = 5
np.random.seed(0)
A = np.random.randint(-100, 100, n)
B = np.arange(n) * 2 + 1

series_dtypes = [None, np.int64, np.float64]
fill_values = [None, np.nan, 4, 4.2]
for dtype1, dtype2 in product(series_dtypes, series_dtypes):
for series_index1 in series_indexes:
for series_index2 in series_indexes:
S1 = pd.Series(A, index=series_index1, dtype=dtype1)
S2 = pd.Series(B, index=series_index2, dtype=dtype2)
with self.subTest(S1=S1, S2=S2):
result = hpat_func(S1, S2)
result_ref = test_impl(S1, S2)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

def test_series_combine_value_with_fill_value(self):
    # Compare Series.combine with an explicit fill_value against pandas over
    # every combination of series dtypes, fill values and index pairs.
    def test_impl(S1, S2, fill_value):
        return S1.combine(S2, lambda a, b: 2 * a + b, fill_value)

    hpat_func = self.jit(test_impl)

    # Only ascending indexes are used, since results differ from pandas in some other cases
    ascending_indexes = [[1, 2, 3, 4, 5],
                         [4, 5, 7, 8, 9],
                         [0, 1, 7, 13, 25]]
    dtypes = [None, np.int64, np.float64]
    fills = [None, np.nan, 4, 4.2]

    size = 5
    np.random.seed(0)
    A = np.random.randint(-100, 100, size)
    B = np.arange(size) * 2 + 1

    # Same iteration order as nesting the two index loops inside the dtype/fill loop.
    cases = product(dtypes, dtypes, fills, ascending_indexes, ascending_indexes)
    for dtype1, dtype2, fill_value, index1, index2 in cases:
        S1 = pd.Series(A, index=index1, dtype=dtype1)
        S2 = pd.Series(B, index=index2, dtype=dtype2)
        with self.subTest(S1=S1, S2=S2, fill_value=fill_value):
            result = hpat_func(S1, S2, fill_value)
            result_ref = test_impl(S1, S2, fill_value)
            # check_dtype=False due to difference to pandas in some cases
            pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

@skip_numba_jit
def test_series_combine_samelen(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b)
def test_series_combine_value_samelen(self):
def test_impl(S1, S2, fill_value):
return S1.combine(S2, lambda a, b: 2 * a + b, fill_value=fill_value)
hpat_func = self.jit(test_impl)

S1 = pd.Series([1.0, 2., 3., 4., 5.])
S2 = pd.Series([6.0, 21., 3.6, 5., 0.0])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
n = 11
np.random.seed(0)
A = np.random.randint(-100, 100, n)
B = np.arange(n) * 2 + 1
series_index = 1 + np.arange(n)

series_dtypes = [None, np.int64, np.float64]
fill_values = [None, np.nan, 4, 4.2]
for dtype1, dtype2, fill_value in product(series_dtypes, series_dtypes, fill_values):
S1 = pd.Series(A, index=series_index, dtype=dtype1)
S2 = pd.Series(B, index=series_index, dtype=dtype2)
with self.subTest(S1=S1, S2=S2, fill_value=fill_value):
result = hpat_func(S1, S2, fill_value)
result_ref = test_impl(S1, S2, fill_value)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

@skip_numba_jit
def test_series_combine_value(self):
def test_series_combine_different_types(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56)
return S1.combine(S2, lambda a, b: 2 * a + b)
hpat_func = self.jit(test_impl)

S1 = pd.Series([1.0, 2., 3., 4., 5.])
S2 = pd.Series([6.0, 21., 3.6, 5.])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))
sizes1 = [2, 4, 5, 6, 8]
sizes2 = [1, 3, 5, 7, 9]
series_dtypes = [None, np.int64, np.float64]

@skip_numba_jit
def test_series_combine_value_samelen(self):
def test_impl(S1, S2):
return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56)
hpat_func = self.jit(test_impl)
for n in sizes1:
for k in sizes2:
for dtype1, dtype2 in product(series_dtypes, series_dtypes):
A = np.random.randint(-100, 100, n)
B = np.arange(k) * 2 + 1
S1 = pd.Series(A, dtype=dtype1)
S2 = pd.Series(B, dtype=dtype2)
with self.subTest(S1=S1, S2=S2):
result = hpat_func(S1, S2)
result_ref = test_impl(S1, S2)
# check_dtype=False due to difference to pandas in some cases
pd.testing.assert_series_equal(result, result_ref, check_dtype=False)

S1 = pd.Series([1.0, 2., 3., 4., 5.])
S2 = pd.Series([6.0, 21., 3.6, 5., 0.0])
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2))

def test_series_abs1(self):
def test_impl(S):
Expand Down