Skip to content

Commit

Permalink
Merge pull request #467 from TyberiusPrime/pandas_polars
Browse files Browse the repository at this point in the history
feature: optional pandas and polars support
  • Loading branch information
seperman authored Jul 1, 2024
2 parents 1846b7b + dae46b7 commit ee36c1d
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 2 deletions.
27 changes: 25 additions & 2 deletions deepdiff/deephash.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@
number_to_string, datetime_normalize, KEY_TO_VAL_STR, short_repr,
get_truncate_datetime, dict_, add_root_to_paths)
from deepdiff.base import Base

try:
import pandas
except ImportError:
pandas = False

try:
import polars
except ImportError:
polars = False

logger = logging.getLogger(__name__)

UNPROCESSED_KEY = object()
Expand Down Expand Up @@ -448,7 +459,6 @@ def _prep_path(self, obj):
type_ = obj.__class__.__name__
return KEY_TO_VAL_STR.format(type_, obj)


def _prep_number(self, obj):
type_ = "number" if self.ignore_numeric_type_changes else obj.__class__.__name__
if self.significant_digits is not None:
Expand Down Expand Up @@ -479,7 +489,7 @@ def _prep_tuple(self, obj, parent, parents_ids):
return result, counts

def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
"""The main diff method"""
"""The main hash method"""
counts = 1

if isinstance(obj, bool):
Expand Down Expand Up @@ -529,6 +539,19 @@ def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
elif isinstance(obj, tuple):
result, counts = self._prep_tuple(obj=obj, parent=parent, parents_ids=parents_ids)

elif (pandas and isinstance(obj, pandas.DataFrame)):
def gen():
yield ('dtype', obj.dtypes)
yield ('index', obj.index)
yield from obj.items() # which contains (column name, series tuples)
result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)
elif (polars and isinstance(obj, polars.DataFrame)):
def gen():
yield from obj.columns
yield from list(obj.schema.items())
yield from obj.rows()
result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)

elif isinstance(obj, Iterable):
result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)

Expand Down
2 changes: 2 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ tomli==2.0.1
tomli-w==1.0.0
pydantic==2.7.4
pytest-benchmark==4.0.0
pandas>=1.6
polars>=0.19.11
90 changes: 90 additions & 0 deletions tests/test_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,96 @@ def test_hash_numpy_array2_multi_dimensional_can_not_retrieve_individual_array_i
except Exception as e:
assert str(e).strip("'") == HASH_LOOKUP_ERR_MSG.format(t1[0])

def test_pandas(self):
    """Check that pandas DataFrames hash by column names, values, dtypes and index.

    Equal frames must produce equal prepared hashes; frames that differ in
    row count, column name, cell value, column dtype or index labels must
    produce different ones.
    """
    # pandas is an optional dependency: skip (rather than error out) when
    # it is not installed.
    pd = pytest.importorskip("pandas")

    def prep(frame):
        # Prepared (un-hashed) representation DeepHashPrep stores for `frame`.
        return DeepHashPrep(frame)[frame]

    # Same content -> same hash; different rows or column names -> different.
    df = pd.DataFrame({"a": [1]})
    equal_df = pd.DataFrame({"a": [1]})
    df_same_column_names = pd.DataFrame({"a": [1, 2]})
    other_df = pd.DataFrame({"b": [1]})
    df_hash = prep(df)
    assert df_hash == prep(equal_df)
    assert df_hash != prep(df_same_column_names)
    assert df_hash != prep(other_df)

    # Mixed column types, including a tuple-valued column.
    df_mixed = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    df_mixed_2 = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    df_mixed_3 = pd.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
    df_mixed_4 = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
    df_mixed_hash = prep(df_mixed)
    assert df_mixed_hash == prep(df_mixed_2)
    assert df_mixed_hash != prep(df_mixed_3)
    assert df_mixed_hash != prep(df_mixed_4)

    # Same numeric value stored under different dtypes must not collide.
    df_u8 = pd.DataFrame({'a': np.array([1], dtype=np.uint8)})
    df_u16 = pd.DataFrame({'a': np.array([1], dtype=np.uint16)})
    df_float = pd.DataFrame({'a': np.array([1], dtype=np.float32)})
    df_u8_hash = prep(df_u8)
    assert df_u8_hash != prep(df_float)
    assert df_u8_hash != prep(df_u16)

    # Identical data with different index labels must hash differently.
    df_index = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
    df_index_diff = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 4])
    assert prep(df_index) != prep(df_index_diff)

def test_polars(self):
    """Check that polars DataFrames hash by column names, values and dtypes.

    Equal frames must produce equal prepared hashes; frames that differ in
    row count, column name, cell value or column dtype must produce
    different ones. A LazyFrame is expected to raise TypeError until it
    has been collected into a DataFrame.
    """
    # polars is an optional dependency: skip (rather than error out) when
    # it is not installed.
    pl = pytest.importorskip("polars")

    def prep(frame):
        # Prepared (un-hashed) representation DeepHashPrep stores for `frame`.
        return DeepHashPrep(frame)[frame]

    # Same content -> same hash; different rows or column names -> different.
    df = pl.DataFrame({"a": [1]})
    equal_df = pl.DataFrame({"a": [1]})
    df_same_column_names = pl.DataFrame({"a": [1, 2]})
    other_df = pl.DataFrame({"b": [1]})
    df_hash = prep(df)
    assert df_hash == prep(equal_df)
    assert df_hash != prep(df_same_column_names)
    assert df_hash != prep(other_df)

    # Mixed column types, including a tuple-valued column.
    df_mixed = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    df_mixed_2 = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    df_mixed_3 = pl.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
    df_mixed_4 = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
    df_mixed_hash = prep(df_mixed)
    assert df_mixed_hash == prep(df_mixed_2)
    assert df_mixed_hash != prep(df_mixed_3)
    assert df_mixed_hash != prep(df_mixed_4)

    # Same numeric value stored under different dtypes must not collide.
    df_u8 = pl.DataFrame({'a': np.array([1], dtype=np.uint8)})
    df_u16 = pl.DataFrame({'a': np.array([1], dtype=np.uint16)})
    df_float = pl.DataFrame({'a': np.array([1], dtype=np.float32)})
    df_u8_hash = prep(df_u8)
    assert df_u8_hash != prep(df_float)
    assert df_u8_hash != prep(df_u16)

    lazy_1 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
    lazy_2 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
    lazy_3 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2], "foobar": 5}).lazy()
    with pytest.raises(TypeError):
        DeepHashPrep(lazy_1)[lazy_1]  # lazy frames cannot be hashed directly

    # Once collected, the frames behave like any other polars DataFrame.
    df_1_hash = prep(lazy_1.collect())
    assert df_1_hash == prep(lazy_2.collect())
    assert df_1_hash != prep(lazy_3.collect())


class TestDeepHashSHA:
"""DeepHash with SHA Tests."""
Expand Down

0 comments on commit ee36c1d

Please sign in to comment.