Merge pull request #467 from TyberiusPrime/pandas_polars

feature: optional pandas and polars support
seperman · Jul 1, 2024 · ee36c1d · ee36c1d
2 parents 1846b7b + dae46b7
commit ee36c1d
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 2 deletions.
diff --git a/deepdiff/deephash.py b/deepdiff/deephash.py
@@ -14,6 +14,17 @@
                              number_to_string, datetime_normalize, KEY_TO_VAL_STR, short_repr,
                              get_truncate_datetime, dict_, add_root_to_paths)
 from deepdiff.base import Base
+
+try:
+    import pandas
+except ImportError:
+    pandas = False
+
+try:
+    import polars
+except ImportError:
+    polars = False
+
 logger = logging.getLogger(__name__)
 
 UNPROCESSED_KEY = object()
@@ -448,7 +459,6 @@ def _prep_path(self, obj):
         type_ = obj.__class__.__name__
         return KEY_TO_VAL_STR.format(type_, obj)
 
-
     def _prep_number(self, obj):
         type_ = "number" if self.ignore_numeric_type_changes else obj.__class__.__name__
         if self.significant_digits is not None:
@@ -479,7 +489,7 @@ def _prep_tuple(self, obj, parent, parents_ids):
         return result, counts
 
     def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
-        """The main diff method"""
+        """The main hash method"""
         counts = 1
 
         if isinstance(obj, bool):
@@ -529,6 +539,19 @@ def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
         elif isinstance(obj, tuple):
             result, counts = self._prep_tuple(obj=obj, parent=parent, parents_ids=parents_ids)
 
+        elif (pandas and isinstance(obj, pandas.DataFrame)):
+            def gen():
+                yield ('dtype', obj.dtypes)
+                yield ('index', obj.index)
+                yield from obj.items()  # yields (column name, Series) tuples
+            result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)
+        elif (polars and isinstance(obj, polars.DataFrame)):
+            def gen():
+                yield from obj.columns
+                yield from list(obj.schema.items())
+                yield from obj.rows()
+            result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)
+
         elif isinstance(obj, Iterable):
             result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -20,3 +20,5 @@ tomli==2.0.1
 tomli-w==1.0.0
 pydantic==2.7.4
 pytest-benchmark==4.0.0
+pandas>=1.6
+polars>=0.19.11
diff --git a/tests/test_hash.py b/tests/test_hash.py
@@ -744,6 +744,96 @@ def test_hash_numpy_array2_multi_dimensional_can_not_retrieve_individual_array_i
         except Exception as e:
             assert str(e).strip("'") == HASH_LOOKUP_ERR_MSG.format(t1[0])
 
+    def test_pandas(self):
+        import pandas as pd
+        df = pd.DataFrame({"a": [1]})
+        equal_df = pd.DataFrame({"a": [1]})
+        df_same_column_names = pd.DataFrame({"a": [1, 2]})
+        other_df = pd.DataFrame({"b": [1]})
+        df_hash = DeepHashPrep(df)[df]
+        equal_df_hash = DeepHashPrep(equal_df)[equal_df]
+        df_same_column_names_hash = DeepHashPrep(df_same_column_names)[df_same_column_names]
+        other_df_hash = DeepHashPrep(other_df)[other_df]
+        assert df_hash == equal_df_hash
+        assert df_hash != df_same_column_names_hash
+        assert df_hash != other_df_hash
+
+        df_mixed = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
+        df_mixed_2 = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
+        df_mixed_3 = pd.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
+        df_mixed_4 = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
+        df_mixed_hash = DeepHashPrep(df_mixed)[df_mixed]
+        df_mixed_2_hash = DeepHashPrep(df_mixed_2)[df_mixed_2]
+        df_mixed_3_hash = DeepHashPrep(df_mixed_3)[df_mixed_3]
+        df_mixed_4_hash = DeepHashPrep(df_mixed_4)[df_mixed_4]
+        assert df_mixed_hash == df_mixed_2_hash
+        assert df_mixed_hash != df_mixed_3_hash
+        assert df_mixed_hash != df_mixed_4_hash
+
+        df_u8 = pd.DataFrame({'a': np.array([1], dtype=np.uint8)})
+        df_u16 = pd.DataFrame({'a': np.array([1], dtype=np.uint16)})
+        df_float = pd.DataFrame({'a': np.array([1], dtype=np.float32)})
+        df_u8_hash = DeepHashPrep(df_u8)[df_u8]
+        df_u16_hash = DeepHashPrep(df_u16)[df_u16]
+        df_float_hash = DeepHashPrep(df_float)[df_float]
+        assert df_u8_hash != df_float_hash
+        assert df_u8_hash != df_u16_hash
+
+        df_index = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
+        df_index_diff = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 4])
+        df_index_hash = DeepHashPrep(df_index)[df_index]
+        df_index_diff_hash = DeepHashPrep(df_index_diff)[df_index_diff]
+        assert df_index_hash != df_index_diff_hash
+
+    def test_polars(self):
+        import polars as pl
+        df = pl.DataFrame({"a": [1]})
+        equal_df = pl.DataFrame({"a": [1]})
+        df_same_column_names = pl.DataFrame({"a": [1, 2]})
+        other_df = pl.DataFrame({"b": [1]})
+        df_hash = DeepHashPrep(df)[df]
+        equal_df_hash = DeepHashPrep(equal_df)[equal_df]
+        df_same_column_names_hash = DeepHashPrep(df_same_column_names)[df_same_column_names]
+        other_df_hash = DeepHashPrep(other_df)[other_df]
+        assert df_hash == equal_df_hash
+        assert df_hash != df_same_column_names_hash
+        assert df_hash != other_df_hash
+
+        df_mixed = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
+        df_mixed_2 = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
+        df_mixed_3 = pl.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
+        df_mixed_4 = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
+        df_mixed_hash = DeepHashPrep(df_mixed)[df_mixed]
+        df_mixed_2_hash = DeepHashPrep(df_mixed_2)[df_mixed_2]
+        df_mixed_3_hash = DeepHashPrep(df_mixed_3)[df_mixed_3]
+        df_mixed_4_hash = DeepHashPrep(df_mixed_4)[df_mixed_4]
+        assert df_mixed_hash == df_mixed_2_hash
+        assert df_mixed_hash != df_mixed_3_hash
+        assert df_mixed_hash != df_mixed_4_hash
+
+        df_u8 = pl.DataFrame({'a': np.array([1], dtype=np.uint8)})
+        df_u16 = pl.DataFrame({'a': np.array([1], dtype=np.uint16)})
+        df_float = pl.DataFrame({'a': np.array([1], dtype=np.float32)})
+        df_u8_hash = DeepHashPrep(df_u8)[df_u8]
+        df_u16_hash = DeepHashPrep(df_u16)[df_u16]
+        df_float_hash = DeepHashPrep(df_float)[df_float]
+        assert df_u8_hash != df_float_hash
+        assert df_u8_hash != df_u16_hash
+
+        lazy_1 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
+        lazy_2 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
+        lazy_3 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2], "foobar": 5}).lazy()
+        with pytest.raises(TypeError):
+            DeepHashPrep(lazy_1)[lazy_1]  # lazy dfs can not be compared
+        df_1 = lazy_1.collect()
+        df_2 = lazy_2.collect()
+        df_3 = lazy_3.collect()
+        df_1_hash = DeepHashPrep(df_1)[df_1]
+        df_2_hash = DeepHashPrep(df_2)[df_2]
+        df_3_hash = DeepHashPrep(df_3)[df_3]
+        assert df_1_hash == df_2_hash
+        assert df_1_hash != df_3_hash
+
 
 class TestDeepHashSHA:
     """DeepHash with SHA Tests."""