From 47f4526cd5217dd55a88185fdc99c93fed00953e Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 10 Oct 2024 19:05:39 +0100
Subject: [PATCH 001/105] added folder_transform

---
 .../pure_python/transform_file_processor.py   | 15 ++++--
 .../pure_python/transform_orchestrator.py     | 42 ++++++++++------
 .../runtime/transform_file_processor.py       | 41 ++++++++-------
 .../src/data_processing/transform/__init__.py |  2 +
 .../transform/abstract_transform.py           | 16 ++++++
 .../transform/binary_transform.py             |  5 +-
 .../transform/folder_transform.py             | 50 +++++++++++++++++++
 .../runtime/ray/transform_file_processor.py   |  1 +
 .../runtime/ray/transform_orchestrator.py     | 19 ++++---
 .../runtime/spark/transform_file_processor.py |  5 +-
 .../runtime/spark/transform_orchestrator.py   | 25 +++++++---
 11 files changed, 168 insertions(+), 53 deletions(-)
 create mode 100644 data-processing-lib/python/src/data_processing/transform/abstract_transform.py
 create mode 100644 data-processing-lib/python/src/data_processing/transform/folder_transform.py

diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
index 143835dd0..fa3e69e4a 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
@@ -14,7 +14,7 @@
 
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.runtime import AbstractTransformFileProcessor
-from data_processing.transform import AbstractBinaryTransform, TransformStatistics
+from data_processing.transform import AbstractTransform, TransformStatistics
 from data_processing.utils import UnrecoverableException
 
 
@@ -28,7 +28,8 @@ def __init__(
         self,
         data_access_factory: DataAccessFactoryBase,
         statistics: TransformStatistics,
         transform_params: dict[str, Any],
-        transform_class: type[AbstractBinaryTransform],
+        transform_class: type[AbstractTransform],
+        is_folder: bool,
     ):
         """
         Init method
         :param data_access_factory - data access factory
         :param statistics - reference to statistics class
         :param transform_params - transform parameters
         :param transform_class: transform class
+        :param is_folder: folder transform flag
         """
         # invoke superclass
         super().__init__(
             data_access_factory=data_access_factory,
             transform_parameters=dict(transform_params),
+            is_folder=is_folder,
         )
         self.transform_params["statistics"] = statistics
         # Create local processor
@@ -52,7 +55,8 @@ def __init__(
         # Create statistics
         self.stats = statistics
 
-    def _publish_stats(self, stats: dict[str, Any]) -> None:
+
+def _publish_stats(self, stats: dict[str, Any]) -> None:
         self.stats.add_stats(stats)
 
 
@@ -65,17 +69,20 @@ def __init__(
         self,
         data_access_factory: DataAccessFactoryBase,
         transform_params: dict[str, Any],
-        transform_class: type[AbstractBinaryTransform],
+        transform_class: type[AbstractTransform],
+        is_folder: bool
     ):
         """
         Init method
         :param data_access_factory - data access factory
         :param transform_params - transform parameters
         :param transform_class: transform class
+        :param is_folder: folder transform flag
         """
         super().__init__(
             data_access_factory=data_access_factory,
             transform_parameters=dict(transform_params),
+            is_folder=is_folder,
         )
         # Add data access and statistics to the processor parameters
         self.transform_params["data_access"] = self.data_access
diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py 
b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 8692da29e..153eaaf0a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,7 +24,7 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger @@ -48,8 +48,6 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } - - def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -74,15 +72,21 @@ def orchestrate( return 1 # create additional execution parameters runtime = runtime_config.create_transform_runtime() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - if retries > 0: - statistics.add_stats({"data access retries": retries}) - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + if retries > 0: + statistics.add_stats({"data access retries": retries}) + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -99,6 +103,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) else: # using sequential execution @@ -111,6 +116,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) status = "success" return_code = 0 @@ -157,7 +163,8 @@ def _process_transforms( data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ) -> None: """ Process transforms sequentially @@ -167,9 +174,8 @@ def _process_transforms( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag :return: metadata for the execution - - :return: None """ # create executor executor = PythonTransformFileProcessor( @@ -177,6 +183,7 @@ def _process_transforms( statistics=statistics, transform_params=transform_params, transform_class=transform_class, + is_folder=is_folder, ) # process data t_start = time.time() @@ -203,6 +210,7 @@ def _process_transforms_multiprocessor( data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractBinaryTransform], + is_folder: bool ) -> TransformStatistics: """ Process 
transforms using multiprocessing pool
@@ -212,13 +220,17 @@ def _process_transforms_multiprocessor(
     :param data_access_factory: data access factory
     :param transform_params - transform parameters
     :param transform_class: transform class
+    :param is_folder: folder transform flag
     :return: metadata for the execution
     """
     # result statistics
     statistics = TransformStatistics()
     # create processor
     processor = PythonPoolTransformFileProcessor(
-        data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class
+        data_access_factory=data_access_factory,
+        transform_params=transform_params,
+        transform_class=transform_class,
+        is_folder=is_folder,
     )
     completed = 0
     t_start = time.time()
diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py
index d4ec548d8..1d268875f 100644
--- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py
+++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py
@@ -26,11 +26,13 @@ def __init__(
         self,
         data_access_factory: DataAccessFactoryBase,
         transform_parameters: dict[str, Any],
+        is_folder: bool = False,
     ):
         """
         Init method
         :param data_access_factory: Data Access Factory
         :param transform_parameters: Transform parameters
         :param is_folder: folder transform flag
         """
         self.logger = get_logger(__name__)
         # validate parameters
@@ -46,6 +48,7 @@ def __init__(
         # Add data access and statistics to the processor parameters
         self.transform_params = transform_parameters
         self.transform_params["data_access"] = self.data_access
+        self.is_folder = is_folder
 
     def process_file(self, f_name: str) -> None:
         """
@@ -58,25 +61,29 @@ def process_file(self, f_name: str) -> None:
             self.logger.warning("No data_access found. Returning.")
             return
         t_start = time.time()
-        # Read source file
-        filedata, retries = self.data_access.get_file(path=f_name)
-        if retries > 0:
-            self._publish_stats({"data access retries": retries})
-        if filedata is None:
-            self.logger.warning(f"File read resulted in None for {f_name}. Returning.")
-            self._publish_stats({"failed_reads": 1})
-            return
-        self._publish_stats({"source_files": 1, "source_size": len(filedata)})
+        if not self.is_folder:
+            # Read source file only if we are processing a file
+            filedata, retries = self.data_access.get_file(path=f_name)
+            if retries > 0:
+                self._publish_stats({"data access retries": retries})
+            if filedata is None:
+                self.logger.warning(f"File read resulted in None for {f_name}. 
Returning.") + self._publish_stats({"failed_reads": 1}) + return + self._publish_stats({"source_files": 1, "source_size": len(filedata)}) # Process input file try: - # execute local processing - name_extension = TransformUtils.get_file_extension(f_name) self.logger.debug(f"Begin transforming file {f_name}") - out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + if not self.is_folder: + # execute local processing + out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + name_extension = TransformUtils.get_file_extension(f_name) + self.last_file_name = name_extension[0] + self.last_file_name_next_index = None + self.last_extension = name_extension[1] + else: + out_files, stats = self.transform.transform(folder_name=f_name) self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") - self.last_file_name = name_extension[0] - self.last_file_name_next_index = None - self.last_extension = name_extension[1] # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) # Process unrecoverable exceptions @@ -95,10 +102,10 @@ def flush(self) -> None: the hook for them to return back locally stored data and their statistics. :return: None """ - if self.last_file_name is None: + if self.last_file_name is None or self.is_folder: # for some reason a given worker never processed anything. Happens in testing # when the amount of workers is greater than the amount of files - self.logger.debug("skipping flush, no name for file is defined") + self.logger.debug("skipping flush, no name for file is defined or this is a folder transform") return try: t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 6af43ad60..20254e47b 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -1,3 +1,5 @@ +from data_processing.transform.abstract_transform import AbstractTransform +from data_processing.transform.folder_transform import AbstractFolderTransform from data_processing.transform.binary_transform import AbstractBinaryTransform from data_processing.transform.table_transform import AbstractTableTransform from data_processing.transform.transform_statistics import TransformStatistics diff --git a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py new file mode 100644 index 000000000..89db70f42 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py @@ -0,0 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+class AbstractTransform:
+    """
+    Base class for all transform types
+    """
\ No newline at end of file
diff --git a/data-processing-lib/python/src/data_processing/transform/binary_transform.py b/data-processing-lib/python/src/data_processing/transform/binary_transform.py
index 80dff61ea..b313aff2f 100644
--- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py
+++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py
@@ -10,10 +10,11 @@
 # limitations under the License.
 ################################################################################
 
-from typing import Any, TypeVar
+from typing import Any
 
+from data_processing.transform import AbstractTransform
 
-class AbstractBinaryTransform:
+class AbstractBinaryTransform(AbstractTransform):
     """
     Converts input binary file to output file(s) (binary)
     Sub-classes must provide the transform() method to provide the conversion of one binary file to 0 or
diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py
new file mode 100644
index 000000000..866e3286f
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py
@@ -0,0 +1,50 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import Any
+from data_processing.data_access import data_access
+from data_processing.transform import AbstractTransform
+
+
+class AbstractFolderTransform(AbstractTransform):
+    """
+    Converts input folder to output file(s) (binary)
+    Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or
+    more new binary files and metadata.
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This simply stores the given configuration in this instance for later use.
+        """
+        self.config = config
+
+    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        Converts input folder into 0 or more output files.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed.
+        :param folder_name: the name of the folder containing an arbitrary amount of files.
+        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
+                 to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
+                 holding the extension to be used when writing out the new bytes.
+        """
+        raise NotImplemented()
+
+    @staticmethod
+    def get_folders(data_access:data_access) -> list(str):
+        """
+        Compute the list of folders to use.
+ :param data_access - data access class + :return: + """ + raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py index e1fabb144..cdad1309f 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py @@ -35,6 +35,7 @@ def __init__(self, params: dict[str, Any]): super().__init__( data_access_factory=params.get("data_access_factory", None), transform_parameters=dict(params.get("transform_params", {})), + is_folder=params.get("is_folder", False) ) # Create statistics self.stats = params.get("statistics", None) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 42eba47a6..8276eb56c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -16,6 +16,7 @@ import ray from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import AbstractFolderTransform from data_processing_ray.runtime.ray import ( RayTransformExecutionConfiguration, RayTransformFileProcessor, @@ -56,13 +57,18 @@ def orchestrate( # create transformer runtime runtime = runtime_config.create_transform_runtime() resources = RayUtils.get_cluster_resources() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -84,6 +90,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), "statistics": statistics, + "is_folder": is_folder, } logger.debug("Creating actors") processors = RayUtils.create_actors( diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index d63664ac4..a0968ab1d 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -29,12 +29,15 @@ def __init__( data_access_factory: DataAccessFactoryBase, runtime_configuration: SparkTransformRuntimeConfiguration, statistics: TransformStatistics, + is_folder: bool, ): """ Init method """ super().__init__( - data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params() + data_access_factory=data_access_factory, + 
transform_parameters=runtime_configuration.get_transform_params(),
+            is_folder=is_folder,
         )
         # Add data access and statistics to the processor parameters
         self.runtime_configuration = runtime_configuration
diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py
index 57a6c58fc..11589dbaf 100644
--- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py
+++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py
@@ -15,7 +15,7 @@
 from datetime import datetime
 
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.transform import TransformStatistics
+from data_processing.transform import TransformStatistics, AbstractFolderTransform
 from data_processing.utils import GB, get_logger
 from data_processing_spark.runtime.spark import (
     SparkTransformFileProcessor,
@@ -68,7 +68,10 @@ def process_partition(iterator):
         runtime = runtime_conf.create_transform_runtime()
         # create file processor
         file_processor = SparkTransformFileProcessor(
-            data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics
+            data_access_factory=d_access_factory,
+            runtime_configuration=runtime_conf,
+            statistics=statistics,
+            is_folder=is_folder,
         )
         first = True
         for f in iterator:
@@ -92,13 +95,19 @@ def process_partition(iterator):
         return list(statistics.get_execution_stats().items())
 
     num_partitions = 0
+    is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
     try:
-        # Get files to process
-        files, profile, retries = data_access.get_files_to_process()
-        if len(files) == 0:
-            logger.error("No input files to process - exiting")
-            return 0
-        logger.info(f"Number of files is {len(files)}, source profile {profile}")
+        if is_folder:
+            # folder transform
+            files = AbstractFolderTransform.get_folders(data_access=data_access)
+            logger.info(f"Number of folders is {len(files)}")  # Get files to process
+        else:
+            # Get files to process
+            files, profile, retries = data_access.get_files_to_process()
+            if len(files) == 0:
+                logger.error("No input files to process - exiting")
+                return 0
+            logger.info(f"Number of files is {len(files)}, source profile {profile}")
         # process data
         logger.debug("Begin processing files")
         # process files split by partitions

From 5fd20a125a71a40d6db7dc958dce50321369f3c0 Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 10 Oct 2024 19:13:01 +0100
Subject: [PATCH 002/105] added folder_transform

---
 .../runtime/pure_python/transform_orchestrator.py             | 2 +-
 .../python/src/data_processing/transform/folder_transform.py  | 4 ++--
 .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +-
 .../runtime/spark/transform_orchestrator.py                   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py
index 153eaaf0a..d51f80a8a 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py
@@ -76,7 +76,7 @@ def orchestrate(
     try:
         if is_folder:
             # folder transform
-            files = AbstractFolderTransform.get_folders(data_access=data_access)
+            files = 
AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. - :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 11589dbaf..a4c0c5835 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -99,7 +99,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From 38b47259977fbe64ead50231a52660e375625add Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 003/105] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- .../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def 
_publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 
eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. - :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From a3abf21cda7e280f7089555bc974058d193b502f Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 08:48:00 +0100 Subject: [PATCH 004/105] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index a4c0c5835..c404559d8 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -99,7 +99,8 @@ def process_partition(iterator): try: if 
is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index f16b09520..3c9fca76f 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From af8475df9648a76cb268b284f60de3597fa579c8 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 11 Oct 2024 10:20:48 -0400 Subject: [PATCH 005/105] Fuzzy dedup pure python implementation Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/README.md | 11 + .../universal/fdedup/python/pyproject.toml | 55 ++ .../universal/fdedup/python/src/Murmur_MH.py | 99 ++++ .../src/cluster_analysis_local_python.py | 46 ++ .../python/src/cluster_analysis_transform.py | 229 ++++++++ .../src/cluster_analysis_transform_python.py | 44 ++ .../python/src/data_cleaning_local_python.py | 56 ++ .../python/src/data_cleaning_transform.py | 150 ++++++ .../src/data_cleaning_transform_python.py | 83 +++ .../fdedup/python/src/file_copy_util.py | 158 ++++++ .../fdedup/python/src/service_orchestrator.py | 265 +++++++++ .../python/src/signature_calc_local_python.py | 60 +++ .../python/src/signature_calc_transform.py | 504 ++++++++++++++++++ .../src/signature_calc_transform_python.py | 44 ++ 14 files changed, 1804 insertions(+) create mode 100644 transforms/universal/fdedup/python/README.md create mode 100644 transforms/universal/fdedup/python/pyproject.toml create mode 100644 transforms/universal/fdedup/python/src/Murmur_MH.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_local_python.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_transform.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_local_python.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_transform.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_transform_python.py create mode 100644 transforms/universal/fdedup/python/src/file_copy_util.py create mode 100644 transforms/universal/fdedup/python/src/service_orchestrator.py create mode 100644 transforms/universal/fdedup/python/src/signature_calc_local_python.py create mode 100644 transforms/universal/fdedup/python/src/signature_calc_transform.py create mode 100644 
transforms/universal/fdedup/python/src/signature_calc_transform_python.py

diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md
new file mode 100644
index 000000000..34f18c73b
--- /dev/null
+++ b/transforms/universal/fdedup/python/README.md
@@ -0,0 +1,11 @@
+# Fuzzy Dedup
+
+Please see the set of
+[transform project conventions](../../../README.md)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+
+The basic implementation of fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see
+[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details.
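+
+As a minimal, illustrative sketch (not part of the transform API), the `Murmur_MH`
+helper added in this package can be used directly to estimate the similarity of two
+documents; the word-level shingling below is an assumption made only for this example:
+
+```python
+from Murmur_MH import Murmur_MH
+
+mh = Murmur_MH(num_perm=64, seed=42)
+# shingles here are simply words; real pipelines may use k-word shingles
+sig1 = mh.minhash("the quick brown fox jumps over the lazy dog".split())
+sig2 = mh.minhash("the quick brown fox jumped over the lazy dog".split())
+print(Murmur_MH.jaccard(sig1, sig2))  # estimated Jaccard similarity in [0, 1]
+```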
\ No newline at end of file
diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml
new file mode 100644
index 000000000..f2b9d8268
--- /dev/null
+++ b/transforms/universal/fdedup/python/pyproject.toml
@@ -0,0 +1,55 @@
+[project]
+name = "dpk_fdedup_transform_python"
+version = "0.3.0.dev0"
+requires-python = ">=3.10"
+description = "Fuzzy Dedup Transform for Python"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" },
+    { name = "Constantin Adam", email = "cmadam@us.ibm.com" },
+]
+dependencies = [
+    "data-prep-toolkit==0.2.2.dev0",
+    "pyarrow==16.1.0",
+    "pyyaml>=6.0.2",
+    "boto3>=1.34.69",
+    "kubernetes>=30.1.0",
+    "polars>=1.6.0",
+    "disjoint-set>=0.8.0",
+    "scipy>=1.14.1",
+    "numpy<1.29.0",
+    "sentencepiece>=0.2.0",
+    "mmh3>=4.1.0",
+]
+
+[build-system]
+requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
+build-backend = "setuptools.build_meta"
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.0.1",
+]
+
+[options]
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]
diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py
new file mode 100644
index 000000000..e3442ba02
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/Murmur_MH.py
@@ -0,0 +1,99 @@
+import logging
+import os
+from typing import List, Set
+
+import mmh3
+import numpy as np
+
+
+class Murmur_MH:
+    def __init__(self, num_perm=64, seed=42, hashfunc=None):
+        self.seed = seed
+        self.num_perm = num_perm  # the number of buckets, i.e. the vector length after self.minhash() call
+        self.permutations = self._init_permutations(seed, num_perm)
+
+    def _init_permutations(self, seed, num_perm):
+        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
+        max_int = np.uint64((1 << 64) - 1)
+        # initialize pseudo random number generator with given seed value
+        gen = np.random.RandomState(seed)
+        # get self.num_perm pseudo random numbers between 2 and max_int (excl)
+        permutations = np.array(
+            [gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)],
+            dtype=np.uint64,
+        ).T
+        # make all even pseudo random numbers odd by adding 1
+        permutations[permutations % 2 == 0] += 1
+        return permutations
+
+    def minhash(self, shingles: List[str]):
+        """return np.array of minhash"""
+        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
+        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
+        return (
+            np.right_shift(
+                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
+                32,
+            )
+            .astype(np.uint32)
+            .min(axis=0)
+        )
+
+    def minhash2(self, shingles: List[str], doc_len: int):
+        """
+        For each shingle (i.e., a group of k words) this generates a digest value based on the
+        mmh3 hash function (32-bit).
+
+        return tuple (A, B)
+        A = an array of values = np.array of minhash
+        B = document_length = number of characters"""
+        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
+        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
+        return (
+            np.right_shift(
+                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
+                32,
+            )
+            .astype(np.uint32)
+            .min(axis=0),
+            doc_len,
+        )
+
+    def minhash2_nosalt(self, shingles: List[str], doc_len: int, doc_id: int):
+        """
+        For each shingle (i.e., a group of k words) this generates a digest value based on the
+        mmh3 hash function (32-bit).
+
+        return tuple (A, B)
+        A = an array of values = np.array of minhash
+        B = document_length = number of characters"""
+        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
+        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
+        return (
+            np.right_shift(
+                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
+                32,
+            )
+            .astype(np.uint32)
+            .min(axis=0)
+            .tolist(),
+            doc_len,
+            doc_id,
+        )
+
+    @staticmethod
+    def jaccard(mh1: np.array, mh2: np.array) -> float:
+        """
+        The Jaccard similarity measures the similarity between two sets of data
+        to see which members are shared and distinct.
+
+        The Jaccard similarity is calculated by dividing the number of observations
+        in both sets by the number of observations in either set.
+
+        Developed by Paul Jaccard, the index ranges from 0 to 1.
+        The closer to 1, the more similar the two sets of data.
+
+        As a document is represented by a set, we use the Jaccard similarity to see how similar two documents are.
+        """
+        assert len(mh1) == len(mh2)
+        return np.count_nonzero(mh1 == mh2) / len(mh1)
diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
new file mode 100644
index 000000000..dcfc9a7e4
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
@@ -0,0 +1,46 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from cluster_analysis_transform_python import (
+    ClusterAnalysisPythonTransformConfiguration,
+)
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove"))
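+# this local example reads the consolidated band files and writes the list of documents to remove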
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # execution info
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+}
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    print(sys.argv)
+    # create launcher
+    launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration())
+    # Launch python to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
new file mode 100644
index 000000000..5ad18362a
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
@@ -0,0 +1,229 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import os
+from argparse import ArgumentParser, Namespace
+from typing import Any, List, Tuple
+
+import numpy as np
+import polars as pl
+import pyarrow as pa
+from data_processing.transform import AbstractTableTransform, TransformConfiguration
+from data_processing.utils import CLIArgumentProvider, get_logger
+from Murmur_MH import Murmur_MH
+
+
+short_name = "cluster"
+cli_prefix = f"{short_name}_"
+
+# configuration keys
+jaccard_similarity_threshold_key = "jaccard_similarity_threshold"
+""" This key holds the Jaccard similarity threshold above which two documents are duplicates"""
+
+# command line arguments
+jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}"
+""" Jaccard similarity threshold above which two documents are duplicates"""
+
+captured_arg_keys = [
+    jaccard_similarity_threshold_key,
+]
+
+# defaults
+jaccard_similarity_threshold_default = 0.8
+""" Default Jaccard similarity threshold above which two documents are duplicates"""
+
+
+class ClusterAnalysisTransform(AbstractTableTransform):
+    """
+    This is the second transform of the fuzzy dedup pipeline. It runs in parallel:
+    for each band, the hashing interval is divided into segments. A cluster analysis
+    uses as input all the parquet files from a segment of a band. The `bands` output
+    of the signature calculation, the first transform in the fuzzy dedup pipeline,
+    contains all the data for a given segment s of a specific band b in the
+    subfolder `bands/band=b/segment=s`.
+    The transform loads all the parquet files in the `bands/band=b/segment=s`
+    subfolder. Each one of these parquet files has two columns: the `band_hash`
+    and a `data` structure, which includes the `document_id`, the `minhashes` and
+    the `document_size` fields. Once all the files have been loaded in a single
+    dataframe, a `group_by` operation on the `band_hash` field is performed on
+    that dataframe. All the documents that have the same band_hash are grouped
+    in a cluster. Subsequently, the documents of each cluster are sorted in
+    descending order according to their size, and a Jaccard similarity is
+    calculated between the cluster documents. The documents for which the Jaccard
+    similarity is above the `jaccard_similarity_threshold` remain in the cluster,
+    the others are removed from the cluster. Finally, from each cluster that has
+    more than one document after running the Jaccard similarity, we select a
+    document to keep (the largest size document), and mark the other documents as
+    duplicates. The resulting clusters are saved in a file for further analysis.
+
+    Args:
+        jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments
+        defined by the companion runtime, ClusterAnalysisTransformRuntime.
+ """ + super().__init__(config) + self.jaccard_similarity_threshold = config.get( + jaccard_similarity_threshold_key, jaccard_similarity_threshold_default + ) + self.logger = get_logger(__name__) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + bands_dataframe = pl.from_arrow(table) + docs2remove_list = [] + # clustering + bands_dataframe_groups = bands_dataframe.group_by("band_hash").agg("document_data") + bands_dataframe_cluster = bands_dataframe_groups.with_columns( + cluster_length=pl.col("document_data").list.len() + ).filter(pl.col("cluster_length") > 1) + self.logger.info(f"file_name = {file_name}") + num_clusters = len(bands_dataframe_cluster) + if num_clusters > 0: + sum_cdocs = bands_dataframe_cluster.select(pl.sum("cluster_length")).item() + max_cdocs = bands_dataframe_cluster.select(pl.max("cluster_length")).item() + min_cdocs = bands_dataframe_cluster.select(pl.min("cluster_length")).item() + avg_cdocs = bands_dataframe_cluster.select(pl.mean("cluster_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + bands_dataframe_response = self.process_bands(bands_dataframe_cluster) + + filtered_doc2remove_dataframe = bands_dataframe_response.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_doc2remove_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_doc2remove_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_doc2remove_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_doc2remove_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_doc2remove_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + + # Explode the 'docs_to_remove' column + doc2remove_exploded_dataframe = filtered_doc2remove_dataframe.explode("docs_to_remove") + table = doc2remove_exploded_dataframe.to_arrow() + self.logger.info(f"{len(doc2remove_exploded_dataframe)} documents marked to remove") + metadata = {"nrows": len(table)} + return [table], metadata + + def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: + # Define the schema with specific data types + schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} + doc_ids_lists = [] + docs_to_remove_lists = [] + len_of_docs2remove_lists = [] + for row in df.iter_rows(named=True): + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self.jaccard_distance_calculation(row) + doc_ids_lists += doc_ids_list + docs_to_remove_lists += docs_to_remove_list + len_of_docs2remove_lists += len_of_docs2remove_list + processed_rows = pl.DataFrame( + { + "first_doc": doc_ids_lists, + "docs_to_remove": docs_to_remove_lists, + "docs_to_remove_length": len_of_docs2remove_lists, + }, + schema=schema, + ) + return processed_rows + + def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + # Process row and return a new list of Series or a new row + threshold = self.jaccard_similarity_threshold + doc_ids_list = [] + docs_to_remove_list = [] + 
len_of_docs2remove_list = []
+        # sort documents
+        document_data = row["document_data"]
+
+        # Sort the list by 'document_length'
+        sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"]))
+
+        # Extracting int_id_column values into a list
+        doc_list = list(set([item["int_id_column"] for item in sorted_document_data]))
+
+        # Creating a dictionary with int_id_column as key and minhashes as value
+        doc_minhashes = {item["int_id_column"]: item["minhashes"] for item in sorted_document_data}
+
+        while len(doc_list) > 1:
+            docs_to_remove = []
+            new_doc_list = []
+            # this is the document we are going to keep
+            first_doc = doc_list[0]
+            first_mh = doc_minhashes[first_doc]
+            for int_id_column in doc_list[1:]:
+                doc_mh = doc_minhashes[int_id_column]
+                distance = Murmur_MH.jaccard(np.array(first_mh), np.array(doc_mh))
+                if distance >= threshold:
+                    docs_to_remove.append(int_id_column)
+                else:
+                    new_doc_list.append(int_id_column)
+            if len(docs_to_remove) > 0:
+                docs_to_remove = list(set(docs_to_remove))
+                doc_ids_list.append(first_doc)
+                docs_to_remove_list.append(docs_to_remove)
+                len_of_docs2remove_list.append(len(docs_to_remove))
+            doc_list = new_doc_list
+
+        return doc_ids_list, docs_to_remove_list, len_of_docs2remove_list
+
+
+class ClusterAnalysisTransformConfiguration(TransformConfiguration):
+
+    """
+    Provides support for configuring and using the associated Transform class,
+    including configuration with CLI args.
+    """
+
+    def __init__(self):
+        super().__init__(
+            name=short_name,
+            transform_class=ClusterAnalysisTransform,
+            remove_from_metadata=[],
+        )
+        self.logger = get_logger(__name__, level="INFO")
+
+    def add_input_params(self, parser: ArgumentParser) -> None:
+        """
+        Add Transform-specific arguments to the given parser.
+        This will be included in a dictionary used to initialize the ClusterAnalysisTransform.
+        By convention a common prefix should be used for all transform-specific CLI args
+        (e.g., noop_, pii_, etc.)
+        """
+        parser.add_argument(
+            f"--{jaccard_similarity_threshold_cli_param}",
+            type=float,
+            default=jaccard_similarity_threshold_default,
+            help="Jaccard similarity threshold above which two documents are duplicates",
+        )
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        """
+        Validate and apply the arguments that have been parsed
+        :param args: user defined arguments.
+        :return: True if validation passes, False otherwise
+        """
+        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+        self.params = self.params | captured
+        self.logger.info(f"{short_name} parameters are : {self.params}")
+        return True
diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py
new file mode 100644
index 000000000..28d96f428
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py
@@ -0,0 +1,44 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################ + +import time + +from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(ClusterAnalysisTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py new file mode 100644 index 000000000..4295e4e82 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -0,0 +1,56 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) + # create launcher + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py new file mode 100644 index 000000000..f03b6c1d0 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -0,0 +1,150 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger + + +short_name = "fdclean" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +duplicate_list_location_key = "duplicate_list_location" +""" This key holds the location of the list of duplicate documents marked for removal""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" +""" Location of the list of duplicate documents marked for removal""" + +captured_arg_keys = [ + document_id_column_key, + duplicate_list_location_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +duplicate_list_location_default = None +""" Default location of the list of duplicate documents marked for removal""" + + +class DataCleaningTransform(AbstractTableTransform): + """ + This is the third transform of the fuzzy dedup pipeline. It takes as input + the list of the documents to remove (identified as duplicates during the + cluster analysis phase, and the original dataset. Each dataset file is + imported into a table, and the documents that are in the documents to remove + list are filtered out from that table. 
The output is a new dataset, which + keeps the directory structure of the input dataset, but has all the fuzzy + duplicates removed. + + Args: + duplicate_location: location (local or s3) of the duplicate document list + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. + """ + super().__init__(config) + self.logger = get_logger(__name__) + self.document_id_column = config.get(document_id_column_key, document_id_column_default) + self.duplicate_list_location = config.get(duplicate_list_location_key, duplicate_list_location_default) + contents = config.get("df") + self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) + self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + input_df = pl.from_arrow(table) + # handle the case when the doc_id columns in the input dataframe and the + # docs_to_remove_df have different types, i.e. one is int32 and the + # other is int64 + input_doc_id_type = input_df[self.document_id_column].dtype + if input_doc_id_type != self.docs_to_remove_df[self.document_id_column].dtype: + self.docs_to_remove_df = self.docs_to_remove_df.select( + pl.col(self.document_id_column).cast(input_doc_id_type) + ) + filtered_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti") + filtered_table = filtered_df.to_arrow() + metadata = { + "input_files": 1, + "input_docs": table.num_rows, + "input_bytes": table.nbytes, + "output_files": 1, + "output_docs": filtered_table.num_rows, + "output_bytes": filtered_table.nbytes, + "filtered_docs": (table.num_rows - filtered_table.num_rows), + "filtered_bytes": (table.nbytes - filtered_table.nbytes), + } + return [filtered_table], metadata + + +class DataCleaningTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=DataCleaningTransform, + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) + """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{duplicate_list_location_cli_param}", + type=str, + required=True, + default=duplicate_list_location_default, + help="location of duplicate document list that are marked for removal", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. 
+ :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py new file mode 100644 index 000000000..c0b5fefd6 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -0,0 +1,83 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +from data_cleaning_transform import DataCleaningTransformConfiguration +from data_processing.data_access import DataAccessFactoryBase +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + DefaultPythonTransformRuntime, + PythonTransformRuntimeConfiguration, +) +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class DataCleaningPythonRuntime(DefaultPythonTransformRuntime): + """ + Data cleaning runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :param statistics - reference to statistics actor + :param files - list of files to process + :return: dictionary of transform init params + """ + duplicate_list_location = self.params["duplicate_list_location"] + data_access = data_access_factory.create_data_access() + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for fuzzy dedup data cleaning step + as required by the PythonTransformLauncher. 
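+    It wires in DataCleaningPythonRuntime, which downloads the consolidated duplicate
+    list and passes its contents to the transform as the "df" parameter.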
+ """ + + def __init__(self): + """ + Initialization + :param: transform_configuration - transform configuration class + :param: runtime_class - name of the runtime configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(DataCleaningTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/file_copy_util.py b/transforms/universal/fdedup/python/src/file_copy_util.py new file mode 100644 index 000000000..87867e532 --- /dev/null +++ b/transforms/universal/fdedup/python/src/file_copy_util.py @@ -0,0 +1,158 @@ +import argparse +import io +import os +import re + +import polars as pl +from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase +from data_processing.utils import ParamsUtils, get_logger + + +""" +This class reads all the parquet files inside an `input_folder` of the type +`.../bands/band=b/segment=s`, concatenates those files, and writes them into a +file called `.../consolidated_bands/band_b_segment_s.parquet` +""" + + +class FileCopyUtil: + def __init__( + self, + data_access_factory: DataAccessFactoryBase, + config: dict, + stats: dict, + ): + self.data_access_factory = data_access_factory + self.root_folder = config.get("root_folder") + self.logger = get_logger(__name__, level="INFO") + + def copy_data(self, subfolder_name: str, data_type: str): + self.logger.info(f"copy_data(): subfolder_name = {subfolder_name}, data_type = {data_type}") + if self.data_access_factory.s3_config is not None: + _, root_folder = self.root_folder.split("://") + else: + root_folder = self.root_folder + self.logger.debug(f"copy_data(): root_folder = {root_folder}") + if data_type == "bands": + match = re.match(r"^band=(\d+)/segment=(\d+)$", subfolder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) + else: + raise ValueError(f"Wrong subfolder_name {subfolder_name}, should be band=b/segment=s") + input_folder = os.path.join( + root_folder, + "bands", + f"band={band}", + f"segment={segment}/", + ) + output_path = os.path.join( + root_folder, + "bands_consolidated", + f"band_{band}_segment_{segment}.parquet", + ) + elif data_type == "docs_to_remove": + input_folder = os.path.join( + root_folder, + f"{subfolder_name}/", + ) + output_path = os.path.join( + root_folder, + "docs_to_remove_consolidated", + f"docs_to_remove_consolidated.parquet", + ) + self.logger.debug(f"copy_data(): input_folder = {input_folder}, output_path = {output_path}") + + data_access = self.data_access_factory.create_data_access() + self.logger.debug(f"copy_data(): getting the data from the input_folder {input_folder}") + file_dict, status = data_access.get_folder_files( + input_folder, + extensions=[".parquet"], + return_data=True, + ) + self.logger.info(f"Found {len(file_dict)} files in input folder {input_folder}") + consolidated_df = pl.DataFrame() + for fname, contents in file_dict.items(): + df = pl.read_parquet(io.BytesIO(contents)) + # self.logger.info(f"{fname} has {len(df)} rows") + consolidated_df = consolidated_df.vstack(df) + if "docs_to_remove" in consolidated_df.columns: + consolidated_df = consolidated_df.select("docs_to_remove").unique() + output_table = consolidated_df.to_arrow() + self.logger.info( + f"Writing to {output_path} table with {output_table.num_rows} rows and {output_table.nbytes:,d} bytes" + 
) + stats = { + "input_files": len(file_dict), + "input_bytes": sum(len(v) for v in file_dict.values()), + "input_rows": output_table.num_rows, + "output_files": 1, + "output_bytes": output_table.nbytes, + "output_rows": output_table.num_rows, + } + data_access.save_table(output_path, output_table) + return stats + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--root_folder", + type=str, + default=os.getenv("HOME", os.path.join(os.sep)), + help="root folder", + ) + parser.add_argument( + "--subfolder_name", + type=str, + default=os.path.join("band=0", "segment=0"), + help="subfolder name", + ) + parser.add_argument( + "--data_type", + type=str, + default="docs_to_remove", + help="Processing either bands or docs_to_remove", + ) + parser.add_argument( + "--use_s3", + type=bool, + default=False, + help="use s3", + ) + args = parser.parse_args() + root_folder = args.root_folder + config = {"root_folder": args.root_folder} + input_folder = args.root_folder + output_folder = args.root_folder + data_type = args.data_type + data_access_factory: DataAccessFactoryBase = DataAccessFactory() + daf_args = [] + if args.use_s3: + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + s3_config = { + "input_folder": root_folder, + "output_folder": root_folder, + } + daf_args.append("--data_s3_cred") + daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) + daf_args.append("--data_s3_config") + daf_args.append(ParamsUtils.convert_to_ast(s3_config)), + else: + local_config = { + "input_folder": root_folder, + "output_folder": root_folder, + } + daf_args.append("--data_local_config") + daf_args.append(ParamsUtils.convert_to_ast(local_config)) + daf_parser = argparse.ArgumentParser() + data_access_factory.add_input_params(parser=daf_parser) + data_access_factory_args = daf_parser.parse_args(args=daf_args) + data_access_factory.apply_input_params(args=data_access_factory_args) + stats = {} + fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) + fcu.copy_data(args.subfolder_name, args.data_type) diff --git a/transforms/universal/fdedup/python/src/service_orchestrator.py b/transforms/universal/fdedup/python/src/service_orchestrator.py new file mode 100644 index 000000000..897a3210c --- /dev/null +++ b/transforms/universal/fdedup/python/src/service_orchestrator.py @@ -0,0 +1,265 @@ +import argparse +import os +import sys + +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from file_copy_util import FileCopyUtil +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +class ServiceOrchestrator: + def __init__(self, global_params=None): + self.global_params = global_params or {} + + def execute_service(self, service_logic, service_params): + # Call the generic service logic + service_logic(service_params) + + def orchestrate(self, service_logic): + service_list = self.global_params["services"].split(",") + + for service in service_list: + if service == "SignatureCalculation": + params = create_transform_args_payload(args, service) + 
params["service_type"] = "SignatureCalculation" + self.execute_service(service_logic, params) + elif service == "ClusterAnalysis": + params = create_transform_args_payload(args, service) + params["service_type"] = "ClusterAnalysis" + self.execute_service(service_logic, params) + elif service == "DataCleaning": + params = create_transform_args_payload(args, service) + params["service_type"] = "DataCleaning" + self.execute_service(service_logic, params) + elif service == "BandsFileCopy": + params = args + params["service_type"] = "BandsFileCopy" + self.execute_service(service_logic, params) + elif service == "DocsToRemoveFileCopy": + params = args + params["service_type"] = "DocsToRemoveFileCopy" + self.execute_service(service_logic, params) + else: + print(f"Warning: {service} is not a recognized service.") + + +def generic_service_logic(params): + print("Service executed with parameters:", params) + service_type = params["service_type"] + use_s3 = params["use_s3"] + # Remove the 'service_type' key + params.pop("service_type", None) # Using pop() method + + if service_type == "SignatureCalculation" or service_type == "ClusterAnalysis" or service_type == "DataCleaning": + # Set the simulated command line args + params.pop("num_permutations", None) # Using pop() method + params.pop("num_bands", None) # Using pop() method + params.pop("num_segments", None) # Using pop() method + params.pop("use_s3", None) # Using pop() method + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + if use_s3: + sys.argv.append("--data_s3_cred") + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + + if service_type == "SignatureCalculation": + runtime_config = SignatureCalculationPythonTransformConfiguration() + launch_transform_service(runtime_config) + elif service_type == "ClusterAnalysis": + runtime_config = ClusterAnalysisPythonTransformConfiguration() + launch_transform_service(runtime_config) + elif service_type == "DataCleaning": + runtime_config = DataCleaningPythonTransformConfiguration() + launch_transform_service(runtime_config) + elif service_type == "BandsFileCopy": + launch_file_copy_service(params, service_type) + elif service_type == "DocsToRemoveFileCopy": + launch_file_copy_service(params, service_type) + + +def launch_transform_service(params): + # create launcher + launcher = PythonTransformLauncher(runtime_config=params) + # Launch the ray actor(s) to process the input + launcher.launch() + + +def launch_file_copy_service(args, service_type): + root_folder = os.path.join(args["root_folder"], args["output_folder"]) + data_type = None + if service_type == "BandsFileCopy": + data_type = "bands" + # Get files to process + files = [ + f"band={band}/segment={segment}" + for band in range(args["num_bands"]) + for segment in range(args["num_segments"]) + ] + elif service_type == "DocsToRemoveFileCopy": + files = ["docs_to_remove"] + data_type = "docs_to_remove" + config = {"root_folder": root_folder} + data_access_factory: DataAccessFactoryBase = DataAccessFactory() + daf_args = [] + + if args["use_s3"]: + + s3_config = { + "input_folder": root_folder, + "output_folder": root_folder, + } + daf_args.append("--data_s3_cred") + daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) + daf_args.append("--data_s3_config") + daf_args.append(ParamsUtils.convert_to_ast(s3_config)), + else: + + # Construct folders + local_config = { + "input_folder": root_folder, + "output_folder": os.path.abspath(os.path.join(args["root_folder"], args["output_folder"])), + } + 
daf_args.append("--data_local_config") + daf_args.append(ParamsUtils.convert_to_ast(local_config)) + + daf_parser = argparse.ArgumentParser() + data_access_factory.add_input_params(parser=daf_parser) + data_access_factory_args = daf_parser.parse_args(args=daf_args) + data_access_factory.apply_input_params(args=data_access_factory_args) + stats = {} + fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) + for file in files: + fcu.copy_data(file, data_type) + + +def create_transform_args_payload(args, service): + print(args) + # Construct folders + input_folder = os.path.join(args["root_folder"], args["input_folder"]) + output_folder = os.path.join(args["root_folder"], args["output_folder"]) + if service == "ClusterAnalysis": + input_folder = os.path.join(args["root_folder"], args["output_folder"], "bands_consolidated") + output_folder = os.path.join(args["root_folder"], args["output_folder"], "docs_to_remove") + elif service == "DataCleaning": + output_folder = os.path.join(args["root_folder"], args["output_folder"], "cleaned") + duplicate_location = os.path.join( + args["root_folder"], + args["output_folder"], + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + + # Create a local configuration + local_conf = {"input_folder": input_folder, "output_folder": output_folder} + + # Create parameters + params = { + "num_permutations": args["num_permutations"], + "num_bands": args["num_bands"], + "num_segments": args["num_segments"], + "use_s3": args["use_s3"], + } + + if args["use_s3"]: + params["data_s3_config"] = ParamsUtils.convert_to_ast(local_conf) + else: + params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + + # add extra + if service == "DataCleaning": + short_name = "fdclean" + cli_prefix = f"{short_name}_" + + # configuration keys + document_id_column_key = "document_id_column" + """ This key holds the name of the column storing the unique ID assigned to each document""" + duplicate_list_location_key = "duplicate_list_location" + """ This key holds the location of the list of duplicate documents marked for removal""" + + # command line arguments + document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" + """ Name of the column storing the unique ID assigned to each document""" + duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" + """ Location of the list of duplicate documents marked for removal""" + + params[document_id_column_cli_param] = "int_id_column" + params[duplicate_list_location_cli_param] = duplicate_location + + return params + + +def create_file_copy_args_payload(args): + daf_args = [] + local_config = { + "input_folder": args.root_folder, + "output_folder": args.root_folder, + } + daf_args.append("--data_local_config") + daf_args.append(ParamsUtils.convert_to_ast(local_config)) + data_access_factory: DataAccessFactoryBase = DataAccessFactory() + daf_parser = argparse.ArgumentParser() + data_access_factory.add_input_params(parser=daf_parser) + data_access_factory_args = daf_parser.parse_args(args=daf_args) + data_access_factory.apply_input_params(args=data_access_factory_args) + return data_access_factory + + +def parse_args(): + parser = argparse.ArgumentParser(description="Service Orchestrator") + + # Define command line arguments + parser.add_argument("--root_folder", type=str, required=True, help="Root folder path") + parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") + parser.add_argument("--output_folder", 
type=str, required=True, help="Output folder path") + + parser.add_argument( + "--contents_column", type=str, default="text", help="Name of the column that holds document text" + ) + parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") + parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") + parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") + parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") + + # Single argument for service execution + parser.add_argument( + "--services", + type=str, + required=True, + help="Comma-separated list of services to run (e.g., SignatureCalculation,BandsFileCopy,ClusterAnalysis,DocsToRemoveFileCopy,DataCleaning)", + ) + + parser.add_argument( + "--use_s3", + type=bool, + default=False, + help="use s3", + ) + + args = parser.parse_args() + return vars(args) # Convert Namespace to dictionary + + +if __name__ == "__main__": + + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + + # Parse command line arguments + args = parse_args() + + # Initialize the orchestrator + orchestrator = ServiceOrchestrator(global_params=args) + + # Example service execution (if you had defined services) + orchestrator.orchestrate(generic_service_logic) diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py new file mode 100644 index 000000000..eb958ee3d --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -0,0 +1,60 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +# # create parameters +# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +# output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output_second")) +# local_conf = { +# "input_folder": input_folder, +# "output_folder": output_folder +# } +# code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +# params = { +# # Data access. 
Only required parameters are specified +# "data_local_config": ParamsUtils.convert_to_ast(local_conf), +# # execution info +# "runtime_pipeline_id": "pipeline_id", +# "runtime_job_id": "job_id", +# "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +# "minhash_num_permutations":112, +# "minhash_num_bands":14, +# "minhash_num_segments":2 +# } + + +if __name__ == "__main__": + # Set the simulated command line args + # sys.argv = ParamsUtils.dict_to_req(d=params) + # print(sys.argv) + + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + + # create launcher + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py new file mode 100644 index 000000000..7ac8eb057 --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -0,0 +1,504 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
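Before reading the transform below, it helps to recall why banding works: with `b` bands of `r` minhashes each, two documents whose true Jaccard similarity is `s` collide in at least one band with probability `1 - (1 - s^r)^b`. A quick check of the FineWeb defaults used in this module (14 bands of 8 minhashes):

```python
# S-curve for the default banding parameters: 14 bands x 8 minhashes = 112 permutations
b, r = 14, 8
for s in (0.5, 0.75, 0.9):
    p = 1 - (1 - s**r) ** b
    print(f"similarity {s}: candidate probability {p:.3f}")
# ~0.053 at s=0.5, ~0.772 at s=0.75, ~1.000 at s=0.9: a sharp cutoff near the 0.75 threshold
```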
+################################################################################ +import os +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Any, List + +import mmh3 +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider +from Murmur_MH import Murmur_MH +from scipy.integrate import quad as integrate + + +short_name = "minhash" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +contents_column_key = "contents_column" +""" This key holds the name of the column storing the contents of each document""" +seed_key = "seed" +""" This key holds the seed used to instantiate the random number generator""" +num_permutations_key = "num_permutations" +""" This key holds the number of permutations that determine how many minhashes to calculate for each document""" +num_bands_key = "num_bands" +""" This key holds the number of bands to use in the banding technique""" +num_minhashes_per_band_key = "num_minhashes_per_band" +""" This key holds the number of minhashes to use in each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_key = "word_shingle_size" +""" This key holds the size of the word shingles calculated for each document""" +num_segments_key = "num_segments" +""" This key holds the number of segments across which we divide the hashing space for each band""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +contents_column_cli_param = f"{cli_prefix}{contents_column_key}" +""" Name of the column storing the contents of each document""" +seed_cli_param = f"{cli_prefix}{seed_key}" +""" The seed used to instantiate the random number generator""" +num_permutations_cli_param = f"{cli_prefix}{num_permutations_key}" +""" Number of permutations that determine how many minhashes to calculate for each document""" +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands to use in the banding technique""" +num_minhashes_per_band_cli_param = f"{cli_prefix}{num_minhashes_per_band_key}" +""" The number of minhashes to use in each band""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_cli_param = f"{cli_prefix}{word_shingle_size_key}" +""" The size of the word shingles calculated for each document""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments across which we divide the hashing space for each band""" + +captured_arg_keys = [ + document_id_column_key, + contents_column_key, + seed_key, + num_bands_key, + num_minhashes_per_band_key, + num_permutations_key, + jaccard_similarity_threshold_key, + word_shingle_size_key, + num_segments_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +contents_column_default = "contents" +""" Default name of the column storing the contents of each document""" +seed_default = 42 +""" 
Default seed used to instantiate the random number generator""" +num_permutations_default = 112 +""" Default number of minhashes used for each document (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_bands_default = 14 +""" Default number of bands to use in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_minhashes_per_band_default = 8 +""" Default number of minhashes to use in each band (from FineWeb https://arxiv.org/pdf/2406.17557)""" +word_shingle_size_default = 5 +""" Default size of the word shingles (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments across which we divide the hashing space for each band""" + + +def _optimal_minhashlsh_param( + threshold: float = jaccard_similarity_threshold_default, + num_perm: int = num_permutations_default, + false_positive_weight: float = 0.5, + false_negative_weight: float = 0.5, +): + """ + Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum + of probabilities of false positive and false negative. + :param threshold: desired similarity threshold + :param num_perm: number of permutations + :param false_positive_weight: importance of avoiding false positive results + :param false_negative_weight: importance of avoiding false negative results + :return: a tuple (optimal number of bands, optimal number of rows) + """ + + def _false_positive_probability(threshold, b, r): + _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) + a, err = integrate(_probability, 0.0, threshold) + return a + + def _false_negative_probability(threshold, b, r): + _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) + a, err = integrate(_probability, threshold, 1.0) + return a + + min_error = float("inf") + opt = (0, 0) + for b in range(1, num_perm + 1): + max_r = int(num_perm / b) + for r in range(1, max_r + 1): + fp = _false_positive_probability(threshold, b, r) + fn = _false_negative_probability(threshold, b, r) + error = fp * false_positive_weight + fn * false_negative_weight + if error < min_error: + min_error = error + opt = (b, r) + return opt + + +class SignatureCalculationTransform(AbstractTableTransform): + """ + This is the first transform of the fuzzy dedup pipeline. First, it calculates, + for each document in a dataset, `num_permutations` minhashes. It accepts as + input the number of bands and the length of each band. If those two parameters + are not specified, then, based on the values of `jaccard_similarity_threshold` + and `num_permutations`, it determines the optimal number of bands, and the + length of each band (how many minhashes will be used to get the signature for + each band). The band signatures, the minhashes and the document lengths are + then saved in the output folder, under a folder structure `bands/band=b/segment=s`. + To improve scalability of the next step of fuzzy dedup, the hash space of + each band is divided into `num_segments` segments. 
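+    The per-band, per-segment layout lets the downstream cluster analysis step load
+    one manageable slice of the signature space at a time.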
+
+    Args:
+        document_id_column: name of the column storing the unique ID assigned to each document
+        contents_column: name of the column storing the contents of each document
+        seed: the seed used to instantiate the random number generator
+        num_permutations: number of minhashes to calculate for each document
+        num_bands: number of bands to use for the banding technique
+        num_minhashes_per_band: number of minhashes to use in each band
+        jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates
+        word_shingle_size: the size of the word shingles calculated for each document
+        num_segments: the number of segments across which we divide the hashing space for each band
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments
+        defined by the companion runtime, SignatureCalculationTransformRuntime.
+        """
+        super().__init__(config)
+        self.document_id_column = config.get(document_id_column_key, document_id_column_default)
+        self.contents_column = config.get(contents_column_key, contents_column_default)
+        self.seed = config.get(seed_key, seed_default)
+        self.num_permutations = config.get(num_permutations_key, num_permutations_default)
+        self.jaccard_similarity_threshold = config.get(
+            jaccard_similarity_threshold_key, jaccard_similarity_threshold_default
+        )
+        self.word_shingle_size = config.get(word_shingle_size_key, word_shingle_size_default)
+        self.num_segments = config.get(num_segments_key, num_segments_default)
+        self.num_bands = config.get(num_bands_key, num_bands_default)
+        self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default)
+        # Calculate optimal parameters for bands calculation
+        # self.num_bands, self.num_rows = _optimal_minhashlsh_param(
+        #     threshold=self.jaccard_similarity_threshold,
+        #     num_perm=self.num_permutations,
+        #     false_positive_weight=0.5,
+        #     false_negative_weight=0.5,
+        # )
+        # use this dataframe to store the minhashes and size for each document
+        self.all_minhashes: pl.DataFrame = None
+        # use this dataframe to store the band hashes for each document
+        self.all_band_hashes: pl.DataFrame = None
+        # this variable keeps track of how many files were processed since last
+        # data write to properly update metadata
+        self.files_processed = 0
+        self.bytes_processed = 0
+        self.data_access = config.get("data_access")
+        self.last_file_name = None
+
+    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
+        """
+        Convert one input table into 0 or more output tables plus a dictionary
+        of execution statistics. This transform computes minhashes and band
+        signatures for every document in the table; the signatures are buffered
+        in memory and persisted by write_band_signatures().
+        """
+        self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}")
+        self.logger.debug("----minhash---")
+        self.last_file_name = file_name
+        self.files_processed += 1
+        self.bytes_processed += table.nbytes
+        # instantiate with the same seed so every worker uses the same hash functions
+        mm_min_hash = Murmur_MH(num_perm=self.num_permutations, seed=self.seed)
+
+        # load the data from the pyarrow table
+        df = pl.from_arrow(table)
+        # read the target columns
+        df = df.select(self.contents_column, self.document_id_column)
+
+        # generate minhash values
+        minhashes = df.map_rows(
+            lambda text: mm_min_hash.minhash2_nosalt(
+                *self._generate_word_shingles(text, window_size=self.word_shingle_size)
+            )
+        )
+        # rename columns, cast minhashes to list(uint32)
+        minhashes = minhashes.select(
+            pl.col("column_2").alias(self.document_id_column),
+            pl.col("column_0").cast(pl.List(pl.UInt32)).alias("minhashes"),
+            pl.col("column_1").alias("document_length"),
+        )
+        # store the minhash calculations to send out at the end of execution
+        if self.all_minhashes is None:
+            self.all_minhashes = minhashes
+        else:
+            self.all_minhashes = self.all_minhashes.vstack(minhashes)
+
+        # Calculate band hashes
+        band_hashes_list = self.process_rows_into_bands(
+            minhashes,
+            self.num_bands,
+            self.num_rows,
+        )
+        band_hash_schema = pl.Schema(
+            {
+                "band_hash": pl.UInt64,
+                "band_index": pl.Int32,
+                self.document_id_column: pl.Int64,
+            }
+        )
+        band_hashes = pl.DataFrame(band_hashes_list, schema=band_hash_schema)
+
+        # store the band hash calculations to send out at the end of execution
+        if self.all_band_hashes is None:
+            self.all_band_hashes = band_hashes
+        else:
+            self.all_band_hashes = self.all_band_hashes.vstack(band_hashes)
+
+        if len(self.all_minhashes) > 750000:
+            tables, metadata = self.write_band_signatures()
+        else:
+            tables = []
+            metadata = {}
+        # return the stats (write_band_signatures() persists results itself, so no tables are returned here)
+        return tables, metadata
+
+    def flush(self) -> tuple[list[pa.Table], dict[str, Any]]:
+        """
+        This is a supporting method for transforms that buffer tables (for example, coalesce).
+        Such transforms can hold buffered tables that were not yet written to the output; flush()
+        is the hook for returning those locally stored tables and their statistics. Most
+        transforms should use the default implementation.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
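+        In this transform, flush() writes any still-buffered minhashes and band signatures
+        via write_band_signatures().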
+ :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be + propagated to metadata + """ + self.logger.info(f"Starting flush()") + if self.all_band_hashes is not None and self.all_minhashes is not None: + tables, metadata = self.write_band_signatures() + else: + tables = [] + metadata = {} + return tables, metadata + + def write_band_signatures(self): + # define the upper and lower bounds of each band segment + segment_bounds_list = [] + upper_bound = np.uint64(np.iinfo(np.uint64).max) + segment_len = np.uint64(upper_bound // self.num_segments) + for segment_index in range(self.num_segments): + segment_bounds_list.append(np.uint64(segment_index) * segment_len) + segment_bounds_list.append(upper_bound) + segment_bounds = np.array(segment_bounds_list, dtype=np.uint64) + self.logger.debug(f"Calculated {len(segment_bounds)} segment_bounds") + # output stats for the metadata + num_tables_written = 0 + num_docs_written = 0 + num_bytes_written = 0 + self.logger.debug(f"dataframe self.all_band_hashes has {len(self.all_band_hashes)} rows") + self.logger.debug(f"dataframe self.all_minhashes has {len(self.all_minhashes)} rows") + # iterate through the bands, get the band hashes for each band, divide + # them into segments, join with minhashes, and upload to storage + for band_ix in range(self.num_bands): + # Filtering on, then dropping the `band_index` column + band_df = self.all_band_hashes.filter(pl.col("band_index") == band_ix).drop("band_index") + # assign each band hash to a segment of the hashing space + self.logger.debug(f"band {band_ix} band_df has {len(band_df)} rows") + for segment_index in range(self.num_segments): + segment_band_df = band_df.filter( + (pl.col("band_hash") > segment_bounds[segment_index]) + & (pl.col("band_hash") <= segment_bounds[segment_index + 1]) + ) + self.logger.debug( + f"band {band_ix} segment {segment_index} segment_band_df has {len(segment_band_df)} rows" + ) + # join the band hash dataframe with the minihash and doc length dataframe + segment_band_minhash_df = segment_band_df.join( + self.all_minhashes, + on=self.document_id_column, + how="inner", + ) + self.logger.debug(f"band {band_ix} segment {segment_index} joined segment_band_df and minhashes") + + # encapsulate document info in a structure + segment_band_minhash_df = segment_band_minhash_df.select( + pl.col("band_hash"), + pl.struct( + [ + pl.col(self.document_id_column), + pl.col("minhashes"), + pl.col("document_length"), + ] + ).alias("document_data"), + ) + self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure") + + # append the table to the result list, and the path to metadata + common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) + last_file_name_path = Path(self.last_file_name) + suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) + save_path = os.path.join( + self.data_access.output_folder, + "bands", + f"band={band_ix}", + f"segment={segment_index}", + suffix_path, + ) + segment_band_minhash_table = segment_band_minhash_df.to_arrow() + bytes_written, _, _ = self.data_access.save_table(save_path, segment_band_minhash_table) + if bytes_written > 0: + num_tables_written += 1 + num_docs_written += segment_band_minhash_table.num_rows + num_bytes_written += bytes_written + self.logger.debug(f"Uploaded table for band {band_ix} and segment {segment_index}") + # add the stats to metadata + metadata = { + "input_files": self.files_processed, + "input_docs": 
len(self.all_minhashes), + "input_bytes": self.bytes_processed, + "output_files": num_tables_written, + "output_docs": num_docs_written, + "output_bytes": num_bytes_written, + } + self.logger.info(f"Wrote {num_tables_written} tables with a total size of {num_bytes_written:,d} bytes") + self.files_processed = 0 + self.bytes_processed = 0 + self.all_minhashes = None + self.all_band_hashes = None + return [], metadata + + # define shingles generation function + def _generate_word_shingles(self, text: str, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]: + words = text[0].split() + document_id = text[1] + doc_len = len(text[0]) + word_count = len(words) + k_shingles = [] + for i in range(0, max(1, word_count - window_size + 1)): + k_shingles.append(delimiter.join(words[i : i + window_size])) + return k_shingles, doc_len, document_id + + def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b: int, r: int, seed: int = 42): + num_minhashes = len(minhashes) + assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" + results = [] + for band_index in range(b): + band_hash, _ = mmh3.hash64( + minhashes[band_index * r : (band_index + 1) * r], + seed=seed, + signed=False, + ) + results.append((band_hash, band_index, int_id_column)) + return results + + # Apply the function + def process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + result = [] + for row in df.iter_rows(): + bands = self.emit_bands( + row[0], # document id + np.array(row[1], dtype=np.uint32), # minhashes + row[2], # document length + minhashlsh_num_bands, + minhashlsh_length_band, + ) + for band in bands: + result.append(band) + return result + + +class SignatureCalculationTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=SignatureCalculationTransform, + remove_from_metadata=[], + ) + from data_processing.utils import get_logger + + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
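+        All arguments below carry the "minhash_" prefix defined by this transform's short name.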
+        """
+        parser.add_argument(
+            f"--{document_id_column_cli_param}",
+            type=str,
+            default=document_id_column_default,
+            help="name of the column storing the unique ID assigned to each document",
+        )
+        parser.add_argument(
+            f"--{contents_column_cli_param}",
+            type=str,
+            default=contents_column_default,
+            help="name of the column storing the contents of each document",
+        )
+        parser.add_argument(
+            f"--{seed_cli_param}",
+            type=int,
+            default=seed_default,
+            help="the seed used to instantiate the random number generator",
+        )
+        parser.add_argument(
+            f"--{num_permutations_cli_param}",
+            type=int,
+            default=num_permutations_default,
+            help="number of permutations (minhashes) calculated for each document",
+        )
+        parser.add_argument(
+            f"--{jaccard_similarity_threshold_cli_param}",
+            type=float,
+            default=jaccard_similarity_threshold_default,
+            help="Jaccard similarity threshold above which two documents are duplicates",
+        )
+        parser.add_argument(
+            f"--{word_shingle_size_cli_param}",
+            type=int,
+            default=word_shingle_size_default,
+            help="the size of the word shingles calculated for each document",
+        )
+        parser.add_argument(
+            f"--{num_bands_cli_param}",
+            type=int,
+            default=num_bands_default,
+            help="the number of bands to use in the banding technique",
+        )
+        parser.add_argument(
+            f"--{num_minhashes_per_band_cli_param}",
+            type=int,
+            default=num_minhashes_per_band_default,
+            help="the number of minhashes to use in each band",
+        )
+        parser.add_argument(
+            f"--{num_segments_cli_param}",
+            type=int,
+            default=num_segments_default,
+            help="the number of segments across which we divide the hashing space for each band",
+        )
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        """
+        Validate and apply the arguments that have been parsed
+        :param args: user defined arguments.
+        :return: True if validation passes, False otherwise
+        """
+        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+        self.params = self.params | captured
+        self.logger.info(f"{short_name} parameters are : {self.params}")
+        return True
diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py
new file mode 100644
index 000000000..5ddc102eb
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py
@@ -0,0 +1,44 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
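For reference, the launcher defined below can be driven programmatically in the same way as the other local scripts in this PR. The sketch here uses placeholder folders; the `minhash_`-prefixed keys mirror the commented-out example in signature_calc_local_python.py:

```python
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from signature_calc_transform_python import SignatureCalculationPythonTransformConfiguration

# placeholder folders -- point these at a real dataset layout
local_conf = {"input_folder": "test-data/data_1", "output_folder": "output"}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "minhash_num_permutations": 112,
    "minhash_num_bands": 14,
    "minhash_num_segments": 2,
}

if __name__ == "__main__":
    # simulate the command line and launch the signature calculation step
    sys.argv = ParamsUtils.dict_to_req(d=params)
    launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration())
    launcher.launch()
```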
+################################################################################ + +import time + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(SignatureCalculationTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() From 7f9b503978c4d7daf9cafc2ae7b448577ca5a7d6 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 11 Oct 2024 10:27:16 -0400 Subject: [PATCH 006/105] Fuzzy dedup spark implementation Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Dockerfile | 54 ++++ transforms/universal/fdedup/spark/Makefile | 45 +++ transforms/universal/fdedup/spark/README.md | 109 ++++++++ .../spark-executor-pod-template.yml | 8 + .../deployment/kubernetes/spark_profile.yml | 14 + .../universal/fdedup/spark/pyproject.toml | 42 +++ .../universal/fdedup/spark/requirements.txt | 10 + .../spark/src/cluster_analysis_spark.py | 33 +++ .../src/cluster_analysis_transform_spark.py | 42 +++ .../fdedup/spark/src/data_cleaning_spark.py | 33 +++ .../src/data_cleaning_transform_spark.py | 102 +++++++ .../fdedup/spark/src/file_copy_util_spark.py | 261 ++++++++++++++++++ .../fdedup/spark/src/fuzzy_dedup_spark.py | 205 ++++++++++++++ .../fdedup/spark/src/requirements.txt | 8 + .../fdedup/spark/src/signature_calc_spark.py | 35 +++ .../src/signature_calc_transform_spark.py | 42 +++ 16 files changed, 1043 insertions(+) create mode 100644 transforms/universal/fdedup/spark/Dockerfile create mode 100644 transforms/universal/fdedup/spark/Makefile create mode 100644 transforms/universal/fdedup/spark/README.md create mode 100644 transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml create mode 100644 transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml create mode 100644 transforms/universal/fdedup/spark/pyproject.toml create mode 100644 transforms/universal/fdedup/spark/requirements.txt create mode 100644 transforms/universal/fdedup/spark/src/cluster_analysis_spark.py create mode 100644 transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/src/data_cleaning_spark.py create mode 100644 transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/src/file_copy_util_spark.py create mode 100644 transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py create mode 100644 transforms/universal/fdedup/spark/src/requirements.txt create mode 100644 transforms/universal/fdedup/spark/src/signature_calc_spark.py create mode 100644 transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py diff --git 
a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile new file mode 100644 index 000000000..523b94c06 --- /dev/null +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -0,0 +1,54 @@ +ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 + +FROM ${BASE_IMAGE} + +# USER root +# install pytest +RUN pip install --no-cache-dir pytest + +WORKDIR ${SPARK_HOME}/work-dir + +# Copy in the data processing framework source/project and install it +# This is expected to be placed in the docker context before this is run (see the make image). +COPY --chown=spark:root data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . +COPY --chown=spark:root data-processing-lib-spark/ data-processing-lib-spark/ +RUN cd data-processing-lib-spark && pip install --no-cache-dir -e . +COPY --chown=spark:root python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . + +# Install project source +COPY --chown=spark:root src/ src/ +COPY --chown=spark:root pyproject.toml pyproject.toml +RUN mkdir -p /opt/spark/work-dir/src/templates && \ + mkdir -p /opt/spark/work-dir/config + +# install requirements from requirements.txt +COPY requirements.txt . +RUN pip3 install -r requirements.txt + +COPY deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ + +RUN pip install --no-cache-dir -e . + +# copy the main() entry point to the image +COPY ./src/signature_calc_spark.py . + +# copy some of the samples in +# COPY src/filter_local_spark.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +USER spark + +# Set environment +ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile new file mode 100644 index 000000000..d30013da8 --- /dev/null +++ b/transforms/universal/fdedup/spark/Makefile @@ -0,0 +1,45 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. +include $(REPOROOT)/transforms/.make.transforms + +# This is included in the image name, if defined +TRANSFORM_NAME=fd-sig-calc + +DOCKER_IMAGE_NAME=pyspark-base +DOCKER_IMAGE_VERSION=latest +DOCKER_FILE=Dockerfile +REGISTRY_HOST=docker.io +REGISTRY_PATH= +DOCKER=docker +PYTHON=python + +venv: requirements.txt + @# Help: Create the virtual environment using requirements.txt + $(PYTHON) -m venv venv + @source venv/bin/activate; \ + pip install --upgrade pip; \ + pip install wheel; \ + pip install -r requirements.txt; + +image:: .transforms.spark-image + +image-direct: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. + @# Help: Create the docker image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + $(DOCKER) build -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -f $(DOCKER_FILE) . + +publish-docker: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. 
+	@# Help: Publish image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) to $(REGISTRY_HOST) container registry
+	$(DOCKER) logout $(REGISTRY_HOST)
+	$(DOCKER) login $(REGISTRY_HOST) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
+	$(DOCKER) push $(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
+
+publish-ibm:
+	ibmcloud login -q -u "$(IBM_CLOUD_USER)" -apikey "$(IBM_CLOUD_API_KEY)"
+	ibmcloud cr login --client docker
+	$(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
+	$(DOCKER) push $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
+	# ibmcloud cr image-list | grep $(DOCKER_IMAGE_NAME)
diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md
new file mode 100644
index 000000000..3bf9b3245
--- /dev/null
+++ b/transforms/universal/fdedup/spark/README.md
@@ -0,0 +1,109 @@
+# Spark-GUF
+
+This is an implementation of Spark data processing modules. At a high level, every Spark application consists of a driver program that runs the user’s main function and executes various parallel operations on a cluster.
+
+The modules can run locally or remotely in a Kubernetes cluster.
+
+## Running Transforms locally
+
+Start in the `spark-guf` directory. To run the modules locally, follow these steps:
+1. Create a virtual environment using this command:
+   ```
+   make venv
+   ```
+2. Activate the virtual environment:
+   ```
+   source venv/bin/activate
+   ```
+3. Set the `PYTHONPATH` environment variable to include the `src` directory:
+   ```
+   export PYTHONPATH=${PYTHONPATH}:${PWD}/src
+   ```
+4. Invoke one of the transforms:
+   ```
+   python src/transforms/spark_pi/spark_transformer_pi.py
+   ```
+5. To find out which arguments a transform takes, run that transform with a `--help` flag:
+   ```
+   python src/transforms/spark_filter/spark_filter_transform.py --help
+   usage: spark_filter_transform.py [-h] --input_folder INPUT_FOLDER --output_folder OUTPUT_FOLDER [--data_type DATA_TYPE]
+                                    --filter_criteria_list FILTER_CRITERIA_LIST [--filter_columns_to_drop FILTER_COLUMNS_TO_DROP]
+                                    [--filter_logical_operator {AND,OR}]
+
+   optional arguments:
+     -h, --help            show this help message and exit
+     --input_folder INPUT_FOLDER
+                           path to read the input files (local fs or s3)
+     --output_folder OUTPUT_FOLDER
+                           path to write the output files (local fs or s3)
+     --data_type DATA_TYPE
+                           Type of files to filter (parquet, orc, csv, json, txt)
+     --filter_criteria_list FILTER_CRITERIA_LIST
+                           list of filter criteria (in SQL WHERE clause format), for example: [ "docq_total_words > 100 AND docq_total_words < 200", "docq_perplex_score < 230", "date_acquired BETWEEN '2023-07-04' AND '2023-07-08'", "title LIKE 'https://%'", "document_id IN ('doc-id-1', 'doc-id-2', 'doc-id-3')" ]
+     --filter_columns_to_drop FILTER_COLUMNS_TO_DROP
+                           list of columns to drop after filtering, for example: ["column1", "column2"]
+     --filter_logical_operator {AND,OR}
+                           logical operator (AND or OR) that joins filter criteria
+   ```
+
+## Running Transforms in Kubernetes/OpenShift
+
+Start in the `spark-guf` directory. To run the transforms in a Kubernetes or OpenShift cluster, follow these steps:
+
+1. Build and push a pyspark base docker image (this example assumes that images are pushed to Docker Hub, but the same approach can be used to push images to icr.io or quay.io):
+   ```
+   docker build -t my-docker-username/my-pyspark:3.5.1 .
+   docker push my-docker-username/my-pyspark:3.5.1
+   ```
+2. Build and push a specific transform image (this will use the pyspark image built in the previous step as the base image):
+   ```
+   docker build -t my-docker-username/my-pyspark-filter:3.5.1 -f src/transforms/spark_filter/Dockerfile --build-arg BASE_IMAGE=my-docker-username/my-pyspark:3.5.1 .
+   docker push my-docker-username/my-pyspark-filter:3.5.1
+   ```
+
+3. Configure the `spark` service account (note that you can use any other service account name, but you will then need to replace `spark` with `your-service-account-name` in all the yaml files listed below). This is a one-time process to perform for each namespace where you want to run spark apps:
+   ```
+   # create 'spark' service account
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-serviceaccount.yaml --namespace=my-namespace
+
+   # create 'spark' role
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role.yaml --namespace=my-namespace
+
+   # bind the 'spark' service account to the 'spark' role
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role-binding.yaml --namespace=my-namespace
+
+   # bind the 'spark' service account to the cluster roles
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-edit-role-binding.yaml --namespace=my-namespace
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-cluster-role-binding.yaml --namespace=my-namespace
+   ```
+
+4. Create any secrets that are needed to access S3 folders used for input or output of the transforms. Follow [this link](https://github.com/aws-samples/machine-learning-using-k8s/blob/master/docs/aws-creds-secret.md) for more information on how to build the S3 secrets.
+
+5. Edit a pod yaml file from the `deployment/kubernetes/pods` directory. The steps below refer to the [yaml file used to build the filter pod](deployment/kubernetes/pods/spark-driver-pod-filter.yaml):
+   1. Give a name to the pod (`metadata/name`), the container launched inside the pod (`spec/containers/name`), and the Spark application (the `APP_NAME` variable in `spec/containers/env`).
+   2. Specify the namespace where the pod will be created (`metadata/namespace`), and use the same namespace for the `EXECUTOR_NAMESPACE` variable in `spec/containers/env`.
+   3. Specify the command to launch the Spark application (in `spec/containers/args`).
+   4. Specify the image used by the driver (`spec/containers/image` - usually this is the transform image built under point 2).
+   5. Specify the image used by the executors (the `EXECUTOR_DOCKER_IMAGE` variable in `spec/containers/env`).
+   6. Specify the service account to use by the driver (`spec/containers/serviceAccount`) and by the executors (the `SERVICE_ACCOUNT` variable in `spec/containers/env`).
+   7. Configure S3:
+      1. Specify the input (`AWS_ENDPOINT_URL_IN`) and output (`AWS_ENDPOINT_URL_OUT`) endpoint URLs.
+      2. Specify the input and output access key IDs and secret access keys.
+
+6. Launch the Spark application by creating the driver pod:
+   ```
+   kubectl apply -f deployment/kubernetes/pods/spark-driver-pod-filter.yaml
+   ```
+
+7. Monitor the creation of the executor pods:
+   ```
+   kubectl get pods -w
+   ```
+
+8. Monitor the driver logs:
+   ```
+   kubectl logs spark-driver-pod-filter -f
+   ```
diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml
new file mode 100644
index 000000000..d9579e0c7
--- /dev/null
+++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Pod
+metadata:
+spec:
+  imagePullSecrets:
+    - name: prod-all-icr-io
+  securityContext:
+    fsGroup: 0
diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml
new file mode 100644
index 000000000..eeddbd694
--- /dev/null
+++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml
@@ -0,0 +1,14 @@
+spark.app.name: ${APP_NAME}
+spark.driver.memory: ${DRIVER_MEMORY}
+spark.executor.instances: ${NUM_EXECUTORS}
+spark.executor.memory: ${EXECUTOR_MEMORY}
+spark.executor.cores: ${EXECUTOR_CORES}
+spark.sql.shuffle.partitions: ${NUM_TASKS}
+spark.task.cpus: ${TASK_CPUS}
+spark.sql.legacy.parquet.nanosAsLong: true
+spark.executor.decommission.forceKillTimeout: "10h"
+# spark.sql.files.ignoreCorruptFiles: true
+# configuration needed when running in kubernetes
+spark.kubernetes.authenticate.driver.serviceAccountName: ${SERVICE_ACCOUNT}
+spark.kubernetes.container.image: ${EXECUTOR_DOCKER_IMAGE}
+spark.kubernetes.namespace: ${EXECUTOR_NAMESPACE}
diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml
new file mode 100644
index 000000000..dcf1f48e2
--- /dev/null
+++ b/transforms/universal/fdedup/spark/pyproject.toml
@@ -0,0 +1,42 @@
+[project]
+name = "dpk_fdedup_transform_spark"
+version = "0.3.0.dev0"
+requires-python = ">=3.10"
+description = "Fuzzy Dedup Spark Transform"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" },
+    { name = "Constantin Adam", email = "cmadam@us.ibm.com" },
+]
+dependencies = [
+    "dpk_fdedup_transform_python==0.3.0.dev0",
+    "data-prep-toolkit-spark==0.2.2.dev0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.0.1",
+]
+
+[options]
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]
diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt
new file mode 100644
index 000000000..10f3e129b
--- /dev/null
+++ b/transforms/universal/fdedup/spark/requirements.txt
@@ -0,0 +1,10 @@
+pyarrow
+pyyaml
+boto3
+kubernetes
+polars
+disjoint-set
+scipy
+numpy
+sentencepiece
+mmh3
diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py
new file mode 100644
index 000000000..83498f59e
--- /dev/null
+++ b/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py
@@ -0,0 +1,33 @@
+# (C) Copyright IBM
Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +if __name__ == "__main__": + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py new file mode 100644 index 000000000..afb8c51b7 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the SparkTransformLauncher. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup signature calculation transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_spark.py new file mode 100644 index 000000000..7b6bd626d --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_spark.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +if __name__ == "__main__": + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py new file mode 100644 index 000000000..03976bac8 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -0,0 +1,102 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+from typing import Any
+
+from data_cleaning_transform import DataCleaningTransformConfiguration
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.transform import TransformStatistics
+from data_processing.utils import get_logger
+from data_processing_spark.runtime.spark import (
+    DefaultSparkTransformRuntime,
+    SparkTransformLauncher,
+    SparkTransformRuntimeConfiguration,
+)
+
+
+logger = get_logger(__name__)
+
+
+class DataCleaningSparkRuntime(DefaultSparkTransformRuntime):
+    """
+    Data cleaning runtime support for Spark
+    """
+
+    def __init__(self, params: dict[str, Any]):
+        super().__init__(params=params)
+        self.logger = get_logger(__name__)
+
+    def get_transform_config(
+        self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
+    ) -> dict[str, Any]:
+        """
+        Download the table of duplicate document ids that will be provided to the
+        filtering/annotation method. This is the opportunity for this runtime to
+        create a new set of configuration based on the config/params provided to
+        this instance's initializer, adding any configuration data that the
+        transform expects in its initializer and/or transform() methods.
+        :param partition: partition number
+        :param data_access_factory - data access factory class being used by the Spark runtime
+        :param statistics - reference to the statistics object
+        :return: dictionary of transform init params
+        """
+        duplicate_list_location = self.params["duplicate_list_location"]
+        data_access = data_access_factory.create_data_access()
+        if duplicate_list_location.startswith("s3://"):
+            _, duplicate_list_location = duplicate_list_location.split("://")
+        self.duplicate_list, retries = data_access.get_file(duplicate_list_location)
+        return self.params | {"df": self.duplicate_list}
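The value stored under the `df` key is the raw parquet content returned by `data_access.get_file()`. A transform receiving this configuration could materialize it along these lines (a sketch only - the actual consumer is the data cleaning transform in `data_cleaning_transform.py`, which is not part of this diff; polars is already listed in `requirements.txt`):

```python
import io

import polars as pl


def load_duplicate_ids(config: dict) -> pl.DataFrame:
    # "df" holds the bytes of docs_to_remove_consolidated.parquet,
    # as downloaded by get_transform_config()/get_bcast_params() above
    return pl.read_parquet(io.BytesIO(config["df"]))
```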
+
+
+class DataCleaningSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+    """
+    Implements the SparkTransformConfiguration for Fuzzy Dedup Data Cleaning
+    as required by the SparkTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(
+            transform_config=DataCleaningTransformConfiguration(),
+            runtime_class=DataCleaningSparkRuntime,
+        )
+
+    def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
+        """
+        Download the table of duplicate document ids that will be provided to the
+        filtering/annotation method, so that it can be broadcast to all the workers.
+        :param data_access_factory - data access factory class being used by the Spark runtime
+        :return: dictionary of parameters to be broadcast
+        """
+        duplicate_list_location = self.transform_config.params["duplicate_list_location"]
+        data_access = data_access_factory.create_data_access()
+        if duplicate_list_location.startswith("s3://"):
+            _, duplicate_list_location = duplicate_list_location.split("://")
+        self.duplicate_list, retries = data_access.get_file(duplicate_list_location)
+        return {"df": self.duplicate_list}
+
+
+if __name__ == "__main__":
+    # create launcher
+    launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration())
+    logger.info("Launching fuzzy dedup data cleaning transform")
+    # Launch the spark worker(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py
new file mode 100644
index 000000000..58a43a736
--- /dev/null
+++ b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py
@@ -0,0 +1,261 @@
+import argparse
+import os
+import socket
+import time
+import traceback
+from datetime import datetime
+
+import polars as pl
+import yaml
+from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
+from data_processing.utils import ParamsUtils, get_logger
+from file_copy_util import FileCopyUtil
+from pyspark.sql import SparkSession
+
+
+logger = get_logger(__name__)
+
+
+class FileCopySpark:
+    def __init__(self, root_folder: str, num_bands: int, num_segments: int, use_s3: bool):
+        self.root_folder = root_folder
+        self.num_bands = num_bands
+        self.num_segments = num_segments
+        self.use_s3 = use_s3
+        self.subdirs = [f"band={b}/segment={s}" for b in range(num_bands) for s in range(num_segments)]
+
+    def _init_spark(self, app_name: str = "copy-app") -> SparkSession:
+        server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1"))
+        if server_port_https == -1:
+            # we are running locally
+            spark_config = {"spark.driver.host": "127.0.0.1"}
+            return SparkSession.builder.appName(app_name).config(map=spark_config).getOrCreate()
+        else:
+            # we are running in Kubernetes, use spark_profile.yml and
+            # environment variables for configuration
+
+            server_port = os.environ["KUBERNETES_SERVICE_PORT"]
+            master_url = f"k8s://https://kubernetes.default:{server_port}"
+
+            # Read Spark configuration profile
+            config_filepath = os.path.abspath(
+                os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml")
+            )
+            with open(config_filepath, "r") as config_fp:
+                spark_config = yaml.safe_load(os.path.expandvars(config_fp.read()))
+            spark_config["spark.submit.deployMode"] = "client"
+
+            # configure the executor pods from template
+            executor_pod_template_file = os.path.join(
+                os.getenv("SPARK_HOME"),
+                "work-dir",
+                "src",
+                "templates",
+                "spark-executor-pod-template.yml",
+            )
+            spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file
+            spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always"
+
+            # Pass the driver IP address to the workers for callback
+            myservice_url = socket.gethostbyname(socket.gethostname())
+            spark_config["spark.driver.host"] = myservice_url
+            spark_config["spark.driver.bindAddress"] = "0.0.0.0"
+
+            spark_config["spark.decommission.enabled"] = True
+            logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}")
+            app_name = spark_config.get("spark.app.name", "my-spark-app")
+            return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate()
+
+    def create_data_access_factory(self, root_folder: str, use_s3: bool) -> DataAccessFactoryBase:
+        input_folder = root_folder
+        output_folder = root_folder
+        data_access_factory: DataAccessFactoryBase = DataAccessFactory()
+        daf_args = []
+        if use_s3:
+            s3_creds = {
+                "access_key": os.getenv("AWS_ACCESS_KEY_ID"),
+                "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+                "url": os.getenv("AWS_ENDPOINT_URL"),
+            }
+            s3_config = {
+                "input_folder": root_folder,
+                "output_folder": root_folder,
+            }
+            daf_args.append("--data_s3_cred")
+            daf_args.append(ParamsUtils.convert_to_ast(s3_creds))
+            daf_args.append("--data_s3_config")
+            daf_args.append(ParamsUtils.convert_to_ast(s3_config))
+        else:
+            local_config = {
+                "input_folder": root_folder,
+                "output_folder": os.path.join(root_folder, "bands_consolidated"),
+            }
+            daf_args.append("--data_local_config")
+            daf_args.append(ParamsUtils.convert_to_ast(local_config))
+        daf_parser = argparse.ArgumentParser()
+        data_access_factory.add_input_params(parser=daf_parser)
+        data_access_factory_args = daf_parser.parse_args(args=daf_args)
+        data_access_factory.apply_input_params(args=data_access_factory_args)
+
+        return data_access_factory
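`orchestrate()` below fans `FileCopyUtil.copy_data()` calls out over the band/segment subfolders. The same utility can also be driven directly, without Spark, mirroring the calls made later in `fuzzy_dedup_spark.py` (a sketch; folder names are illustrative):

```python
from file_copy_util import FileCopyUtil

# build the factory exactly as the driver code does
fcs = FileCopySpark(root_folder="output", num_bands=14, num_segments=2, use_s3=False)
data_access_factory = fcs.create_data_access_factory("output", use_s3=False)

# consolidate the docs_to_remove folder in-process
stats: dict = {}
fcu = FileCopyUtil(data_access_factory=data_access_factory, config={"root_folder": "output"}, stats=stats)
fcu.copy_data(subfolder_name="docs_to_remove", data_type="docs_to_remove")
```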
Using {num_partitions} partitions") + stats_rdd = source_rdd.zipWithIndex().mapPartitions(process_partition) + # build overall statistics + stats = dict(stats_rdd.reduceByKey(lambda a, b: a + b).collect()) + return_code = 0 + status = "success" + except Exception as e: + # process execution exception + logger.error(f"Exception during execution {e}: {traceback.print_exc()}") + return_code = 1 + status = "failure" + stats = {} + try: + # build and save metadata + logger.debug("Building job metadata") + input_params = runtime_config + # input_params = runtime_config.get_transform_metadata() | execution_config.get_input_params() + metadata = { + "job details": { + "start_time": start_ts, + "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "status": status, + }, + "job_input_params": input_params | data_access_factory.get_input_params(), + "execution_stats": { + "num partitions": num_partitions, + "execution time, min": (time.time() - start_time) / 60, + }, + "job_output_stats": stats, + } + logger.debug(f"Saving job metadata: {metadata}.") + + if data_access_factory.s3_config is not None: + _, root_folder = self.root_folder.split("://") + in_path = os.path.join(root_folder, "bands") + out_path = os.path.join(root_folder, "bands_consolidated") + data_access.input_folder = f"{in_path}{os.sep}" + data_access.output_folder = f"{out_path}{os.sep}" + else: + data_access.input_folder = os.path.join(self.root_folder, "bands") + data_access.output_folder = os.path.join(self.root_folder, "bands_consolidated") + data_access.save_job_metadata(metadata) + logger.debug("Saved job metadata.") + return return_code + except Exception as e: + logger.error(f"Exception during execution {e}: {traceback.print_exc()}") + return 1 + finally: + # stop spark context at the end. Required for running multiple tests + spark_session.stop() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--root_folder", + type=str, + default="/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_second/", + help="root folder", + ) + parser.add_argument( + "--num_bands", + type=int, + default=14, + help="number of bands", + ) + parser.add_argument( + "--num_segments", + type=int, + default=2, + help="number of segments", + ) + parser.add_argument( + "--data_type", + type=str, + default="docs_to_remove", + help="bands or doc2remove", + ) + parser.add_argument( + "--parallelization", + type=int, + default=-1, + help="spark parallelization", + ) + parser.add_argument( + "--use_s3", + type=bool, + default=False, + help="use s3", + ) + args = parser.parse_args() + fcs = FileCopySpark(args.root_folder, args.num_bands, args.num_segments, args.use_s3) + data_access_factory = fcs.create_data_access_factory(args.root_folder, args.use_s3) + app_config = {"root_folder": args.root_folder} + execution_config = {"parallelization": args.parallelization} if args.parallelization > 0 else {} + status = fcs.orchestrate(app_config, execution_config, data_access_factory, args.data_type) + print(f"Orchestrate concluded with status {status}") diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py new file mode 100644 index 000000000..6d0e090e4 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -0,0 +1,205 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import argparse +import logging +import os +import sys +from typing import Union + +import polars as pl +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from file_copy_util import FileCopyUtil +from file_copy_util_spark import FileCopySpark +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +args_map = { + "minhash": [ + "document_id_column", + "contents_column", + "seed", + "num_permutations", + "num_bands", + "num_minhashes_per_band", + "jaccard_similarity_threshold", + "word_shingle_size", + "num_segments", + ], + "copyutil": [ + "subfolder_name", + "data_type", + "num_bands", + "num_segments", + "parallelization", + "use_s3", + ], + "cluster": [ + "jaccard_similarity_threshold", + ], + "fdclean": [ + "document_id_column", + "duplicate_list_location", + ], +} + + +def get_arguments(in_args: argparse.Namespace, module_name: str) -> Union[list, dict]: + sys_argv = ["python"] + in_args_dict = vars(in_args) + if in_args.use_s3: + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + all_module_arguments = args_map.get(module_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + if module_name == "copyutil": + copy_util_config = {k: v for k, v in passed_args.items()} + copy_util_config["root_folder"] = in_args_dict["output_folder"] + return copy_util_config + else: + for k, v in passed_args.items(): + sys_argv.append(f"--{module_name}_{k}") + sys_argv.append(str(v)) + if module_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"]) + elif module_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands_consolidated") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif module_name == "fdclean": + if f"--{module_name}_duplicate_list_location" not in sys_argv: + sys_argv.append(f"--{module_name}_duplicate_list_location") + sys_argv.append( + os.path.join( + in_args_dict["output_folder"], + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + else: + logging.error(f"Unknown module name: {module_name}") + data_io = { + "input_folder": input_folder, + "output_folder": output_folder, + } + if in_args.use_s3: + sys_argv.append("--data_s3_config") + else: + 
sys_argv.append("--data_local_config") + sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + return sys_argv + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_folder", type=str, required=True, help="path to read the input files") + parser.add_argument("--output_folder", type=str, required=True, help="path to write the output files") + parser.add_argument( + "--use_s3", type=bool, required=False, default=False, help="if true, use S3, if false use local FS" + ) + parser.add_argument( + "--contents_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument( + "--document_id_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument( + "--num_permutations", type=int, required=True, help="number of permutations to use for minhash calculation" + ) + parser.add_argument( + "--num_bands", type=int, required=True, help="number of bands to use for band hash calculation" + ) + parser.add_argument( + "--num_minhashes_per_band", type=int, required=True, help="number of minhashes to use in each band" + ) + parser.add_argument( + "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" + ) + parser.add_argument( + "--jaccard_similarity_threshold", + type=float, + required=False, + help="jaccard similarity threshold above which two documents are similar", + ) + parser.add_argument( + "--num_segments", + type=int, + required=True, + help="number of segments to divide each band hash interval (to improve scalability)", + ) + parser.add_argument("--parallelization", type=int, required=False, default=-1, help="spark parallelization") + parser.add_argument( + "--duplicate_list_location", + type=str, + required=False, + help="path to the file with all the duplicate document ids", + ) + return parser.parse_args() + + +if __name__ == "__main__": + # configure logging + logging.basicConfig( + format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + args = parse_arguments() + sys.argv = get_arguments(args, "minhash") + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Signature calculation concluded with status {status}") + + fcs_config = get_arguments(args, "copyutil") + + root_folder = fcs_config["root_folder"] + parallelization = fcs_config["parallelization"] + fcs = FileCopySpark(root_folder, fcs_config["num_bands"], fcs_config["num_segments"], args.use_s3) + data_access_factory = fcs.create_data_access_factory(root_folder, args.use_s3) + app_config = {"root_folder": root_folder} + execution_config = {"parallelization": parallelization} if parallelization > 0 else {} + status = fcs.orchestrate(app_config, execution_config, data_access_factory, data_type="bands") + logging.info(f"Consolidate bands concluded with status {status}") + + sys.argv = get_arguments(args, "cluster") + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Cluster analysis concluded with status {status}") + + stats = {} + fcu_config = get_arguments(args, 
"copyutil") + fcu = FileCopyUtil(data_access_factory=data_access_factory, config=fcu_config, stats=stats) + fcu.copy_data(subfolder_name="docs_to_remove", data_type="docs_to_remove") + + sys.argv = get_arguments(args, "fdclean") + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Data cleanup concluded with status {status}") diff --git a/transforms/universal/fdedup/spark/src/requirements.txt b/transforms/universal/fdedup/spark/src/requirements.txt new file mode 100644 index 000000000..c1a1f2c3d --- /dev/null +++ b/transforms/universal/fdedup/spark/src/requirements.txt @@ -0,0 +1,8 @@ +pyspark +pyarrow +pyyaml +boto3 +kubernetes +disjoint_set +mmh3 +scipy diff --git a/transforms/universal/fdedup/spark/src/signature_calc_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_spark.py new file mode 100644 index 000000000..0e7046549 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_spark.py @@ -0,0 +1,35 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +if __name__ == "__main__": + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py new file mode 100644 index 000000000..4e39810c6 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+from data_processing.utils import get_logger
+from data_processing_spark.runtime.spark import (
+    SparkTransformLauncher,
+    SparkTransformRuntimeConfiguration,
+)
+from signature_calc_transform import SignatureCalculationTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SignatureCalculationSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+    """
+    Implements the SparkTransformConfiguration for Fuzzy Dedup Signature Calculation
+    as required by the SparkTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=SignatureCalculationTransformConfiguration())
+
+
+if __name__ == "__main__":
+    # create launcher
+    launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration())
+    logger.info("Launching fuzzy dedup signature calculation transform")
+    # Launch the spark worker(s) to process the input
+    launcher.launch()

From 3349521bdfe3b1d95d8160cf442b722988c344be Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 10 Oct 2024 19:05:39 +0100
Subject: [PATCH 007/105] added folder_transform

---
 .../pure_python/transform_file_processor.py   | 15 ++++--
 .../pure_python/transform_orchestrator.py     | 42 ++++++++++------
 .../runtime/transform_file_processor.py       | 41 ++++++++-------
 .../src/data_processing/transform/__init__.py |  2 +
 .../transform/abstract_transform.py           | 16 ++++++
 .../transform/binary_transform.py             |  5 +-
 .../transform/folder_transform.py             | 50 +++++++++++++++++++
 .../runtime/ray/transform_file_processor.py   |  1 +
 .../runtime/ray/transform_orchestrator.py     | 19 ++++---
 .../runtime/spark/transform_file_processor.py |  5 +-
 .../runtime/spark/transform_orchestrator.py   | 25 +++++++---
 11 files changed, 168 insertions(+), 53 deletions(-)
 create mode 100644 data-processing-lib/python/src/data_processing/transform/abstract_transform.py
 create mode 100644 data-processing-lib/python/src/data_processing/transform/folder_transform.py

diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
index 143835dd0..fa3e69e4a 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
@@ -14,7 +14,7 @@
 
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.runtime import AbstractTransformFileProcessor
-from data_processing.transform import AbstractBinaryTransform, TransformStatistics
+from data_processing.transform import AbstractTransform, TransformStatistics
 from data_processing.utils import UnrecoverableException
 
 
@@ -28,7 +28,8 @@ def __init__(
         data_access_factory: DataAccessFactoryBase,
         statistics: TransformStatistics,
         transform_params: dict[str, Any],
-        transform_class: type[AbstractBinaryTransform],
+        transform_class: type[AbstractTransform],
+        is_folder: bool,
     ):
         """
         Init method
@@ -36,11 +37,13 @@ def __init__(
         :param statistics - reference to statistics class
         :param transform_params - transform parameters
         :param transform_class: transform class
+        :param is_folder: folder transform flag
         """
         # invoke superclass
         super().__init__(
             data_access_factory=data_access_factory,
             transform_parameters=dict(transform_params),
+            is_folder=is_folder,
         )
         self.transform_params["statistics"] = statistics
# Create local processor @@ -52,7 +55,8 @@ def __init__( # Create statistics self.stats = statistics - def _publish_stats(self, stats: dict[str, Any]) -> None: + +def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) @@ -65,17 +69,20 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool ): """ Init method :param data_access_factory - data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder tranform flag """ super().__init__( data_access_factory=data_access_factory, transform_parameters=dict(transform_params), + is_folder=is_folder, ) # Add data access and statistics to the processor parameters self.transform_params["data_access"] = self.data_access diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 8692da29e..153eaaf0a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,7 +24,7 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger @@ -48,8 +48,6 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } - - def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -74,15 +72,21 @@ def orchestrate( return 1 # create additional execution parameters runtime = runtime_config.create_transform_runtime() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - if retries > 0: - statistics.add_stats({"data access retries": retries}) - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + if retries > 0: + statistics.add_stats({"data access retries": retries}) + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -99,6 +103,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) else: # using sequential execution @@ -111,6 +116,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) status = "success" return_code = 0 @@ -157,7 +163,8 @@ def _process_transforms( data_access_factory: 
DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ) -> None: """ Process transforms sequentially @@ -167,9 +174,8 @@ def _process_transforms( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag :return: metadata for the execution - - :return: None """ # create executor executor = PythonTransformFileProcessor( @@ -177,6 +183,7 @@ def _process_transforms( statistics=statistics, transform_params=transform_params, transform_class=transform_class, + is_folder=is_folder, ) # process data t_start = time.time() @@ -203,6 +210,7 @@ def _process_transforms_multiprocessor( data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractBinaryTransform], + is_folder: bool ) -> TransformStatistics: """ Process transforms using multiprocessing pool @@ -212,13 +220,17 @@ def _process_transforms_multiprocessor( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform class :return: metadata for the execution """ # result statistics statistics = TransformStatistics() # create processor processor = PythonPoolTransformFileProcessor( - data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class + data_access_factory=data_access_factory, + transform_params=transform_params, + transform_class=transform_class, + is_folder=is_folder, ) completed = 0 t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index d4ec548d8..1d268875f 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -26,11 +26,13 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_parameters: dict[str, Any], + is_folder: bool = False, ): """ Init method :param data_access_factory: Data Access Factory :param transform_parameters: Transform parameters + :param is_folder: folder transform flag """ self.logger = get_logger(__name__) # validate parameters @@ -46,6 +48,7 @@ def __init__( # Add data access and statistics to the processor parameters self.transform_params = transform_parameters self.transform_params["data_access"] = self.data_access + self.is_folder = is_folder def process_file(self, f_name: str) -> None: """ @@ -58,25 +61,29 @@ def process_file(self, f_name: str) -> None: self.logger.warning("No data_access found. Returning.") return t_start = time.time() - # Read source file - filedata, retries = self.data_access.get_file(path=f_name) - if retries > 0: - self._publish_stats({"data access retries": retries}) - if filedata is None: - self.logger.warning(f"File read resulted in None for {f_name}. 
Returning.") - self._publish_stats({"failed_reads": 1}) - return - self._publish_stats({"source_files": 1, "source_size": len(filedata)}) + if not self.is_folder: + # Read source file only if we are processing file + filedata, retries = self.data_access.get_file(path=f_name) + if retries > 0: + self._publish_stats({"data access retries": retries}) + if filedata is None: + self.logger.warning(f"File read resulted in None for {f_name}. Returning.") + self._publish_stats({"failed_reads": 1}) + return + self._publish_stats({"source_files": 1, "source_size": len(filedata)}) # Process input file try: - # execute local processing - name_extension = TransformUtils.get_file_extension(f_name) self.logger.debug(f"Begin transforming file {f_name}") - out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + if not self.is_folder: + # execute local processing + out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + name_extension = TransformUtils.get_file_extension(f_name) + self.last_file_name = name_extension[0] + self.last_file_name_next_index = None + self.last_extension = name_extension[1] + else: + out_files, stats = self.transform.transform(folder_name=f_name) self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") - self.last_file_name = name_extension[0] - self.last_file_name_next_index = None - self.last_extension = name_extension[1] # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) # Process unrecoverable exceptions @@ -95,10 +102,10 @@ def flush(self) -> None: the hook for them to return back locally stored data and their statistics. :return: None """ - if self.last_file_name is None: + if self.last_file_name is None or self.is_folder: # for some reason a given worker never processed anything. Happens in testing # when the amount of workers is greater than the amount of files - self.logger.debug("skipping flush, no name for file is defined") + self.logger.debug("skipping flush, no name for file is defined or this is a folder transform") return try: t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 6af43ad60..20254e47b 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -1,3 +1,5 @@ +from data_processing.transform.abstract_transform import AbstractTransform +from data_processing.transform.folder_transform import AbstractFolderTransform from data_processing.transform.binary_transform import AbstractBinaryTransform from data_processing.transform.table_transform import AbstractTableTransform from data_processing.transform.transform_statistics import TransformStatistics diff --git a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py new file mode 100644 index 000000000..89db70f42 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py @@ -0,0 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +class AbstractTransform: + """ + Base class for all transform types + """ \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/binary_transform.py b/data-processing-lib/python/src/data_processing/transform/binary_transform.py index 80dff61ea..b313aff2f 100644 --- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py @@ -10,10 +10,11 @@ # limitations under the License. ################################################################################ -from typing import Any, TypeVar +from typing import Any +from data_processing.transform import AbstractTransform -class AbstractBinaryTransform: +class AbstractBinaryTransform(AbstractTransform): """ Converts input binary file to output file(s) (binary) Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py new file mode 100644 index 000000000..866e3286f --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any +from data_processing.data_access import data_access +from data_processing.transform import AbstractTransform + + +class AbstractFolderTransform(AbstractTransform): + """ + Converts input folder to output file(s) (binary) + Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or + more new binary files and metadata. + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This simply stores the given instance in this instance for later use. + """ + self.config = config + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + """ + Converts input folder into o or more output files. + If there is an error, an exception must be raised - exit()ing is not generally allowed. + :param folder_name: the name of the folder containing arbitrary amount of files. + :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated + to metadata. 
Each element of the return list, is a tuple of the transformed bytes and a string + holding the extension to be used when writing out the new bytes. + """ + raise NotImplemented() + + @staticmethod + def get_folders(data_access:data_access) -> list(str): + """ + Compute the list of folders to use. + :param data_access - data access class + :return: + """ + raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py index e1fabb144..cdad1309f 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py @@ -35,6 +35,7 @@ def __init__(self, params: dict[str, Any]): super().__init__( data_access_factory=params.get("data_access_factory", None), transform_parameters=dict(params.get("transform_params", {})), + is_folder=params.get("is_folder", False) ) # Create statistics self.stats = params.get("statistics", None) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 42eba47a6..8276eb56c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -16,6 +16,7 @@ import ray from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import AbstractFolderTransform from data_processing_ray.runtime.ray import ( RayTransformExecutionConfiguration, RayTransformFileProcessor, @@ -56,13 +57,18 @@ def orchestrate( # create transformer runtime runtime = runtime_config.create_transform_runtime() resources = RayUtils.get_cluster_resources() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -84,6 +90,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), "statistics": statistics, + "is_folder": is_folder, } logger.debug("Creating actors") processors = RayUtils.create_actors( diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index d63664ac4..a0968ab1d 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -29,12 +29,15 @@ def __init__( data_access_factory: DataAccessFactoryBase, runtime_configuration: 
SparkTransformRuntimeConfiguration, statistics: TransformStatistics, + is_folder: bool, ): """ Init method """ super().__init__( - data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params() + data_access_factory=data_access_factory, + transform_parameters=runtime_configuration.get_transform_params(), + is_folder=is_folder, ) # Add data access ant statistics to the processor parameters self.runtime_configuration = runtime_configuration diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c279f2b73..c534b685f 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -18,7 +18,7 @@ import yaml from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics +from data_processing.transform import TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger from data_processing_spark.runtime.spark import ( SparkTransformExecutionConfiguration, @@ -117,7 +117,10 @@ def process_partition(iterator): runtime = runtime_conf.create_transform_runtime() # create file processor file_processor = SparkTransformFileProcessor( - data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics + data_access_factory=d_access_factory, + runtime_configuration=runtime_conf, + statistics=statistics, + is_folder=is_folder, ) first = True for f in iterator: @@ -144,13 +147,19 @@ def process_partition(iterator): return list(statistics.get_execution_stats().items()) num_partitions = 0 + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # process data logger.debug("Begin processing files") # process files split by partitions From 0553edf9d5a6d9507a470927b14f5c65b7ec8773 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 19:13:01 +0100 Subject: [PATCH 008/105] added folder_transform --- .../runtime/pure_python/transform_orchestrator.py | 2 +- .../python/src/data_processing/transform/folder_transform.py | 4 ++-- .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +- .../runtime/spark/transform_orchestrator.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 153eaaf0a..d51f80a8a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ 
b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. - :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c534b685f..4a0897952 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From a53412ecb5a00535dd85c56939c2d2fa4542c14a Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 009/105] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- .../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- 
a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def _publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: 
DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. - :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From 9c3ace785b9a529e047df93ed9e65d27bf3d7ba0 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 08:48:00 +0100 Subject: [PATCH 010/105] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 
4a0897952..096fab272 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,8 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index 7b968b1e9..7410d09d1 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From 7091a2e6087c77d5b204c803917f97b60d974310 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 15:35:00 +0100 Subject: [PATCH 011/105] added noop testing --- .../runtime/transform_file_processor.py | 44 +++++--- .../test_support/transform/__init__.py | 13 ++- .../transform/noop_folder_transform.py | 105 ++++++++++++++++++ .../test_support/transform/noop_transform.py | 6 +- .../transform/folder_transform.py | 2 +- .../transform/transform_configuration.py | 6 +- .../transform/test_folders_noop.py | 33 ++++++ .../launch/ray/ray_test_noop_launch.py | 6 - .../ededup/ray/src/ededup_transform_ray.py | 9 +- 9 files changed, 187 insertions(+), 37 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index 1d268875f..4075f40be 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -83,6 +83,7 @@ def process_file(self, f_name: str) -> None: self.last_extension = name_extension[1] else: out_files, stats = self.transform.transform(folder_name=f_name) + self.last_file_name = f_name self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) @@ -148,15 +149,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats ) case 1: # we have exactly 1 output file - file_ext = out_files[0] - lfn = self.last_file_name - if 
self.last_file_name_next_index is not None:
-                    lfn = f"{lfn}_{self.last_file_name_next_index}"
-                    output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
+                if self.is_folder:
+                    # it's a folder
+                    output_name = out_files[0][1]
+                    dt = out_files[0][0]
+                else:
+                    file_ext = out_files[0]
+                    lfn = self.last_file_name
+                    if self.last_file_name_next_index is not None:
+                        lfn = f"{lfn}_{self.last_file_name_next_index}"
+                    output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
+                    dt = file_ext[0]
                 self.logger.debug(
                     f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
                 )
-                save_res, retries = self.data_access.save_file(path=output_name, data=file_ext[0])
+                save_res, retries = self.data_access.save_file(path=output_name, data=dt)
                 if retries > 0:
                     self._publish_stats({"data access retries": retries})
                 if save_res is None:
@@ -166,7 +173,7 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
                 self._publish_stats(
                     {
                         "result_files": 1,
-                        "result_size": len(file_ext[0]),
+                        "result_size": len(dt),
                         "processing_time": time.time() - t_start,
                     }
                 )
@@ -183,14 +190,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
                 start_index = 0
                 count = len(out_files)
                 for index in range(count):
-                    file_ext = out_files[index]
-                    output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
-                    file_sizes += len(file_ext[0])
-                    self.logger.debug(
-                        f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
-                        f"of {count} to {output_name_indexed}"
-                    )
-                    save_res, retries = self.data_access.save_file(path=output_name_indexed, data=file_ext[0])
+                    if self.is_folder:
+                        # it's a folder
+                        output_name_indexed = out_files[index][1]
+                        dt = out_files[index][0]
+                    else:
+                        # files
+                        file_ext = out_files[index]
+                        output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
+                        self.logger.debug(
+                            f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
+                            f"of {count} to {output_name_indexed}"
+                        )
+                        dt = file_ext[0]
+                    file_sizes += len(dt)
+                    save_res, retries = self.data_access.save_file(path=output_name_indexed, data=dt)
                     if retries > 0:
                         self._publish_stats({"data access retries": retries})
                     if save_res is None:
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
index 0e90f7ffd..04d6f3b0f 100644
--- a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
@@ -1,6 +1,11 @@
-from .table_transform_test import AbstractTableTransformTest
-from .binary_transform_test import AbstractBinaryTransformTest
-from .noop_transform import (
+from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest
+from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
+from data_processing.test_support.transform.noop_transform import (
     NOOPTransform,
-    NOOPPythonTransformConfiguration,
+    NOOPTransformConfiguration,
+    NOOPPythonTransformConfiguration
 )
+from data_processing.test_support.transform.noop_folder_transform import (
+    NOOPFolderTransform,
+    NOOPFolderPythonTransformConfiguration
+)
\ No newline at end of file
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py
new file mode 100644
index 000000000..5baab7858
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py
@@ -0,0 +1,104 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import time
+from typing import Any
+
+from data_processing.data_access import DataAccess
+from data_processing.runtime.pure_python import (
+    PythonTransformLauncher,
+    PythonTransformRuntimeConfiguration,
+    DefaultPythonTransformRuntime)
+from data_processing.transform import AbstractFolderTransform
+from data_processing.utils import get_logger
+from data_processing.test_support.transform import NOOPTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class NOOPFolderTransform(AbstractFolderTransform):
+    """
+    Implements a simple copy of the files in a folder.
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments
+        defined by the companion runtime, NOOPFolderPythonRuntime.
+        """
+        # Make sure that the param name corresponds to the name used in apply_input_params method
+        # of NOOPTransformConfiguration class
+        super().__init__(config)
+        self.sleep = config.get("sleep_sec", 1)
+        self.data_access = config.get("data_access")
+
+    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        Converts input folder into 0 or more output files.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed.
+        :param folder_name: the name of the folder containing an arbitrary number of files.
+        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
+                 to metadata. Each element of the return list is a tuple of the transformed bytes and a string
+                 holding the file name to use.
+        """
+        logger.debug(f"Transforming one folder {folder_name}")
+        metadata = {}
+        # get folder files
+        files, retries = self.data_access.get_folder_files(path=folder_name)
+        if retries > 0:
+            metadata |= {"data access retries": retries}
+        result = [()] * len(files)
+        index = 0
+        for name, file in files.items():
+            result[index] = (file, self.data_access.get_output_location(name))
+            if self.sleep is not None:
+                logger.info(f"Sleep for {self.sleep} seconds")
+                time.sleep(self.sleep)
+                logger.info("Sleep completed - continue")
+            index += 1
+        # Add some sample metadata.
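+        # The statistics dictionary returned here is propagated into the job
+        # metadata, so "nfiles" records how many files this folder contributed.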
+ metadata |= {"nfiles": len(files)} + return result, metadata + + +class NOOPFolderPythonRuntime(DefaultPythonTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py index 0dee013a4..2fea35506 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python.runtime_configuration import ( PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform from data_processing.utils import CLIArgumentProvider, get_logger @@ -75,10 +75,10 @@ class NOOPTransformConfiguration(TransformConfiguration): configuration with CLI args. """ - def __init__(self): + def __init__(self, clazz: type[AbstractTransform] = NOOPTransform): super().__init__( name=short_name, - transform_class=NOOPTransform, + transform_class=clazz, remove_from_metadata=[pwd_key], ) diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 9a2fb3713..caa3bfa52 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -35,6 +35,6 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str :param folder_name: the name of the folder containing arbitrary amount of files. :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated to metadata. Each element of the return list, is a tuple of the transformed bytes and a string - holding the extension to be used when writing out the new bytes. + holding the file name to use. 
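+                 For example, a pass-through implementation can return one
+                 (bytes, data_access.get_output_location(name)) pair per input
+                 file, as the NOOPFolderTransform in test_support does.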
""" raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py index 033e92f2a..a5c9ec9ad 100644 --- a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py @@ -13,7 +13,7 @@ from argparse import ArgumentParser from typing import Any -from data_processing.transform import AbstractBinaryTransform +from data_processing.transform import AbstractTransform from data_processing.utils import CLIArgumentProvider @@ -23,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider): """ def __init__( - self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = [] + self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = [] ): """ Initialization @@ -36,7 +36,7 @@ def __init__( self.remove_from_metadata = remove_from_metadata self.params = {} - def get_transform_class(self) -> type[AbstractBinaryTransform]: + def get_transform_class(self) -> type[AbstractTransform]: """ Get the class extending AbstractBinaryTransform which implements a specific transformation. The class will generally be instantiated with a dictionary of configuration produced by diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py new file mode 100644 index 000000000..e0fdd86c8 --- /dev/null +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.transform import NOOPFolderPythonTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../test-data/data_processing/python/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")] + return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py index d4cc874f0..e706a4dfa 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py @@ -12,7 +12,6 @@ import os -import pyarrow as pa from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -20,11 +19,6 @@ from data_processing_ray.test_support.transform import NOOPRayTransformConfiguration -table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) -expected_table = table # We're a noop after all. -expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result - - class TestRayNOOPTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. diff --git a/transforms/universal/ededup/ray/src/ededup_transform_ray.py b/transforms/universal/ededup/ray/src/ededup_transform_ray.py index c0823a22e..d90dfa780 100644 --- a/transforms/universal/ededup/ray/src/ededup_transform_ray.py +++ b/transforms/universal/ededup/ray/src/ededup_transform_ray.py @@ -149,13 +149,12 @@ def _load_snapshots(self, data_access_factory: DataAccessFactoryBase, statistics statistics.add_stats.remote({"data access retries": retries}) self.logger.info(f"Found the following snapshot files {files.keys()}") # process snapshot files - for file in files.keys(): - # load the file + for file in files.values(): + # convert the file try: - b_hashes, _ = data_access.get_file(file) - snaps = pickle.loads(b_hashes) + snaps = pickle.loads(file) except Exception as e: - self.logger.warning(f"Failed to load hashes from file {file} with exception {e}") + self.logger.warning(f"Failed to load hashes with exception {e}") raise UnrecoverableException("failed to load hashes") request = [[] for _ in range(len(self.filters))] for h in snaps: From 680c78ac3f38724dfcf646673aae2ac3661107be Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 11 Oct 2024 10:47:42 -0400 Subject: [PATCH 012/105] Fuzzy dedup ray implementation Signed-off-by: nelson --- .../universal/fdedup/ray/pyproject.toml | 10 +- .../ray/src/cluster_analysis_local_ray.py | 51 ++ .../ray/src/cluster_analysis_transform_ray.py | 42 + .../fdedup/ray/src/compute_shingles.py | 50 -- ...ocal_ray.py => data_cleaning_local_ray.py} | 61 +- .../ray/src/data_cleaning_transform_ray.py | 120 +++ .../universal/fdedup/ray/src/fdedup_s3_ray.py | 76 -- .../fdedup/ray/src/fdedup_support.py | 621 -------------- .../fdedup/ray/src/fdedup_transform_ray.py | 803 ------------------ .../ray/src/signature_calc_local_ray.py | 54 ++ .../ray/src/signature_calc_transform_ray.py | 42 + 11 files changed, 340 insertions(+), 1590 deletions(-) create mode 100644 transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py create mode 100644 transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py delete mode 100644 
transforms/universal/fdedup/ray/src/compute_shingles.py rename transforms/universal/fdedup/ray/src/{fdedup_local_ray.py => data_cleaning_local_ray.py} (59%) create mode 100644 transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_s3_ray.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_support.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/src/signature_calc_local_ray.py create mode 100644 transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 3f2c8ba51..e2a2d34c9 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,20 +1,18 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2.dev0" +version = "0.3.0.dev0" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ + "dpk_fdedup_transform_python==0.3.0.dev0", "data-prep-toolkit-ray==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", "tqdm==4.66.3", - "scipy==1.12.0" ] [build-system] diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py new file mode 100644 index 000000000..25b96788d --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -0,0 +1,51 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 3,
+    # execution info
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+}
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
new file mode 100644
index 000000000..970686e13
--- /dev/null
+++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
@@ -0,0 +1,41 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from cluster_analysis_transform import ClusterAnalysisTransformConfiguration
+from data_processing.utils import CLIArgumentProvider, get_logger
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from data_processing_ray.runtime.ray.runtime_configuration import (
+    RayTransformRuntimeConfiguration,
+)
+
+
+logger = get_logger(__name__)
+
+
+class ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for the cluster analysis step
+    as required by the RayTransformLauncher. Cluster analysis does not use a
+    RayRuntime class, so the superclass only needs the base configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=ClusterAnalysisTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration())
+    logger.info("Launching transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/compute_shingles.py b/transforms/universal/fdedup/ray/src/compute_shingles.py
deleted file mode 100644
index 2db75ebe2..000000000
--- a/transforms/universal/fdedup/ray/src/compute_shingles.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an “AS IS” BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################ - -import string - - -""" -This implements the most simplistic splitting of document based on the white spaces -that can be overwritten by a different document splitter (tokenizer). This method is -build in the library and can be overwritten using approach described at -https://stackoverflow.com/questions/37553545/how-do-i-override-a-function-of-a-python-library - -import compute_shingles -compute_shingles.compute_shingles = my_local_compute_shingles -""" - - -def _find(s: str, ch: str) -> list[int]: - """ - Get indexes of all locations of character in string - :param s: string - :param ch: character - :return: list of locations - """ - return [i for i, ltr in enumerate(s) if ltr == ch] - - -def compute_shingles(txt: str, word_shingle_size: int, delimiter: str = " ") -> list[str]: - """ - Generate word shingles - :param txt: document - :param delimiter: delimiter to split document - :param word_shingle_size: size of shingle in words - :return: list of shingles - """ - text = txt.replace("\n", "").lower().translate(str.maketrans("", "", string.punctuation)) - separators = _find(text, delimiter) - if len(separators) + 1 <= word_shingle_size: - return [text] - bounds = [-1] + separators + [len(text)] - return [text[bounds[i] + 1 : bounds[i + word_shingle_size]] for i in range(0, len(bounds) - word_shingle_size)] diff --git a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py similarity index 59% rename from transforms/universal/fdedup/ray/src/fdedup_local_ray.py rename to transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index af7bec71c..54fa2ccac 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -13,59 +13,52 @@ import os import sys +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ) +) worker_options = {"num_cpus": 0.8} + code_location = {"github": "github", "commit_hash": "12345", "path": "path"} params = { # where to run "run_locally": True, # Data access. 
Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 1, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_creation_delay": 0, "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, } -sys.argv = ParamsUtils.dict_to_req(d=params) -# launch -launcher.launch() + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py new file mode 100644 index 000000000..9fdb220f7 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -0,0 +1,120 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +import ray +from data_cleaning_transform import ( + DataCleaningTransform, + DataCleaningTransformConfiguration, + docs2remove_list, + docs2remove_list_key, + get_docs_to_remove, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, +) +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from ray.actor import ActorHandle + + +logger = get_logger(__name__) + + +class DataCleaningRayTransform(DataCleaningTransform): + """ """ + + def __init__(self, config: dict): + """ + Initialize based on the dictionary of configuration information. 
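+        (The runtime publishes the docs-to-remove list to the Ray object store;
+        the reference stored under docs2remove_list_key is resolved with ray.get below.)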
+        This is generally called with configuration parsed from the CLI arguments
+        defined by the companion runtime, DataCleaningRuntime.
+        """
+        docs2remove = config.get(docs2remove_list_key, None)
+        if docs2remove is not None:
+            # This is the recommended approach for production. In this case the docs-to-remove list is built by the
+            # runtime once, loaded to the object store and can be accessed by actors without additional reads
+            try:
+                config[docs2remove_list_key] = ray.get(config.get(docs2remove_list_key))
+            except Exception as e:
+                self.logger.warning(f"Exception loading docs-to-remove list from ray object storage {e}")
+                raise RuntimeError(f"exception loading from object storage for key {docs2remove}")
+        super().__init__(config)
+
+
+class DataCleaningRuntime(DefaultRayTransformRuntime):
+    """
+    Data cleaning runtime support
+    """
+
+    def __init__(self, params: dict[str, Any]):
+        """
+        Create data cleaning runtime
+        :param params: runtime parameters, including the location of the
+            duplicate document list to load into the Ray object store
+        """
+        super().__init__(params)
+        from data_processing.utils import get_logger
+
+        self.logger = get_logger(__name__)
+
+    def get_transform_config(
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        statistics: ActorHandle,
+        files: list[str],
+    ) -> dict[str, Any]:
+        """
+        Set environment for transform execution
+        :param data_access_factory - data access factory
+        :param statistics - reference to the statistics object
+        :param files - list of files to process
+        :return: dictionary of transform init params
+        """
+        docs_to_remove = get_docs_to_remove(self.params)
+        docs_to_remove_list = ray.put(docs_to_remove)
+        return {docs2remove_list_key: docs_to_remove_list} | self.params
+
+
+class DataCleaningRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for the data cleaning step
+    as required by the RayTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(
+            transform_config=DataCleaningTransformConfiguration(transform_class=DataCleaningRayTransform),
+            runtime_class=DataCleaningRuntime,
+        )
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration())
+    logger.info("Launching transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py b/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py
deleted file mode 100644
index 285fcfa22..000000000
--- a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an “AS IS” BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import sys - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) -# create parameters -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} - -s3_conf = { - "input_folder": "test/fdedup/input", - "output_folder": "test/fdedup/output", -} -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -params = { - # where to run - "run_locally": True, - # Data access. Only required parameters are specified - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 5, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 2, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, -} -sys.argv = ParamsUtils.dict_to_req(d=params) - - -# launch -launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_support.py b/transforms/universal/fdedup/ray/src/fdedup_support.py deleted file mode 100644 index 60afb84bf..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_support.py +++ /dev/null @@ -1,621 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
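
The file removed below, fdedup_support.py, carried the MinHash-LSH machinery for fuzzy dedup. Its fuzzy_optimal_param() picks the (bands, rows) split whose weighted false-positive/false-negative area around the threshold is smallest. As a reference point for this deletion, here is a minimal standalone sketch of the S-curve being optimized; it assumes scipy is installed, the b=8, r=8 split of 64 permutations is an illustrative choice, while the 0.8 threshold and the 0.5/0.5 weights match the defaults used in this transform:

# Minimal sketch of the LSH S-curve that fuzzy_optimal_param() optimizes.
# For Jaccard similarity s, b bands of r rows each:
#     P(candidate pair) = 1 - (1 - s**r)**b
from scipy.integrate import quad


def candidate_probability(s: float, b: int, r: int) -> float:
    return 1.0 - (1.0 - s ** float(r)) ** float(b)


threshold, b, r = 0.8, 8, 8  # illustrative split of 64 permutations
# Area of the curve below the threshold is the false-positive mass;
# area of its complement above the threshold is the false-negative mass.
fp, _ = quad(lambda s: candidate_probability(s, b, r), 0.0, threshold)
fn, _ = quad(lambda s: 1.0 - candidate_probability(s, b, r), threshold, 1.0)
print(f"fp area={fp:.4f}, fn area={fn:.4f}, weighted error={0.5 * fp + 0.5 * fn:.4f}")
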
-################################################################################ - -import pickle -import time -from typing import Any, Iterator, Union - -import numpy as np -import ray -from data_processing.data_access import SnapshotUtils -from data_processing.utils import GB, RANDOM_SEED, TransformUtils, get_logger -from data_processing_ray.runtime.ray import RayUtils -from ray.actor import ActorHandle -from ray.util import ActorPool -from scipy.integrate import quad as integrate - - -NO_SIMILARITY = -1 -REQUEST_LEN = 4096 -LONG_BUCKET = 5000 -LONG_BUCKET_PRINT = 1000 - - -def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, -) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt - - -class MurmurMH: - def __init__(self, num_perm: int, seed: int = RANDOM_SEED): - self.seed = seed - self.num_perm = num_perm - self.permutations = self._init_permutations(seed, num_perm) - - def minhash(self, shingle_count: int, shingles: Iterator[str]) -> np.array: - def generator(): - for shingle in shingles: - yield TransformUtils.str_to_int(shingle) - - hash_values = np.fromiter(generator(), dtype=np.uint64, count=shingle_count) - - result = np.zeros(self.permutations.shape, dtype=np.uint32) - for i, perm in enumerate(self.permutations): - result[i] = np.right_shift((perm * hash_values).T, 32).astype(np.uint32).min(axis=0, keepdims=False) - return result - - @staticmethod - def _init_permutations(seed: int, num_perm: int) -> np.array: - # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic - max_int = np.uint64((1 << 64) - 1) - gen = np.random.RandomState(seed) - # get self.num_perm pseudo random numbers between 2 and max_int (excl) - permutations = np.array([gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], dtype=np.uint64).T - # make all even pseudo random numbers odd by adding 1 - permutations[permutations % 2 == 0] += 1 - return permutations - - @staticmethod - def jaccard(mh1: np.array, mh2: np.array) -> float: - return np.count_nonzero(mh1 == mh2) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocCollector: - """ - An actor 
collecting de duped document IDs - """ - - def __init__(self, params: dict[str, Any]): - """ - Initializer - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - self.removed = set() - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.ids = {} - else: - try: - bids, _ = self.data_access.get_file(snapshot) - self.ids = pickle.loads(bids) - except Exception as e: - self.logger.warning(f"Failed to load doc collector {self.actor_id} with exception {e}") - raise e - - def add_documents(self, dr: tuple[list[tuple[int, int]], list[int]]) -> None: - """ - Add documents and removed document - :param dr: documents to keep and documents to remove - :return: - """ - docs = dr[0] - rm = dr[1] - # process documents to remove - for did in rm: - self.ids.pop(did, None) - self.removed.update(rm) - # process documents to keep - for key, val in docs: - if key in self.removed: - continue - if key in self.ids and val == NO_SIMILARITY: - # Do not update existing docs with NO_SIMILARITY - continue - else: - self.ids[key] = val - - def filter(self, docs: list[int]) -> dict[int, int]: - """ - Filter documents - :param docs: documents to filter - :return: documents to keep - """ - result = {} - for doc_id in docs: - r = self.ids.get(doc_id, None) - if r is not None: - result[doc_id] = r - return result - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.ids) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}docs/doc_collector_{self.actor_id}", b_doc - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot doc collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float, int, float]: - """ - get sizes - :return: number of ids, its memory utilization, number of removed, its memory utilization - """ - return ( - len(self.ids), - TransformUtils.deep_get_size(self.ids) / GB, - len(self.removed), - TransformUtils.deep_get_size(self.removed) / GB, - ) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocsMinHash: - """ - An actor storing min hashes for a doc id - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialize - :param params: parameters - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.docs = {} - else: - try: - bdocs, _ = self.data_access.get_file(snapshot) - self.docs = pickle.loads(bdocs) - except Exception as e: - self.logger.warning(f"Failed to load minhash collector {self.actor_id} with exception {e}") - raise e - - def add_minhashes(self, updates: list[tuple[int, int, np.array]]) -> None: - """ - Add minhashes - :param updates: minhash for doc_id a tuple of doc len and array of hashes - :return: None - """ - for doc_id, length, minhash in updates: - self.docs[doc_id] = np.concatenate(([length], minhash)) - - def get_minhashes(self, doc_ids: list[int]) -> list[tuple[int, int, np.array]]: - """ - Get minhashes for a list of documents - :param doc_ids: list of doc ids - :return: doc id, len, minhashes - """ - result = [] - for doc_id in doc_ids: - info = self.docs.get(doc_id) - if info is not None: - result.append((doc_id, info[0], info[1:])) - return result - - def snapshot(self) -> None: - 
""" - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.docs) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}minhash/minhash_collector_{self.actor_id}", - b_doc, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot minhash collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get size of used min hashes - :return: number of docs, its memory utilization - """ - return len(self.docs), TransformUtils.deep_get_size(self.docs) / GB - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHash: - """ - Actor storing buckets information - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialization - """ - from ray.util.metrics import Counter - - self.submitter = None - self.n_buckets = 0 - self.bucket_memory = 0 - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.buckets = {} - else: - try: - b_buckets, _ = self.data_access.get_file(snapshot) - self.buckets = pickle.loads(b_buckets) - except Exception as e: - self.logger.warning(f"Failed to load buckets collector {self.actor_id} with exception {e}") - raise e - self.bucket_created_counter = Counter("bucket_created", "Amount of buckets created") - self.long_bucket_submit_counter = Counter("long_bucket_submitted", "Amount of long buckets submitted") - self.short_bucket_submit_counter = Counter("short_bucket_submitted", "Amount of short buckets submitted") - - def add_buckets(self, bck: list[tuple[int, list[int]]]) -> None: - """ - Add additional buckets to hash - :param bck: bucket information - :return: None - """ - for bucket in bck: - b_hash = bucket[0] - buckets_for_hash = self.buckets.get(b_hash) - if buckets_for_hash: - if type(buckets_for_hash) == int: - self.buckets[b_hash] = [buckets_for_hash] + bucket[1] - else: - buckets_for_hash.extend(bucket[1]) - else: - if len(bucket[1]) == 1: - self.buckets[b_hash] = bucket[1][0] - else: - self.buckets[b_hash] = bucket[1] - self.bucket_created_counter.inc(1) - - def add_processing_submitter(self, submitter: ActorHandle) -> None: - """ - Add process submitter - :param submitter: reference to submitter - :return: - """ - self.submitter = submitter - - def process_buckets(self) -> None: - """ - Process buckets to generate documents - :return: None - """ - - # Remember usage - self.n_buckets = len(self.buckets) - self.bucket_memory = TransformUtils.deep_get_size(self.buckets) / GB - - # split buckets into short and long. 
Long buckets can take very long to process
-        long_buckets = []
-        short_buckets = []
-        while len(self.buckets) > 0:
-            doc_id, bucket = self.buckets.popitem()
-            if type(bucket) == list and len(bucket) > LONG_BUCKET:
-                # It's long
-                long_buckets.append(bucket)
-            else:
-                short_buckets.append(bucket)
-        self.logger.info(f"processing buckets {len(long_buckets)} long, {len(short_buckets)} short")
-
-        # process long buckets first - we are submitting them one at a time
-        for bucket in long_buckets:
-            if len(bucket) > 2 * LONG_BUCKET:
-                # For very long buckets, split them
-                self.logger.info(f"Splitting bucket of length {len(bucket)} into chunks")
-                smaller_bucket = [
-                    bucket[i * LONG_BUCKET : (i + 1) * LONG_BUCKET]
-                    for i in range((len(bucket) + LONG_BUCKET - 1) // LONG_BUCKET)
-                ]
-                for b in smaller_bucket:
-                    ray.get(self.submitter.submit_for_processing.remote([b]))
-                    self.long_bucket_submit_counter.inc(1)
-            else:
-                ray.get(self.submitter.submit_for_processing.remote([bucket]))
-                self.long_bucket_submit_counter.inc(1)
-        self.logger.info("Done submitting long buckets")
-
-        # And now the rest of buckets
-        bucket_chunks = [short_buckets[i * 100 : (i + 1) * 100] for i in range((len(short_buckets) + 99) // 100)]
-        for b in bucket_chunks:
-            ray.get(self.submitter.submit_for_processing.remote(b))
-            self.short_bucket_submit_counter.inc(len(b))
-
-    def snapshot(self) -> None:
-        """
-        Snapshotting itself
-        """
-        try:
-            b_buckets = pickle.dumps(self.buckets)
-            self.data_access.save_file(
-                f"{SnapshotUtils.get_snapshot_folder(self.data_access)}buckets/buckets_collector_{self.actor_id}",
-                b_buckets,
-            )
-        except Exception as e:
-            self.logger.warning(f"Failed to snapshot buckets collector {self.actor_id} with exception {e}")
-            raise e
-
-    def get_size(self) -> tuple[int, float]:
-        """
-        Get buckets resource utilization
-        :return: number of buckets and memory utilization
-        """
-        return self.n_buckets, self.bucket_memory
-
-
-@ray.remote(scheduling_strategy="SPREAD")
-class BucketsHashProcessor:
-    """
-    Actor for processing buckets
-    """
-
-    def __init__(self, params: dict[str, Any]):
-        """
-        Init method
-        :param params - dictionary of parameters containing the following keys
-            remote_docs - handles to the remote docs
-            remote_minhashes - handles to the remote minhashes
-            mn_min_hash - MurmurMH class
-            threshold - threshold
-            statistics - statistics actor
-        """
-        from ray.util.metrics import Counter
-
-        self.threshold = params["threshold"]
-        self.mn_min_hash = params["mn_min_hash"]
-        self.remote_docs = params["remote_docs"]
-        self.remote_minhashes = params["remote_minhashes"]
-        self.stats = params["statistics"]
-        self.logger = get_logger(__name__)
-        self.bucket_processed_counter = Counter("bucket_processed", "Amount of buckets processed")
-
-    def _submit_generated_docs(self, docs: dict[int, int], removed: set[int]) -> None:
-        """
-        Submit generated documents
-        :param docs: docs to submit
-        :param removed: removed documents
-        :return: None
-        """
-        # Remove doc ids that are already removed
-        for did in removed:
-            docs.pop(did, None)
-        # Build remote requests
-        request = [([], []) for _ in range(len(self.remote_docs))]
-        for key, value in docs.items():
-            req_tuple = request[key % len(self.remote_docs)]
-            req_tuple[0].append((key, value))
-        for did in removed:
-            req_tuple = request[did % len(self.remote_docs)]
-            req_tuple[1].append(did)
-        # Submit requests and wait for replies
-        remote_replies = []
-        i = 0
-        for req in request:
-            if len(req[0]) > 0 or len(req[1]) > 0:  # Only submit if the request has data
remote_replies.append(self.remote_docs[i].add_documents.remote(req)) - i += 1 - # Process replies - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - # get minhashes and length for docs in the bucket - def _get_minhashes_docs(self, doc_ids: list[int]) -> dict[int, tuple[int, list[int]]]: - """ - Get minhashes for documents by submitting requests to an appropriate doc collectors - :param doc_ids: doc ids - :return: doc ids with hashes - """ - request = [[] for _ in range(len(self.remote_minhashes))] - for value in doc_ids: - request[value % len(self.remote_minhashes)].append(value) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.remote_minhashes[i].get_minhashes.remote(req)) - i += 1 - # Process replies - hashes = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - for r in reply: - hashes[r[0]] = (r[1], r[2]) - remote_replies = not_ready - return hashes - - def process_buckets(self, buckets: list[Union[int, list[int]]]) -> None: - """ - process buckets to generate documents - :param buckets: buckets - :return: none - """ - t_start = time.time() - docs = {} - removed = set() - for bucket in buckets: - if type(bucket) == int: - # This hash has a single document - if bucket not in docs: - docs[bucket] = NO_SIMILARITY - self.bucket_processed_counter.inc(1) - continue - # multiple documents - start = time.time() - bucket_len = len(bucket) - very_long = bucket_len > LONG_BUCKET - - hashes = self._get_minhashes_docs(bucket) - set_list = [] - unvisited = set(bucket) - - # combine similar documents - index = 0 - while len(unvisited) > 0: - current_doc_id = unvisited.pop() - current_mh = hashes[current_doc_id][1] - current_set = set() - for other_doc_id in bucket: - if other_doc_id in unvisited: - other_mh = hashes[other_doc_id][1] - if self.mn_min_hash.jaccard(current_mh, other_mh) >= self.threshold: - current_set.add(current_doc_id) - current_set.add(other_doc_id) - unvisited.discard(other_doc_id) - if len(current_set) > 0: - set_list.append(current_set) - index += 1 - if index % LONG_BUCKET_PRINT == 0: - self.logger.info(f"processing very long {bucket_len} bucket, {index} documents so far") - if index > LONG_BUCKET_PRINT: - self.logger.info(f"done processing very long {bucket_len}") - - # process created sets - for current_set in set_list: - for d in current_set: - bucket.remove(d) - removed.update(current_set) - for i, doc_id in enumerate(current_set): - if i == 0: - cluster_id = doc_id - remaining = doc_id - min_len = hashes[doc_id][0] - max_len = min_len - continue - c_len = hashes[doc_id][0] - if c_len > max_len: - max_len = c_len - remaining = doc_id - continue - if c_len <= min_len: - min_len = c_len - cluster_id = doc_id - docs[remaining] = cluster_id - removed.discard(remaining) - - # if we did not find docs in connections, submit them as NO_SIMILARITY - for d in bucket: - if d not in docs: - docs[d] = NO_SIMILARITY - if very_long: - self.logger.info( - f"Processed long ({bucket_len}) bucket in {round((time.time() - start) / 60.,3)} " - f"min; " - f"docs chains {len(set_list)}" - ) - self.bucket_processed_counter.inc(1) - # Submit docs - self._submit_generated_docs(docs, removed) - # peg stats - self.stats.add_stats.remote({"generated doc_ids": len(docs), "bucket processing time": time.time() - t_start}) - - -@ray.remote(scheduling_strategy="SPREAD") -class 
BucketsHashProcessorInvoker(object): - """ - Bucket hash processing coordinator (singleton) - """ - - def __init__(self, bucket_processors: list[ActorHandle]) -> None: - self.n_processors = len(bucket_processors) - self.pool = ActorPool(bucket_processors) - self.submitted = 0 - self.processed = 0 - self.logger = get_logger(__name__) - self.start = time.time() - - def submit_for_processing(self, buckets: list[Union[int, list[int]]]) -> None: - # Get completed results - if self.submitted < self.n_processors: # still have room - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.logger.debug("Submitted bucket processing request") - self.submitted += 1 - return - else: - while True: - # we can have several workers fail here - try: - self.pool.get_next_unordered() - break - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") - self.logger.debug("Completed bucket processing request") - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.submitted += 1 - self.logger.debug("Submitted bucket processing request") - return - - def wait_for_completion(self) -> None: - self.logger.info(f"Waiting bucket processing completion. Submitted requests {self.submitted}") - while self.pool.has_next(): - try: - self.pool.get_next_unordered() - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py deleted file mode 100644 index 6c6c02bb3..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py +++ /dev/null @@ -1,803 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
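
The BucketsHashProcessorInvoker just removed throttles work through ray.util.ActorPool: it submits freely until every processor is busy, then blocks on get_next_unordered() before each further submit, and finally drains the pool. A minimal sketch of that backpressure pattern follows; the Worker actor and the pool size of 2 are illustrative assumptions, not code from this repo:

# Sketch of the ActorPool backpressure pattern used by BucketsHashProcessorInvoker.
import ray
from ray.util import ActorPool


@ray.remote
class Worker:
    def process_buckets(self, buckets: list) -> int:
        return len(buckets)  # stand-in for real bucket processing


ray.init()
workers = [Worker.remote() for _ in range(2)]
pool = ActorPool(workers)
submitted = 0
for work in ([1, 2], [3], [4, 5, 6], [7]):
    if submitted >= len(workers):
        pool.get_next_unordered()  # block until a worker frees up
    pool.submit(lambda actor, v: actor.process_buckets.remote(v), work)
    submitted += 1
while pool.has_next():  # drain outstanding results
    print(pool.get_next_unordered())
ray.shutdown()
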
-################################################################################ - -import random -import time -from argparse import ArgumentParser, Namespace -from typing import Any - -import mmh3 -import numpy as np -import pyarrow as pa -import ray -from data_processing.data_access import DataAccessFactoryBase, SnapshotUtils -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import ( - RANDOM_SEED, - CLIArgumentProvider, - TransformUtils, - str2bool, -) -from data_processing_ray.runtime.ray import ( - DefaultRayTransformRuntime, - RayTransformFileProcessor, - RayTransformLauncher, - RayUtils, -) -from data_processing_ray.runtime.ray.runtime_configuration import ( - RayTransformRuntimeConfiguration, -) -from fdedup_support import ( - REQUEST_LEN, - BucketsHash, - BucketsHashProcessor, - BucketsHashProcessorInvoker, - DocCollector, - DocsMinHash, - MurmurMH, - fuzzy_optimal_param, -) -from ray.actor import ActorHandle -from ray.util import ActorPool - - -short_name = "fdedup" -cli_prefix = f"{short_name}_" - - -class FdedupTransform(AbstractTableTransform): - """ - Implements fuzzy dedup data preprocessor (building tables and minhashes). - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. - :param config: initialization parameters, with the following keys - doc_column - name of doc column - doc_id_int_column - name of int doc id column - word_shingle_size - word shingle size - mn_min_hash - MurmurMH class - num_bands - number of bands - length_band band length - remote_buckets - bucket actors - remote_minhashes - minhash actors - delimiter - delimiter - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.word_shingle_size = config.get("word_shingle_size", 1) - self.delimiter = config.get("delimiter", " ") - self.mn_min_hash = config.get("mn_min_hash", None) - self.num_bands = config.get("num_bands", 1) - self.length_band = config.get("length_band", 1) - self.buckets = config.get("remote_buckets", []) - self.minhashes = config.get("remote_minhashes", []) - self.random_delay_limit = config.get("random_delay_limit", 10) - - def _generate_minhashes(self, shingles: list[str]) -> np.array: - """ - Generate minhashes - :param shingles: - :return: generated minhashes - """ - min_hashes = self.mn_min_hash.minhash(len(shingles), shingles) - num_min_hashes = len(min_hashes) - assert self.num_bands * self.length_band <= num_min_hashes, ( - f"num_bans*band_len must be <= num min hashes, was num_bands={self.num_bands}, " - f"bands_len={self.length_band}, num_min hashes={num_min_hashes}" - ) - return min_hashes - - def _generate_buckets(self, min_hashes: np.array) -> list[int]: - """ - Generate buckets - :param min_hashes: array of minhashes - :return: - """ - return [ - mmh3.hash64(min_hashes[i * self.length_band : (i + 1) * self.length_band], seed=RANDOM_SEED, signed=False)[ - 0 - ] - for i in range(self.num_bands) - ] - - def _submit_buckets_minhashes( - self, buckets: dict[int, list[int]], minhashes: list[tuple[int, int, np.array]] - ) -> None: - """ - Submit buckets to hash - :param buckets: buckets - :param minhashes: minhashes - :return: None - """ - # bucket requests - request = [[] for _ in range(len(self.buckets))] - for key, value in buckets.items(): - request[key % len(self.buckets)].append((key, value)) - # Submit requests to 
appropriate bucket collectors - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.buckets[i].add_buckets.remote(req)) - i += 1 - # Minhashes - request = [[] for _ in range(len(self.minhashes))] - for minh in minhashes: - request[minh[0] % len(self.minhashes)].append(minh) - # Submit requests to appropriate minhash collectors - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.minhashes[i].add_minhashes.remote(req)) - i += 1 - # wait for completion - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - Preprocessing table content. - :param table: table - :param file_name - name of currently processed file - :return: resulting table, statistics - """ - from compute_shingles import compute_shingles - - def flush(limit: int) -> None: - """ - flushing buckets and minhashes to dedicated actors - :param limit: number of buckets to flush - :return: None - """ - if len(buckets) >= limit: # time to submit - nonlocal num_buckets - nonlocal num_minhashes - self._submit_buckets_minhashes(buckets, minhashes) - num_buckets = num_buckets + len(buckets) - num_minhashes = num_minhashes + len(minhashes) - buckets.clear() - minhashes.clear() - - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # Inner variables - buckets = {} - minhashes = [] - num_buckets = 0 - num_minhashes = 0 - docs = table[self.doc_column] - doc_ids = table[self.doc_id_column] - # for every document/its integer id - for n in range(table.num_rows): - doc = docs[n].as_py() - doc_id = doc_ids[n].as_py() - shingles = compute_shingles(txt=doc, word_shingle_size=self.word_shingle_size, delimiter=self.delimiter) - if len(shingles) > 0: - mh = self._generate_minhashes(shingles) - minhashes.append((doc_id, len(doc), mh)) - candidates = self._generate_buckets(mh) - - for b_hash in candidates: - bucket_array = buckets.get(b_hash) - if bucket_array is None: - buckets[b_hash] = [doc_id] - else: - bucket_array.append(doc_id) - flush(REQUEST_LEN) - flush(0) - # peg stats - stats = {"generated buckets": num_buckets, "generated minhashes": num_minhashes} - time.sleep(int(random.random() * self.random_delay_limit)) - return [], stats - - -class FdedupFilter(AbstractTableTransform): - """ - Filtering documents - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. - The dictionary should contain the following: - doc_column - name of doc column - doc_id_int_column - name of int doc id column - cluster_column - name of the cluster column - remote_docs - list of remote doc collectors - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.cluster_column = config.get("cluster_column", "") - self.docs = config.get("remote_docs", "") - self.random_delay_limit = config.get("random_delay_limit", 10) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - De duping (filtering) table content. 
- :param table: table - :param file_name: name of the currently processing file - :return: resulting table, statistics - """ - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # inner variables - ids = table.column(self.doc_id_column) - # Submit requests to an appropriate doc collectors - request = [[] for _ in range(len(self.docs))] - for value in ids: - doc_id = value.as_py() - request[doc_id % len(self.docs)].append(doc_id) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.docs[i].filter.remote(req)) - i += 1 - # Process replies - unique = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - unique.update(reply) - remote_replies = not_ready - # Filter out table - mask = [] - clusters = [] - # Actual filtering - for n in range(table.num_rows): - doc_id = ids[n].as_py() - if doc_id in unique: - mask.append(True) - clusters.append(unique.pop(doc_id)) - else: - mask.append(False) - # build out table - out_table = TransformUtils.add_column(table=table.filter(mask), name=self.cluster_column, content=clusters) - # build execution statistics - stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows} - time.sleep(int(random.random() * self.random_delay_limit)) - return [out_table], stats - - -class FdedupRuntime(DefaultRayTransformRuntime): - """ - Fuzzy dedup runtime support. Here we are using set environment to implement first two steps of fuzzy dedup - processing - preprocessing and bucket hash processing - """ - - def __init__(self, params: dict[str, Any]): - """ - Create filter runtime - :param params: parameters, that should include - doc_column - name of the document column - id_column - name of the integer doc id column - cluster_column - name of the cluster column - worker_options - start options for preprocessor - from the orchestrator configuration - bucket_cpu - number of cpus for bucket actor - doc_cpu - number of cpus for doc actor - mhash_cpu - number of cpus for minhash actor - num_doc_actors - number of document actors - num_bucket_actors - number of bucket actors - num_minhash_actors - number of minhash actors - num_preprocessors - number of preprocessors - snapshot_delay - delay (sec) in sending snapshot requests to actors - use_bucket_snapshot - use bucket snapshot - use_doc_snapshot - use doc snapshot - random_delay_limit - random_delay limit - # fuzzy specific parameters - num_permutations - number of permutations - threshold - threshold - world_shingle_size - word shingles size - delimiters - delimiter - """ - from data_processing.utils import get_logger - - super().__init__(params) - self.logger = get_logger(__name__) - self.sum_buckets = 0 - self.sum_buckets_mem = 0 - self.sum_mh = 0 - self.sum_mh_mem = 0 - self.document_collectors = [] - self.snapshot_delay = self.params.get("snapshot_delay", 1) - self.random_delay_limit = self.params.get("random_delay_limit", 10) - - def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> dict[str, Any]: - """ - Set environment for filter execution - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: dictionary of filter init params - """ - if self.params.get("use_doc_snapshot", 
False): - self.logger.info("continuing from the document actors snapshot") - data_access = data_access_factory.create_data_access() - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}docs" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.info(f"Found the following snapshot files {files.keys()}") - self.document_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - self.document_collectors[i] = DocCollector.options( - **{"num_cpus": self.params.get("doc_cpu", 0.5)} - ).remote({"id": i, "data_access": data_access_factory, "snapshot": file}) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(self.document_collectors)} document collectors to continue processing") - else: - self.logger.info("starting run from the beginning") - self._create_doc_actors(data_access_factory=data_access_factory, statistics=statistics, files=files) - return { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "cluster_column": self.params.get("cluster_column", ""), - "remote_docs": self.document_collectors, - "random_delay_limit": self.random_delay_limit, - } - - def _create_doc_actors( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: None - """ - mn_min_hash = MurmurMH(num_perm=self.params.get("num_permutations", 64), seed=RANDOM_SEED) - if self.params.get("use_bucket_snapshot", False): - self.logger.info("continuing from the bucket actors snapshot") - data_access = data_access_factory.create_data_access() - # recreate bucket collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}buckets" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following bucket snapshot files {files.keys()}") - bucket_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(bucket_collectors)} bucket collectors to continue processing") - # recreate minhash collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}minhash" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following minhash snapshot files {files.keys()}") - minhash_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - self.logger.info(f"Created {len(minhash_collectors)} minhash collectors to 
continue processing") - else: - self.logger.info("continuing from the very beginning") - self._create_doc_actors_internal( - data_access_factory=data_access_factory, statistics=statistics, mn_min_hash=mn_min_hash, files=files - ) - - def _create_doc_actors_internal( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - mn_min_hash: MurmurMH, - files: list[str], - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param mn_min_hash - MurmurMH class - :param files - list of files to process - :return: None - """ - # compute fuzzy dedup parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=self.params.get("threshold", 0.8), - num_perm=self.params.get("num_permutations", 64), - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - self.logger.info(f"Fuzzy: num buckets {num_buckets}, bucket length {length_bucket}") - # Build bucket and minhash collectors - bucket_collectors = [None] * self.params.get("num_bucket_actors", 1) - for i in range(self.params.get("num_bucket_actors", 1)): - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(bucket_collectors)} bucket actors") - minhash_collectors = [None] * self.params.get("num_minhash_actors", 1) - for i in range(self.params.get("num_minhash_actors", 1)): - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(minhash_collectors)} minhash actors") - self._preprocess_tables( - data_access_factory=data_access_factory, - statistics=statistics, - files=files, - mn_min_hash=mn_min_hash, - num_buckets=num_buckets, - length_bucket=length_bucket, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - random_delay_limit=self.random_delay_limit, - ) - # At this point we can snapshot both bucket and minhash collectors for potential restart - self.logger.info("creating minhash snapshots") - minhash_replies = [None] * len(minhash_collectors) - index = 0 - for collector in minhash_collectors: - minhash_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while minhash_replies: - ready, not_ready = ray.wait(minhash_replies) - minhash_replies = not_ready - self.logger.info("minhash snapshots created") - self.logger.info("creating bucket snapshots") - bucket_replies = [None] * len(bucket_collectors) - index = 0 - for collector in bucket_collectors: - bucket_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while bucket_replies: - ready, not_ready = ray.wait(bucket_replies) - bucket_replies = not_ready - self.logger.info("bucket snapshots created") - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - - def _process_buckets( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - mn_min_hash: MurmurMH, - ) -> None: - """ - Process buckets - :param data_access_factory - data access factory - :param statistics - statistics actor - :param bucket_collectors - bucket collectors - 
:param minhash_collectors - minhash collectors - :param mn_min_hash - MMurmurMH class - :return: None - """ - # Create document collectors - self.document_collectors = [None] * self.params.get("num_doc_actors", 1) - for i in range(self.params.get("num_doc_actors", 1)): - self.document_collectors[i] = DocCollector.options(**{"num_cpus": self.params.get("doc_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(self.document_collectors)} document actors") - # create bucket processors - bucket_processors_list = RayUtils.create_actors( - clazz=BucketsHashProcessor, - params={ - "remote_docs": self.document_collectors, - "remote_minhashes": minhash_collectors, - "mn_min_hash": mn_min_hash, - "threshold": self.params.get("threshold", 0.8) * self.params.get("num_permutations", 64), - "statistics": statistics, - }, - actor_options=self.params.get("worker_options", None), - n_actors=self.params.get("num_preprocessors", 1), - ) - self.logger.info(f"created {len(bucket_processors_list)} bucket processor actors") - # create bucket processors invoker - bucket_processor_invoker = BucketsHashProcessorInvoker.options( - num_cpus=self.params.get("bucket_cpu", 0.5) - ).remote(bucket_processors=bucket_processors_list) - self.logger.info(f"created bucket processor invoker") - # Add invoker to the buckets - bucket_replies = [ - collector.add_processing_submitter.remote(submitter=bucket_processor_invoker) - for collector in bucket_collectors - ] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - self.logger.info(f"added invoker to bucket collectors") - # start bucket processing and wait for completion - start = time.time() - bucket_replies = [collector.process_buckets.remote() for collector in bucket_collectors] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - # Wait for pool to complete - ray.get(bucket_processor_invoker.wait_for_completion.remote()) - self.logger.info(f"Done processing buckets in {round((time.time() - start) / 60.,3)} min") - # At this point we can save doc actors, in case we would want to restart here - self.logger.info(f"creating document snapshots") - doc_replies = [None] * len(self.document_collectors) - index = 0 - for collector in self.document_collectors: - doc_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while doc_replies: - ready, not_ready = ray.wait(doc_replies) - doc_replies = not_ready - self.logger.info(f"document snapshots created") - # At this point we do not need bucket and minhash actors, remove them - # but first get usage information - # Bucket collector - replies = [collector.get_size.remote() for collector in bucket_collectors] - while replies: - ready, not_ready = ray.wait(replies) - b_amount, b_memory = ray.get(ready)[0] - self.sum_buckets += b_amount - self.sum_buckets_mem += b_memory - replies = not_ready - for collector in bucket_collectors: - ray.kill(actor=collector, no_restart=True) - # minhash collector - replies = [collector.get_size.remote() for collector in minhash_collectors] - while replies: - ready, not_ready = ray.wait(replies) - m_amount, m_memory = ray.get(ready)[0] - self.sum_mh += m_amount - self.sum_mh_mem += m_memory - replies = not_ready - for collector in minhash_collectors: - ray.kill(actor=collector, no_restart=True) - # Clean up processors - for processor in bucket_processors_list: - ray.kill(actor=processor, no_restart=True) - ray.kill(bucket_processor_invoker) - - def 
_preprocess_tables( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - files: list[str], - mn_min_hash: MurmurMH, - num_buckets: int, - length_bucket: int, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - random_delay_limit: int, - ) -> None: - """ - Preprocess tables - build, run and cleanup - :param data_access_factory - data access factory - :param statistics - statistics actor - :param files - list of files to process - :param mn_min_hash - MurmurMH class - :param num_buckets - number of buckets - :param length_bucket - bucket length - :param bucket_collectors - bucket collector actors - :param minhash_collectors - minhash_collector actors - :param random_delay_limit - max for random dalay limit - :return: None - """ - from ray.util.metrics import Gauge - - worker_options = self.params.get("worker_options", None) - # Here we are limiting the number of readers not to overwhelm COS - n_readers = self.params.get("num_preprocessors", 1) - if n_readers > 1000: - n_readers = 1000 - self.logger.info(f"Table preprocessing uses {n_readers} readers") - # Create preprocessing actors - processor_params = { - "data_access_factory": data_access_factory, - "transform_class": FdedupTransform, - "statistics": statistics, - "transform_params": { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "word_shingle_size": self.params.get("world_shingle_size", 1), - "mn_min_hash": mn_min_hash, - "num_bands": num_buckets, - "length_band": length_bucket, - "remote_buckets": bucket_collectors, - "remote_minhashes": minhash_collectors, - "delimiter": self.params.get("delimiter", " "), - "random_delay_limit": random_delay_limit, - }, - "base_table_stats": False, - } - processors_list = RayUtils.create_actors( - clazz=RayTransformFileProcessor, - params=processor_params, - actor_options=worker_options, - n_actors=n_readers, - ) - self.logger.info(f"created {len(processors_list)} table processor actors") - # Execute preprocessing - # create gauges - files_in_progress_gauge = Gauge( - "preprocessing_files_in_progress", "Number of files in progress, preprocessing" - ) - files_completed_gauge = Gauge( - "preprocessing_files_processed_total", "Number of files completed, preprocessing" - ) - available_cpus_gauge = Gauge("preprocessing_available_cpus", "Number of available CPUs, preprocessing") - available_gpus_gauge = Gauge("preprocessing_available_gpus", "Number of available GPUs, preprocessing") - available_memory_gauge = Gauge("preprocessing_available_memory", "Available memory, preprocessing") - available_object_memory_gauge = Gauge( - "preprocessing_available_object_store", "Available object store, preprocessing" - ) - print_interval = int(len(files) / 100) - if print_interval == 0: - print_interval = 1 - # process data - processors = ActorPool(processors_list) - failures = RayUtils.process_files( - executors=processors, - files=files, - print_interval=print_interval, - files_in_progress_gauge=files_in_progress_gauge, - files_completed_gauge=files_completed_gauge, - available_cpus_gauge=available_cpus_gauge, - available_gpus_gauge=available_gpus_gauge, - available_memory_gauge=available_memory_gauge, - object_memory_gauge=available_object_memory_gauge, - logger=self.logger, - ) - if failures > 0: - statistics.add_stats.remote({"actor failures": failures}) - # Clean up processors - for processor in processors_list: - ray.kill(actor=processor, no_restart=True) - del processors - - def 
compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]: - """ - Compute execution statistics - :param stats: output of statistics - :return: job execution statistics - """ - # Get document collector statistics - sum_docs = 0 - sum_docs_mem = 0 - sum_removed = 0 - sum_removed_mem = 0 - replies = [collector.get_size.remote() for collector in self.document_collectors] - while replies: - ready, not_ready = ray.wait(replies) - d_amount, d_memory, r_amount, r_memory = ray.get(ready)[0] - sum_docs += d_amount - sum_docs_mem += d_memory - sum_removed += r_amount - sum_removed_mem += r_memory - replies = not_ready - overall_hash_memory = self.sum_buckets_mem + self.sum_mh_mem + sum_docs_mem + sum_docs_mem + sum_removed_mem - dedup_prst = 100 * (1.0 - stats.get("result_documents", 1) / stats.get("source_documents", 1)) - return { - "number of buckets": self.sum_buckets, - "number of docs": sum_docs, - "number of removed docs": sum_removed, - "number of min hashes": self.sum_mh, - "overall hash memory GB": overall_hash_memory, - "de duplication %": dedup_prst, - } | stats - - -class FdedupTableTransformConfiguration(TransformConfiguration): - """ - Provides support for configuring and using the associated Transform class include - configuration with CLI args and combining of metadata. - """ - - def __init__(self): - super().__init__( - name=short_name, - transform_class=FdedupFilter, - ) - from data_processing.utils import get_logger - - self.logger = get_logger(__name__) - - def add_input_params(self, parser: ArgumentParser) -> None: - """ - Add Transform-specific arguments to the given parser. - """ - parser.add_argument(f"--{cli_prefix}doc_column", type=str, default="contents", help="document column name") - parser.add_argument( - f"--{cli_prefix}id_column", type=str, default="int_document_id", help="integer document id column name" - ) - parser.add_argument(f"--{cli_prefix}cluster_column", type=str, default="cluster", help="cluster column name") - parser.add_argument( - f"--{cli_prefix}bucket_cpu", type=float, default=0.5, help="number of CPUs per bucket hash" - ) - parser.add_argument( - f"--{cli_prefix}mhash_cpu", type=float, default=0.5, help="number of CPUs per minhash hash" - ) - parser.add_argument(f"--{cli_prefix}doc_cpu", type=float, default=0.5, help="number of CPUs per doc hash") - parser.add_argument(f"--{cli_prefix}num_doc_actors", type=int, default=1, help="number of doc actors to use") - parser.add_argument( - f"--{cli_prefix}num_minhash_actors", type=int, default=1, help="number of minhash actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_bucket_actors", type=int, default=1, help="number of bucket actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_preprocessors", type=int, default=1, help="number of preprocessors to use" - ) - parser.add_argument(f"--{cli_prefix}num_permutations", type=int, default=64, help="number of permutations") - parser.add_argument(f"--{cli_prefix}threshold", type=float, default=0.8, help="threshold") - parser.add_argument(f"--{cli_prefix}shingles_size", type=int, default=5, help="number of words in shingle") - parser.add_argument( - f"--{cli_prefix}delimiters", type=str, default=" ", help="delimiter for splitting document" - ) - parser.add_argument(f"--{cli_prefix}snapshot_delay", type=int, default=1, help="snapshot delay time") - parser.add_argument( - f"--{cli_prefix}use_bucket_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with bucket snapshot", - ) - parser.add_argument( 
- f"--{cli_prefix}use_doc_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with doc snapshot", - ) - parser.add_argument( - f"--{cli_prefix}random_delay_limit", type=int, default=10, help="maximum delay between read" - ) - - def apply_input_params(self, args: Namespace) -> bool: - """ - Validate and apply the arguments that have been parsed - :param args: user defined arguments. - :return: True, if validate pass or False otherwise - """ - captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) - self.params = self.params | captured - self.params["worker_options"] = args.runtime_worker_options - if self.params["use_bucket_snapshot"] and self.params["use_doc_snapshot"]: - self.logger.warning("both bucket and doc snapshot are specified. Only one allowed") - return False - - self.logger.info(f"fuzzy dedup params are {self.params}") - return True - - -class FdedupRayTransformConfiguration(RayTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=FdedupTableTransformConfiguration(), runtime_class=FdedupRuntime) - - -if __name__ == "__main__": - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py new file mode 100644 index 000000000..64f492584 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -0,0 +1,54 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 3,
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # signature calculation parameters
+    "minhash_num_permutations": 112,
+    "minhash_num_bands": 14,
+    "minhash_num_segments": 2,
+}
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py
new file mode 100644
index 000000000..bc3c0d991
--- /dev/null
+++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py
@@ -0,0 +1,40 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.utils import CLIArgumentProvider, get_logger
+from data_processing_ray.runtime.ray.runtime_configuration import (
+    RayTransformRuntimeConfiguration,
+)
+from signature_calc_transform import SignatureCalculationTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SignatureCalculationRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for the signature calculation
+    transform, as required by the RayTransformLauncher. It does not use a dedicated
+    RayRuntime class, so the superclass only needs the base python-only configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=SignatureCalculationTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    logger.info("Launching transform")
+    launcher.launch()

From 0c31dc07a06942b3b6eb73cc29a62f512f4c7a00 Mon Sep 17 00:00:00 2001
From: nelson
Date: Fri, 11 Oct 2024 12:25:46 -0400
Subject: [PATCH 013/105] Fixed bug in the Ray runtime to distribute the
 docs-to-remove file to all workers

Signed-off-by: nelson
---
 .../python/src/data_cleaning_transform.py     |  4 +--
 .../ray/src/data_cleaning_transform_ray.py    | 26 ++++++++++---------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py
index f03b6c1d0..05b18cc8b 100644
--- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py
+++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py
@@ -110,10 +110,10 @@ class DataCleaningTransformConfiguration(TransformConfiguration):
     configuration with CLI args.
     """
 
-    def __init__(self):
+    def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningTransform):
         super().__init__(
             name=short_name,
-            transform_class=DataCleaningTransform,
+            transform_class=transform_class,
         )
         self.logger = get_logger(__name__, level="INFO")
 
diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py
index 9fdb220f7..831a6c9c2 100644
--- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py
+++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py
@@ -16,9 +16,8 @@
 from data_cleaning_transform import (
     DataCleaningTransform,
     DataCleaningTransformConfiguration,
-    docs2remove_list,
-    docs2remove_list_key,
-    get_docs_to_remove,
+    duplicate_list_location_default,
+    duplicate_list_location_key,
 )
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.utils import CLIArgumentProvider, get_logger
@@ -45,16 +44,15 @@ def __init__(self, config: dict):
         by the companion runtime, LangSelectorTransformRuntime. If running inside the RayMutatingDriver,
         these will be provided by that class with help from the RayMutatingDriver.
         """
-        docs2remove = config.get(docs2remove_list_key, None)
-        if docs2remove is not None:
+        docs2removedf = config.get("df", None)
+        if docs2removedf is not None:
             # This is recommended for production approach.
From e7260ba32d4d3dc1ab7a4e8d23fa302efdc8b18e Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 10 Oct 2024 19:13:01 +0100
Subject: [PATCH 015/105] added folder_transform

---
 .../runtime/pure_python/transform_orchestrator.py             | 2 +-
 .../python/src/data_processing/transform/folder_transform.py  | 4 ++--
 .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +-
 .../runtime/spark/transform_orchestrator.py                   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py
b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 153eaaf0a..d51f80a8a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. - :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c534b685f..4a0897952 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From 5856f3f54137ae225b8cbdf07add9eaf20ed38b2 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 016/105] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- .../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git 
a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def _publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, 
data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. - :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From 6519686320fb2e76d03d9079b2b59b24be42b6cd Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 08:48:00 +0100 Subject: [PATCH 017/105] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 
deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 4a0897952..096fab272 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,8 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index 7b968b1e9..7410d09d1 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From c728224a5e3396ebe5d71dddb1b23a7a4b64ae7c Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 15:35:00 +0100 Subject: [PATCH 018/105] added noop testing --- .../runtime/transform_file_processor.py | 44 +++++--- .../test_support/transform/__init__.py | 13 ++- .../transform/noop_folder_transform.py | 105 ++++++++++++++++++ .../test_support/transform/noop_transform.py | 6 +- .../transform/folder_transform.py | 2 +- .../transform/transform_configuration.py | 6 +- .../transform/test_folders_noop.py | 33 ++++++ .../launch/ray/ray_test_noop_launch.py | 6 - .../ededup/ray/src/ededup_transform_ray.py | 9 +- 9 files changed, 187 insertions(+), 37 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index 1d268875f..4075f40be 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -83,6 +83,7 @@ def process_file(self, f_name: str) -> None: self.last_extension = name_extension[1] else: out_files, stats = self.transform.transform(folder_name=f_name) + self.last_file_name = f_name self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) @@ 
-148,15 +149,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
                 )
         case 1:
             # we have exactly 1 output file
-            file_ext = out_files[0]
-            lfn = self.last_file_name
-            if self.last_file_name_next_index is not None:
-                lfn = f"{lfn}_{self.last_file_name_next_index}"
-            output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
+            if self.is_folder:
+                # it's a folder
+                output_name = out_files[0][1]
+                dt = out_files[0][0]
+            else:
+                file_ext = out_files[0]
+                lfn = self.last_file_name
+                if self.last_file_name_next_index is not None:
+                    lfn = f"{lfn}_{self.last_file_name_next_index}"
+                output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
+                dt = file_ext[0]
             self.logger.debug(
                 f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
             )
-            save_res, retries = self.data_access.save_file(path=output_name, data=file_ext[0])
+            save_res, retries = self.data_access.save_file(path=output_name, data=dt)
             if retries > 0:
                 self._publish_stats({"data access retries": retries})
             if save_res is None:
@@ -166,7 +173,7 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
             self._publish_stats(
                 {
                     "result_files": 1,
-                    "result_size": len(file_ext[0]),
+                    "result_size": len(dt),
                     "processing_time": time.time() - t_start,
                 }
             )
@@ -183,14 +190,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
             start_index = 0
             count = len(out_files)
             for index in range(count):
-                file_ext = out_files[index]
-                output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
-                file_sizes += len(file_ext[0])
-                self.logger.debug(
-                    f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
-                    f"of {count} to {output_name_indexed}"
-                )
-                save_res, retries = self.data_access.save_file(path=output_name_indexed, data=file_ext[0])
+                if self.is_folder:
+                    # it's a folder
+                    output_name_indexed = out_files[index][1]
+                    dt = out_files[index][0]
+                else:
+                    # files
+                    file_ext = out_files[index]
+                    output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
+                    self.logger.debug(
+                        f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
+                        f"of {count} to {output_name_indexed}"
+                    )
+                    dt = file_ext[0]
+                file_sizes += len(dt)
+                save_res, retries = self.data_access.save_file(path=output_name_indexed, data=dt)
                 if retries > 0:
                     self._publish_stats({"data access retries": retries})
                 if save_res is None:
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
index 0e90f7ffd..04d6f3b0f 100644
--- a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
@@ -1,6 +1,11 @@
-from .table_transform_test import AbstractTableTransformTest
-from .binary_transform_test import AbstractBinaryTransformTest
-from .noop_transform import (
+from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest
+from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
+from data_processing.test_support.transform.noop_transform import (
     NOOPTransform,
-    NOOPPythonTransformConfiguration,
+    NOOPTransformConfiguration,
+    NOOPPythonTransformConfiguration
 )
+from data_processing.test_support.transform.noop_folder_transform import (
+    NOOPFolderTransform,
+    NOOPFolderPythonTransformConfiguration
+)
\ No newline at end of file
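The hunk above encodes the output contract that separates the two transform flavors: a binary transform returns (bytes, extension) pairs and lets the processor derive the output path from the input file name, while a folder transform returns (bytes, full output path) pairs and names its outputs itself. A minimal sketch of a conforming folder transform, assuming only the AbstractFolderTransform interface and the DataAccess calls used by the NOOP folder transform below; the ChecksumFolderTransform name and its sha256 logic are illustrative, not part of the library:

from typing import Any
import hashlib

from data_processing.transform import AbstractFolderTransform


class ChecksumFolderTransform(AbstractFolderTransform):
    """Illustrative folder transform: emits one checksum file per input folder."""

    def __init__(self, config: dict[str, Any]):
        super().__init__(config)
        # the file processor is expected to inject a DataAccess instance into the config
        self.data_access = config.get("data_access")

    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
        # read every file in the folder and fold its bytes into a single digest
        files, retries = self.data_access.get_folder_files(path=folder_name)
        digest = hashlib.sha256()
        for name in sorted(files.keys()):
            digest.update(files[name])
        # note: the second tuple element is a complete output location, not an extension
        out_path = self.data_access.get_output_location(f"{folder_name}/checksum.txt")
        return [(digest.hexdigest().encode(), out_path)], {"folders": 1, "files": len(files)}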
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py
new file mode 100644
index 000000000..5baab7858
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py
@@ -0,0 +1,105 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import time
+from typing import Any
+
+from data_processing.data_access import DataAccess
+from data_processing.runtime.pure_python import (
+    PythonTransformLauncher,
+    PythonTransformRuntimeConfiguration,
+    DefaultPythonTransformRuntime)
+from data_processing.transform import AbstractFolderTransform
+from data_processing.utils import get_logger
+from data_processing.test_support.transform import NOOPTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class NOOPFolderTransform(AbstractFolderTransform):
+    """
+    Implements a simple copy of the files in a folder.
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments defined
+        by the companion runtime, NOOPTransformRuntime.  If running inside the RayMutatingDriver,
+        these will be provided by that class with help from the RayMutatingDriver.
+        """
+        # Make sure that the param name corresponds to the name used in apply_input_params method
+        # of NOOPTransformConfiguration class
+        super().__init__(config)
+        self.sleep = config.get("sleep_sec", 1)
+        self.data_access = config.get("data_access")
+
+    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        Converts input folder into 0 or more output files.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed.
+        :param folder_name: the name of the folder containing an arbitrary number of files.
+        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
+                 to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
+                 holding the file name to use.
+        """
+        logger.debug(f"Transforming one folder {folder_name}")
+        metadata = {}
+        # get folder files
+        files, retries = self.data_access.get_folder_files(path=folder_name)
+        if retries > 0:
+            metadata |= {"data access retries": retries}
+        result = [()] * len(files)
+        index = 0
+        for name, file in files.items():
+            result[index] = (file, self.data_access.get_output_location(name))
+            if self.sleep is not None:
+                logger.info(f"Sleep for {self.sleep} seconds")
+                time.sleep(self.sleep)
+                logger.info("Sleep completed - continue")
+            index += 1
+        # Add some sample metadata.
+ metadata |= {"nfiles": len(files)} + return result, metadata + + +class NOOPFolderPythonRuntime(DefaultPythonTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py index 0dee013a4..2fea35506 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python.runtime_configuration import ( PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform from data_processing.utils import CLIArgumentProvider, get_logger @@ -75,10 +75,10 @@ class NOOPTransformConfiguration(TransformConfiguration): configuration with CLI args. """ - def __init__(self): + def __init__(self, clazz: type[AbstractTransform] = NOOPTransform): super().__init__( name=short_name, - transform_class=NOOPTransform, + transform_class=clazz, remove_from_metadata=[pwd_key], ) diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 9a2fb3713..caa3bfa52 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -35,6 +35,6 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str :param folder_name: the name of the folder containing arbitrary amount of files. :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated to metadata. Each element of the return list, is a tuple of the transformed bytes and a string - holding the extension to be used when writing out the new bytes. + holding the file name to use. 
""" raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py index 033e92f2a..a5c9ec9ad 100644 --- a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py @@ -13,7 +13,7 @@ from argparse import ArgumentParser from typing import Any -from data_processing.transform import AbstractBinaryTransform +from data_processing.transform import AbstractTransform from data_processing.utils import CLIArgumentProvider @@ -23,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider): """ def __init__( - self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = [] + self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = [] ): """ Initialization @@ -36,7 +36,7 @@ def __init__( self.remove_from_metadata = remove_from_metadata self.params = {} - def get_transform_class(self) -> type[AbstractBinaryTransform]: + def get_transform_class(self) -> type[AbstractTransform]: """ Get the class extending AbstractBinaryTransform which implements a specific transformation. The class will generally be instantiated with a dictionary of configuration produced by diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py new file mode 100644 index 000000000..e0fdd86c8 --- /dev/null +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.transform import NOOPFolderPythonTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../test-data/data_processing/python/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")] + return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py index d4cc874f0..e706a4dfa 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py @@ -12,7 +12,6 @@ import os -import pyarrow as pa from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -20,11 +19,6 @@ from data_processing_ray.test_support.transform import NOOPRayTransformConfiguration -table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) -expected_table = table # We're a noop after all. -expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result - - class TestRayNOOPTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. diff --git a/transforms/universal/ededup/ray/src/ededup_transform_ray.py b/transforms/universal/ededup/ray/src/ededup_transform_ray.py index c0823a22e..d90dfa780 100644 --- a/transforms/universal/ededup/ray/src/ededup_transform_ray.py +++ b/transforms/universal/ededup/ray/src/ededup_transform_ray.py @@ -149,13 +149,12 @@ def _load_snapshots(self, data_access_factory: DataAccessFactoryBase, statistics statistics.add_stats.remote({"data access retries": retries}) self.logger.info(f"Found the following snapshot files {files.keys()}") # process snapshot files - for file in files.keys(): - # load the file + for file in files.values(): + # convert the file try: - b_hashes, _ = data_access.get_file(file) - snaps = pickle.loads(b_hashes) + snaps = pickle.loads(file) except Exception as e: - self.logger.warning(f"Failed to load hashes from file {file} with exception {e}") + self.logger.warning(f"Failed to load hashes with exception {e}") raise UnrecoverableException("failed to load hashes") request = [[] for _ in range(len(self.filters))] for h in snaps: From 6e2863a319716c513aa5f1bafa00a363089d2685 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 08:53:49 +0100 Subject: [PATCH 019/105] added noop Ray testing --- .../runtime/ray/transform_orchestrator.py | 6 +- .../test_support/transform/__init__.py | 1 + .../transform/noop_folder_transform.py | 57 +++++++++++++++++++ .../test_support/transform/noop_transform.py | 4 +- .../launch/ray/ray_test_noop_folder_launch.py | 33 +++++++++++ 5 files changed, 95 insertions(+), 6 deletions(-) create mode 100644 data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index b29682997..da39cbcf7 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ 
b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -68,6 +68,9 @@ def orchestrate( if len(files) == 0: logger.error("No input files to process - exiting") return 0 + # log retries + if retries > 0: + statistics.add_stats.remote({"data access retries": retries}) logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) @@ -79,9 +82,6 @@ def orchestrate( logger.info( f"Number of workers - {preprocessing_params.n_workers} " f"with {preprocessing_params.worker_options} each" ) - # log retries - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) # create executors processor_params = { "data_access_factory": data_access_factory, diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py index a6cd700f7..dd095c961 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py @@ -1 +1,2 @@ from data_processing_ray.test_support.transform.noop_transform import NOOPRayTransformConfiguration +from data_processing_ray.test_support.transform.noop_folder_transform import NOOPFolderRayTransformConfiguration diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py new file mode 100644 index 000000000..9919600c4 --- /dev/null +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py @@ -0,0 +1,57 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + + +from data_processing.test_support.transform import NOOPTransformConfiguration +from data_processing.test_support.transform import NOOPFolderTransform +from data_processing.utils import get_logger +from data_processing_ray.runtime.ray import ( + RayTransformLauncher, + RayTransformRuntimeConfiguration, + DefaultRayTransformRuntime +) +from data_processing.data_access import DataAccess + + +logger = get_logger(__name__) + + +class NOOPFolderPythonRuntime(DefaultRayTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(NOOPFolderRayTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py index 67cf20253..a2082c48c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py @@ -11,9 +11,7 @@ ################################################################################ -from data_processing.test_support.transform.noop_transform import ( - NOOPTransformConfiguration, -) +from data_processing.test_support.transform import NOOPTransformConfiguration from data_processing.utils import get_logger from data_processing_ray.runtime.ray import ( RayTransformLauncher, diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py new file mode 100644 index 000000000..cd61c6745 --- /dev/null +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher +from data_processing_ray.test_support.transform import NOOPFolderRayTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../../test-data/data_processing/ray/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = RayTransformLauncher(NOOPFolderRayTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0, "run_locally": True}, basedir + "/input", basedir + "/expected")] + return fixtures From 3c9be57d656eee4fbda6b1d41849894249e167d8 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 09:07:48 +0100 Subject: [PATCH 020/105] added noop Spark testing --- .../transform/noop_folder_transform.py | 7 ++- .../test_support/transform/__init__.py | 1 + .../transform/noop_folder_transform.py | 53 +++++++++++++++++++ .../launch/spark/test_noop_folder_launch.py | 34 ++++++++++++ 4 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py index 9919600c4..1d084b58a 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py @@ -11,8 +11,7 @@ ################################################################################ -from data_processing.test_support.transform import NOOPTransformConfiguration -from data_processing.test_support.transform import NOOPFolderTransform +from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration from data_processing.utils import get_logger from data_processing_ray.runtime.ray import ( RayTransformLauncher, @@ -25,7 +24,7 @@ logger = get_logger(__name__) -class NOOPFolderPythonRuntime(DefaultRayTransformRuntime): +class NOOPFolderRayRuntime(DefaultRayTransformRuntime): def get_folders(self, data_access: DataAccess) -> list[str]: """ Get folders to process @@ -47,7 +46,7 @@ def __init__(self): Initialization """ super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), - runtime_class=NOOPFolderPythonRuntime) + runtime_class=NOOPFolderRayRuntime) if __name__ == "__main__": diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py index 83516f9ae..041cb43d6 100644 --- a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py +++ b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py @@ -11,3 +11,4 @@ ################################################################################ from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration +from data_processing_spark.test_support.transform.noop_folder_transform import NOOPFolderSparkTransformConfiguration diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py new file mode 100644 index 000000000..9972e0f79 --- /dev/null +++ 
b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py @@ -0,0 +1,53 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import SparkTransformLauncher +from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime +from data_processing.data_access import DataAccess + + +logger = get_logger(__name__) + + +class NOOPFolderSparkRuntime(DefaultSparkTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for NOOP as required by the SparkTransformLauncher. + The folder-specific runtime class above supplies the folder enumeration; the superclass + handles the rest of the configuration. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderSparkRuntime) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration()) + logger.info("Launching noop transform") + # Launch Spark to process the input + launcher.launch() diff --git a/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py b/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py new file mode 100644 index 000000000..c8e3ce40b --- /dev/null +++ b/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py @@ -0,0 +1,34 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher +from data_processing_spark.test_support.transform import NOOPFolderSparkTransformConfiguration + + +class TestSparkNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../../test-data" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + fixtures = [] + launcher = SparkTransformLauncher(NOOPFolderSparkTransformConfiguration()) + fixtures.append((launcher, {"noop_sleep_sec": 1}, basedir + "/input", basedir + "/expected")) + return fixtures From 371a7124c1570270fd692249dd2e601c4b3476c8 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 10:03:21 +0100 Subject: [PATCH 021/105] more data access simplifications --- .../src/data_processing/data_access/data_access.py | 5 ++++- .../data_processing/data_access/data_access_local.py | 11 ----------- .../src/data_processing/data_access/data_access_s3.py | 11 ----------- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access.py b/data-processing-lib/python/src/data_processing/data_access/data_access.py index bba5afd2b..51d7b54b8 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access.py @@ -358,7 +358,10 @@ def get_output_location(self, path: str) -> str: :param path: input file location :return: output file location """ - raise NotImplementedError("Subclasses should implement this!") + if self.get_output_folder() is None: + self.logger.error("Get output location: output folder is not provided, returning None") + return None + return path.replace(self.get_input_folder(), self.get_output_folder()) def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]: """ Saves a pyarrow table to a file and returns information about the operation. diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_local.py b/data-processing-lib/python/src/data_processing/data_access/data_access_local.py index 224e30ce8..d37e571a3 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_local.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_local.py @@ -130,17 +130,6 @@ def get_table(self, path: str) -> tuple[pa.table, int]: logger.error(f"Error reading table from {path}: {e}") return None, 0 - def get_output_location(self, path: str) -> str: - """ - Get output location based on input - :param path: input file location - :return: output file location - """ - if self.output_folder is None: - logger.error("Get output location. local configuration is not defined, returning None") - return None - return path.replace(self.input_folder, self.output_folder) - def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]: """ Saves a pyarrow table to a file and returns information about the operation.
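The consolidated base-class get_output_location above is a plain prefix substitution; a standalone sketch of the behavior (folder values are hypothetical):

def get_output_location(path: str, input_folder: str, output_folder: str | None) -> str | None:
    # Mirrors the consolidated logic: None when no output folder is configured,
    # otherwise rewrite the input-folder prefix to the output-folder prefix.
    # Note that str.replace substitutes every occurrence, so this assumes the
    # input-folder string appears only once in the path.
    if output_folder is None:
        return None
    return path.replace(input_folder, output_folder)


assert get_output_location("in/a/b.parquet", "in/", "out/") == "out/a/b.parquet"
assert get_output_location("in/a/b.parquet", "in/", None) is None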
diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py b/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py index 43e13bcb1..8ddc772c5 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py @@ -126,17 +126,6 @@ def get_table(self, path: str) -> tuple[pyarrow.table, int]: self.logger.error(f"Exception reading table {path} from S3 - {e}") return None, 0 - def get_output_location(self, path: str) -> str: - """ - Get output location based on input - :param path: input file location - :return: output file location - """ - if self.output_folder is None: - self.logger.error("Get out put location. S3 configuration is not provided, returning None") - return None - return path.replace(self.input_folder, self.output_folder) - def save_table(self, path: str, table: pyarrow.Table) -> tuple[int, dict[str, Any], int]: """ Save table to a given location From 680f3138d1e183a814f6c9230ab1eee33ad759c0 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:40:59 -0400 Subject: [PATCH 022/105] Renamed/refactored fuzzy dedup python orchestrator Signed-off-by: Constantin M Adam --- .../fdedup/python/src/fuzzy_dedup_python.py | 180 ++++++++++++ .../fdedup/python/src/service_orchestrator.py | 265 ------------------ 2 files changed, 180 insertions(+), 265 deletions(-) create mode 100644 transforms/universal/fdedup/python/src/fuzzy_dedup_python.py delete mode 100644 transforms/universal/fdedup/python/src/service_orchestrator.py diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py new file mode 100644 index 000000000..ca64f336f --- /dev/null +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -0,0 +1,180 @@ +import argparse +import os +import sys + +import cluster_analysis_transform +import data_cleaning_transform +import get_duplicate_list_transform +import signature_calc_transform +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils, get_logger +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +SERVICE_DICT = { + "SignatureCalculation": "minhash", + "ClusterAnalysis": "cluster", + "GetDuplicateList": "fdlist", + "DataCleaning": "fdclean", +} + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +ARGS_MAP = { + "minhash": [ + signature_calc_transform.contents_column_key, + signature_calc_transform.document_id_column_key, + signature_calc_transform.seed_key, + signature_calc_transform.num_permutations_key, + signature_calc_transform.num_bands_key, + signature_calc_transform.num_minhashes_per_band_key, + signature_calc_transform.jaccard_similarity_threshold_key, + signature_calc_transform.word_shingle_size_key, + signature_calc_transform.num_segments_key, + ], + "cluster": [ + cluster_analysis_transform.jaccard_similarity_threshold_key, + cluster_analysis_transform.num_bands_key, + 
cluster_analysis_transform.num_segments_key, + ], + "fdlist": [ + get_duplicate_list_transform.subfolder_key, + get_duplicate_list_transform.consolidated_filename_key, + ], + "fdclean": [ + data_cleaning_transform.document_id_column_key, + data_cleaning_transform.duplicate_list_location_key, + ], +} + + +class ServiceOrchestrator: + def __init__(self, global_params: argparse.Namespace = None): + self.global_params = global_params + self.logger = get_logger(__name__) + + def orchestrate(self): + service_list = self.global_params.services.split(",") + for service in service_list: + self.logger.info(f"Starting {service} step") + if service not in SERVICE_DICT: + err_msg = f"Unknown service {service} specified. Must be one of {SERVICE_DICT.keys()}" + self.logger.error(err_msg) + raise ValueError(err_msg) + service_short_name = SERVICE_DICT[service] + service_params = self.get_arguments(self.global_params, service_short_name) + self.logger.info(f"Got parameters for {service}") + status = self.execute_service(service_short_name, service_params) + if status == 0: + self.logger.info(f"{service} completed successfully") + else: + self.logger.error(f"{service} failed with status {status}, aborting ...") + break + + def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: + sys_argv = ["python"] + in_args_dict = vars(in_args) + all_module_arguments = ARGS_MAP.get(service_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + for k, v in passed_args.items(): + sys_argv.append(f"--{service_name}_{k}") + sys_argv.append(str(v)) + if service_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif service_name == "fdlist": + input_folder = in_args_dict["output_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "fdclean": + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + else: + self.logger.error(f"Unknown service name: {service_name}") + data_io = { + "input_folder": input_folder, + "output_folder": output_folder, + } + if in_args.use_s3: + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + sys_argv.append("--data_s3_config") + else: + sys_argv.append("--data_local_config") + sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + return sys_argv + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + elif service_short_name == "cluster": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + status = launcher.launch() + return status + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Service
Orchestrator") + + # Define command line arguments + parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") + parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") + + parser.add_argument( + "--contents_column", type=str, default="text", help="Name of the column that holds document text" + ) + parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") + parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") + parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") + parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") + + # Single argument for service execution + parser.add_argument( + "--services", + type=str, + required=False, + default="SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning", + help="Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning)", + ) + + parser.add_argument( + "--use_s3", + action="store_true", + help="use s3", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = ServiceOrchestrator(global_params=args) + # Launch python fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/python/src/service_orchestrator.py b/transforms/universal/fdedup/python/src/service_orchestrator.py deleted file mode 100644 index 897a3210c..000000000 --- a/transforms/universal/fdedup/python/src/service_orchestrator.py +++ /dev/null @@ -1,265 +0,0 @@ -import argparse -import os -import sys - -from cluster_analysis_transform_python import ( - ClusterAnalysisPythonTransformConfiguration, -) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils -from file_copy_util import FileCopyUtil -from signature_calc_transform_python import ( - SignatureCalculationPythonTransformConfiguration, -) - - -class ServiceOrchestrator: - def __init__(self, global_params=None): - self.global_params = global_params or {} - - def execute_service(self, service_logic, service_params): - # Call the generic service logic - service_logic(service_params) - - def orchestrate(self, service_logic): - service_list = self.global_params["services"].split(",") - - for service in service_list: - if service == "SignatureCalculation": - params = create_transform_args_payload(args, service) - params["service_type"] = "SignatureCalculation" - self.execute_service(service_logic, params) - elif service == "ClusterAnalysis": - params = create_transform_args_payload(args, service) - params["service_type"] = "ClusterAnalysis" - self.execute_service(service_logic, params) - elif service == "DataCleaning": - params = create_transform_args_payload(args, service) - params["service_type"] = "DataCleaning" - self.execute_service(service_logic, params) - elif service == "BandsFileCopy": - params = args - params["service_type"] = "BandsFileCopy" - self.execute_service(service_logic, params) - elif service == "DocsToRemoveFileCopy": - params = args - params["service_type"] = "DocsToRemoveFileCopy" - self.execute_service(service_logic, params) - else: - print(f"Warning: {service} 
is not a recognized service.") - - -def generic_service_logic(params): - print("Service executed with parameters:", params) - service_type = params["service_type"] - use_s3 = params["use_s3"] - # Remove the 'service_type' key - params.pop("service_type", None) # Using pop() method - - if service_type == "SignatureCalculation" or service_type == "ClusterAnalysis" or service_type == "DataCleaning": - # Set the simulated command line args - params.pop("num_permutations", None) # Using pop() method - params.pop("num_bands", None) # Using pop() method - params.pop("num_segments", None) # Using pop() method - params.pop("use_s3", None) # Using pop() method - # Set the simulated command line args - sys.argv = ParamsUtils.dict_to_req(d=params) - if use_s3: - sys.argv.append("--data_s3_cred") - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) - - if service_type == "SignatureCalculation": - runtime_config = SignatureCalculationPythonTransformConfiguration() - launch_transform_service(runtime_config) - elif service_type == "ClusterAnalysis": - runtime_config = ClusterAnalysisPythonTransformConfiguration() - launch_transform_service(runtime_config) - elif service_type == "DataCleaning": - runtime_config = DataCleaningPythonTransformConfiguration() - launch_transform_service(runtime_config) - elif service_type == "BandsFileCopy": - launch_file_copy_service(params, service_type) - elif service_type == "DocsToRemoveFileCopy": - launch_file_copy_service(params, service_type) - - -def launch_transform_service(params): - # create launcher - launcher = PythonTransformLauncher(runtime_config=params) - # Launch the ray actor(s) to process the input - launcher.launch() - - -def launch_file_copy_service(args, service_type): - root_folder = os.path.join(args["root_folder"], args["output_folder"]) - data_type = None - if service_type == "BandsFileCopy": - data_type = "bands" - # Get files to process - files = [ - f"band={band}/segment={segment}" - for band in range(args["num_bands"]) - for segment in range(args["num_segments"]) - ] - elif service_type == "DocsToRemoveFileCopy": - files = ["docs_to_remove"] - data_type = "docs_to_remove" - config = {"root_folder": root_folder} - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - - if args["use_s3"]: - - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - - # Construct folders - local_config = { - "input_folder": root_folder, - "output_folder": os.path.abspath(os.path.join(args["root_folder"], args["output_folder"])), - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - stats = {} - fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) - for file in files: - fcu.copy_data(file, data_type) - - -def create_transform_args_payload(args, service): - print(args) - # Construct folders - input_folder = os.path.join(args["root_folder"], args["input_folder"]) - output_folder = os.path.join(args["root_folder"], args["output_folder"]) - if service == "ClusterAnalysis": - input_folder = 
os.path.join(args["root_folder"], args["output_folder"], "bands_consolidated") - output_folder = os.path.join(args["root_folder"], args["output_folder"], "docs_to_remove") - elif service == "DataCleaning": - output_folder = os.path.join(args["root_folder"], args["output_folder"], "cleaned") - duplicate_location = os.path.join( - args["root_folder"], - args["output_folder"], - "docs_to_remove_consolidated", - "docs_to_remove_consolidated.parquet", - ) - - # Create a local configuration - local_conf = {"input_folder": input_folder, "output_folder": output_folder} - - # Create parameters - params = { - "num_permutations": args["num_permutations"], - "num_bands": args["num_bands"], - "num_segments": args["num_segments"], - "use_s3": args["use_s3"], - } - - if args["use_s3"]: - params["data_s3_config"] = ParamsUtils.convert_to_ast(local_conf) - else: - params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) - - # add extra - if service == "DataCleaning": - short_name = "fdclean" - cli_prefix = f"{short_name}_" - - # configuration keys - document_id_column_key = "document_id_column" - """ This key holds the name of the column storing the unique ID assigned to each document""" - duplicate_list_location_key = "duplicate_list_location" - """ This key holds the location of the list of duplicate documents marked for removal""" - - # command line arguments - document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" - """ Name of the column storing the unique ID assigned to each document""" - duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" - """ Location of the list of duplicate documents marked for removal""" - - params[document_id_column_cli_param] = "int_id_column" - params[duplicate_list_location_cli_param] = duplicate_location - - return params - - -def create_file_copy_args_payload(args): - daf_args = [] - local_config = { - "input_folder": args.root_folder, - "output_folder": args.root_folder, - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - return data_access_factory - - -def parse_args(): - parser = argparse.ArgumentParser(description="Service Orchestrator") - - # Define command line arguments - parser.add_argument("--root_folder", type=str, required=True, help="Root folder path") - parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") - parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") - - parser.add_argument( - "--contents_column", type=str, default="text", help="Name of the column that holds document text" - ) - parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") - parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") - parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") - parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") - - # Single argument for service execution - parser.add_argument( - "--services", - type=str, - required=True, - help="Comma-separated list of services to run (e.g., 
SignatureCalculation,BandsFileCopy,ClusterAnalysis,DocsToRemoveFileCopy,DataCleaning)", - ) - - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - - args = parser.parse_args() - return vars(args) # Convert Namespace to dictionary - - -if __name__ == "__main__": - - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - - # Parse command line arguments - args = parse_args() - - # Initialize the orchestrator - orchestrator = ServiceOrchestrator(global_params=args) - - # Example service execution (if you had defined services) - orchestrator.orchestrate(generic_service_logic) From c29d3bf78eb24045e7f6d3f110a8323432636290 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:45:50 -0400 Subject: [PATCH 023/105] Rewrote cluster_analysis_transform as a folder_transform Signed-off-by: Constantin M Adam --- .../src/cluster_analysis_local_python.py | 11 +- .../python/src/cluster_analysis_transform.py | 180 +++++++++++++----- .../src/cluster_analysis_transform_python.py | 49 ++++- 3 files changed, 183 insertions(+), 57 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py index dcfc9a7e4..7c162b1b1 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -21,7 +21,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, @@ -35,12 +35,15 @@ "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.0, } if __name__ == "__main__": # Set the simulated command line args - # sys.argv = ParamsUtils.dict_to_req(d=params) - # print(sys.argv) + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) - # Launch the ray actor(s) to process the input + # Launch python to process the input launcher.launch() diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 5ad18362a..221b50512 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -9,15 +9,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ +import io import os +import re from argparse import ArgumentParser, Namespace from typing import Any, List, Tuple import numpy as np import polars as pl import pyarrow as pa -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger from Murmur_MH import Murmur_MH @@ -25,23 +27,37 @@ cli_prefix = f"{short_name}_" # configuration keys +num_bands_key = "num_bands" +""" This key holds the number of bands used in the banding technique""" +num_segments_key = "num_segments" +""" This key holds the number of segments dividing the hashing space for each band""" jaccard_similarity_threshold_key = "jaccard_similarity_threshold" """ This key holds the Jaccard similarity threshold above which two documents are duplicates""" # command line arguments +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands used in the banding technique""" jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" """ Jaccard similarity threshold above which two documents are duplicates""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments dividing the hashing space for each band""" captured_arg_keys = [ + num_bands_key, + num_segments_key, jaccard_similarity_threshold_key, ] # defaults -jaccard_similarity_threshold_default = 0.8 -""" Default Jaccard similarity threshold above which two documents are duplicates""" +num_bands_default = 14 +""" Default number of bands used in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments dividing the hashing space for each band""" -class ClusterAnalysisTransform(AbstractTableTransform): +class ClusterAnalysisTransform(AbstractFolderTransform): """ This is the second transform of the fuzzy dedup pipeline. It runs in parallel: for each band, the hashing interval is divided into segments. A cluster analysis @@ -65,7 +81,9 @@ class ClusterAnalysisTransform(AbstractTableTransform): duplicates. The resulting clusters are saved in a file for further analysis. Args: + num_bands: number of bands used in the banding technique jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + num_segments: the number of segments dividing the hashing space for each band """ def __init__(self, config: dict[str, Any]): @@ -75,58 +93,102 @@ def __init__(self, config: dict[str, Any]): defined by the companion runtime, ClusterAnalysisTransformRuntime. 
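As a worked example of the band/segment layout this transform consumes, the folders can be enumerated and parsed back exactly as the runtime and the regex in transform below do (a standalone sketch; POSIX-style path separators assumed, and the counts follow the defaults and the local driver above):

import os
import re

num_bands, num_segments = 14, 2
folders = [
    os.path.join(f"band={b}", f"segment={s}")
    for b in range(num_bands)
    for s in range(num_segments)
]
# 28 folders in total; each one parses back into its band/segment pair
match = re.match(r"^band=(\d+)/segment=(\d+)$", folders[3])
assert match and (int(match.group(1)), int(match.group(2))) == (1, 1)
print(len(folders), folders[0], folders[-1])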
""" super().__init__(config) + self.num_bands = config.get(num_bands_key, num_bands_default) + self.num_segments = config.get(num_segments_key, num_segments_default) self.jaccard_similarity_threshold = config.get( jaccard_similarity_threshold_key, jaccard_similarity_threshold_default ) + self.data_access = config.get("data_access") self.logger = get_logger(__name__) - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - bands_dataframe = pl.from_arrow(table) - docs2remove_list = [] - # clustering - bands_dataframe_groups = bands_dataframe.group_by("band_hash").agg("document_data") - bands_dataframe_cluster = bands_dataframe_groups.with_columns( - cluster_length=pl.col("document_data").list.len() - ).filter(pl.col("cluster_length") > 1) - self.logger.info(f"file_name = {file_name}") - num_clusters = len(bands_dataframe_cluster) - if num_clusters > 0: - sum_cdocs = bands_dataframe_cluster.select(pl.sum("cluster_length")).item() - max_cdocs = bands_dataframe_cluster.select(pl.max("cluster_length")).item() - min_cdocs = bands_dataframe_cluster.select(pl.min("cluster_length")).item() - avg_cdocs = bands_dataframe_cluster.select(pl.mean("cluster_length")).item() + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Cluster analysis for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) else: - sum_cdocs = 0 - max_cdocs = 0 - min_cdocs = 0 - avg_cdocs = 0 - self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") - self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") - bands_dataframe_response = self.process_bands(bands_dataframe_cluster) + raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") + + # consolidate into a single data frame band hashes computed by workers + band_segment_dataframe, consolidation_stats = self.consolidate_band_segment_files(files) + metadata |= consolidation_stats + # cluster grouping by band hashes + cluster_dataframe, cluster_stats = self.get_clusters(band_segment_dataframe) + metadata |= cluster_stats + # cluster analysis using jaccard similarity + jaccard_cluster_dataframe, jaccard_stats = self.analyze_clusters(cluster_dataframe) + metadata |= jaccard_stats + # Generate the docs_to_remove dataframe + docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") + output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) + self.logger.info(f"{len(docs_to_remove_dataframe)} documents marked to remove") + metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def 
consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + band_segment_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + band_segment_dataframe = band_segment_dataframe.vstack(df) - filtered_doc2remove_dataframe = bands_dataframe_response.filter(pl.col("docs_to_remove_length") > 0) - num_clusters = len(filtered_doc2remove_dataframe) + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": band_segment_dataframe.to_arrow().nbytes, + "consolidated_rows": len(band_segment_dataframe), + } + return band_segment_dataframe, consolidation_stats + + def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") + cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( + pl.col("cluster_length") > 1 + ) + num_clusters = len(cluster_dataframe) if num_clusters > 0: - sum_cdocs = filtered_doc2remove_dataframe.select(pl.sum("docs_to_remove_length")).item() - max_cdocs = filtered_doc2remove_dataframe.select(pl.max("docs_to_remove_length")).item() - min_cdocs = filtered_doc2remove_dataframe.select(pl.min("docs_to_remove_length")).item() - avg_cdocs = filtered_doc2remove_dataframe.select(pl.mean("docs_to_remove_length")).item() + sum_cdocs = cluster_dataframe.select(pl.sum("cluster_length")).item() + max_cdocs = cluster_dataframe.select(pl.max("cluster_length")).item() + min_cdocs = cluster_dataframe.select(pl.min("cluster_length")).item() + avg_cdocs = cluster_dataframe.select(pl.mean("cluster_length")).item() else: sum_cdocs = 0 max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + cluster_stats = { + "groupby_clusters": num_clusters, + "cluster_duplicate_docs": sum_cdocs, + } + return cluster_dataframe, cluster_stats - # Explode the 'docs_to_remove' column - doc2remove_exploded_dataframe = filtered_doc2remove_dataframe.explode("docs_to_remove") - table = doc2remove_exploded_dataframe.to_arrow() - self.logger.info(f"{len(doc2remove_exploded_dataframe)} documents marked to remove") - metadata = {"nrows": len(table)} - return [table], metadata - - def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: + def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: # Define the schema with specific data types schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} doc_ids_lists = [] @@ -137,7 +199,7 @@ def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: doc_ids_lists += doc_ids_list docs_to_remove_lists += docs_to_remove_list len_of_docs2remove_lists += len_of_docs2remove_list - processed_rows = pl.DataFrame( + jaccard_cluster_dataframe = pl.DataFrame( { "first_doc": doc_ids_lists, "docs_to_remove": docs_to_remove_lists, "docs_to_remove_length": len_of_docs2remove_lists,
}, schema=schema, ) - return processed_rows + filtered_jaccard_dataframe = jaccard_cluster_dataframe.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_jaccard_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_jaccard_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_jaccard_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_jaccard_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_jaccard_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + jaccard_stats = { + "jaccard_clusters": num_clusters, + "jaccard_duplicate_docs": sum_cdocs, + } + return filtered_jaccard_dataframe, jaccard_stats def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: # Process row and return a new list of Series or a new row @@ -216,6 +296,18 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=jaccard_similarity_threshold_default, help="Jaccard similarity threshold above which two documents are duplicates", ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="The number of bands used in the banding technique", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="The number of segments dividing the hashing space for each band", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py index 28d96f428..8ff6dbf2b 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -10,11 +10,19 @@ # limitations under the License. 
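For reference, the Jaccard measure that analyze_clusters applies to candidate duplicates can be sketched over minhash signatures treated as sets (a simplified standalone illustration; the transform's own per-row computation lives in jaccard_distance_calculation):

def jaccard_similarity(a: list[int], b: list[int]) -> float:
    # |A intersect B| / |A union B| over two minhash signatures treated as sets
    sa, sb = set(a), set(b)
    union = sa | sb
    return len(sa & sb) / len(union) if union else 0.0


# documents clearing the configured similarity threshold stay clustered;
# the rest of each cluster is listed in docs_to_remove
assert jaccard_similarity([1, 2, 3, 4], [2, 3, 4, 5]) == 0.6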
################################################################################ +import os import time +from typing import Any -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.runtime.pure_python.runtime_configuration import ( +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger @@ -23,11 +31,31 @@ logger = get_logger(__name__) +class ClusterAnalysisPythonRuntime(DefaultPythonTransformRuntime): + """ + Cluster analysis runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): """ - Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. - NOOP does not use a RayRuntime class so the superclass only needs the base - python-only configuration. + Implements the PythonTransformConfiguration for Fuzzy Dedup ClusterAnalysis + as required by the PythonTransformLauncher. """ def __init__(self): @@ -35,10 +63,13 @@ def __init__(self): Initialization :param base_configuration - base configuration class """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisPythonRuntime, + ) if __name__ == "__main__": - launcher = PythonTransformLauncher(ClusterAnalysisTransformConfiguration()) - logger.info("Launching noop transform") + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis python transform") launcher.launch() From aada59eccbf6b8df6e1c5b332fa19a21a99b125c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:48:21 -0400 Subject: [PATCH 024/105] Wrote get_duplicate_list_transform as a folder_transform Signed-off-by: Constantin M Adam --- .../src/get_duplicate_list_transform.py | 168 ++++++++++++++++++ .../get_duplicate_list_transform_python.py | 71 ++++++++ 2 files changed, 239 insertions(+) create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform.py create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py new file mode 100644 index 000000000..c7b4cbddf --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py @@ -0,0 +1,168 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any + +import polars as pl +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger + + +short_name = "fdlist" +cli_prefix = f"{short_name}_" + +# configuration keys +subfolder_key = "docs_to_remove" +""" This key holds the name of the subfolder with the duplicate records""" +consolidated_filename_key = "consolidated_filename" +""" This key holds the name of the file with the consolidated list of duplicates""" + +# command line arguments +subfolder_cli_param = f"{cli_prefix}{subfolder_key}" +""" The name of the subfolder with the duplicate records""" +consolidated_filename_cli_param = f"{cli_prefix}{consolidated_filename_key}" +""" The name of the file with the consolidated list of duplicates""" + +captured_arg_keys = [ + subfolder_key, + consolidated_filename_key, +] + +# defaults +subfolder_default = "docs_to_remove" +""" Default name of the subfolder with the duplicate records""" +consolidated_filename_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default name of the file with the consolidated list of duplicates""" + + +class GetDuplicateListTransform(AbstractFolderTransform): + """ + This is an intermediate step of the fuzzy dedup pipeline. It runs in a single + location and consolidates in a single file all the duplicates found for each + band segment. + Args: + subfolder: name of the subfolder with the duplicate records + consolidated_filename: name of the file with the consolidated list of duplicates + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, GetDuplicateListPythonRuntime.
+ """ + super().__init__(config) + self.subfolder = config.get(subfolder_key, subfolder_default) + self.consolidated_filename = config.get(consolidated_filename_key, consolidated_filename_default) + self.data_access = config.get("data_access") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Get Duplicate List for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, self.consolidated_filename) + + # consolidate into a single data frame band hashes computed by workers + consolidated_dataframe, consolidation_stats = self.consolidate_docs_to_remove_files(files) + self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") + metadata |= consolidation_stats + output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + consolidated_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + consolidated_dataframe = consolidated_dataframe.vstack(df) + consolidated_dataframe = consolidated_dataframe.select("docs_to_remove").unique() + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, + "consolidated_rows": len(consolidated_dataframe), + } + return consolidated_dataframe, consolidation_stats + + +class GetDuplicateListTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=GetDuplicateListTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the GetDuplicateListTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{subfolder_cli_param}", + type=str, + default=subfolder_default, + help="The name of the subfolder with the duplicate records", + ) + parser.add_argument( + f"--{consolidated_filename_cli_param}", + type=str, + default=consolidated_filename_default, + help="The name of the file with the consolidated list of duplicates", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py new file mode 100644 index 000000000..703ef630e --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py @@ -0,0 +1,71 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import time +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListPythonRuntime(DefaultPythonTransformRuntime): + """ + Get duplicate list runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup GetDuplicateList + as required by the PythonTransformLauncher. 
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list python transform") + launcher.launch() From 2019d56565ea52c5474632a822e67ac7e66fdac8 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:50:13 -0400 Subject: [PATCH 025/105] Added text preprocessing Signed-off-by: Constantin M Adam --- .../python/src/signature_calc_local_python.py | 39 +++++---- .../python/src/signature_calc_transform.py | 81 +++++++------------ 2 files changed, 48 insertions(+), 72 deletions(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py index eb958ee3d..062580f22 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -20,31 +20,28 @@ ) -# # create parameters -# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -# output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output_second")) -# local_conf = { -# "input_folder": input_folder, -# "output_folder": output_folder -# } -# code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -# params = { -# # Data access. Only required parameters are specified -# "data_local_config": ParamsUtils.convert_to_ast(local_conf), -# # execution info -# "runtime_pipeline_id": "pipeline_id", -# "runtime_job_id": "job_id", -# "runtime_code_location": ParamsUtils.convert_to_ast(code_location), -# "minhash_num_permutations":112, -# "minhash_num_bands":14, -# "minhash_num_segments":2 -# } +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} if __name__ == "__main__": # Set the simulated command line args - # sys.argv = ParamsUtils.dict_to_req(d=params) - # print(sys.argv) + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) sys.argv.append("--data_s3_cred") s3_creds = { diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 7ac8eb057..7c4dd391c 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -10,6 +10,8 @@ # limitations under the License. 
################################################################################ import os +import re +import unicodedata from argparse import ArgumentParser, Namespace from pathlib import Path from typing import Any, List @@ -100,44 +102,16 @@ """ Default number of segments across which we divide the hashing space for each band""" -def _optimal_minhashlsh_param( - threshold: float = jaccard_similarity_threshold_default, - num_perm: int = num_permutations_default, - false_positive_weight: float = 0.5, - false_negative_weight: float = 0.5, -): - """ - Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum - of probabilities of false positive and false negative. - :param threshold: desired similarity threshold - :param num_perm: number of permutations - :param false_positive_weight: importance of avoiding false positive results - :param false_negative_weight: importance of avoiding false negative results - :return: a tuple (optimal number of bands, optimal number of rows) - """ - - def _false_positive_probability(threshold, b, r): - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, threshold) - return a - - def _false_negative_probability(threshold, b, r): - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, threshold, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for b in range(1, num_perm + 1): - max_r = int(num_perm / b) - for r in range(1, max_r + 1): - fp = _false_positive_probability(threshold, b, r) - fn = _false_negative_probability(threshold, b, r) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (b, r) - return opt +NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") +WHITESPACE_PATTERN = re.compile(r"\s+") +PUNCTUATION = "!/—”:%1〈&(、━\\【#%「」,】;+^]~“《„';’{|∶´[=-`*.(–?!:$~«〉,><》)?)。…@_.\"}►»" + "".join( + map( + chr, + (x for a, b in ((0, 9), (11, 13), (13, 32), (127, 160)) for x in range(a, b)), + ) +) +PUNCTUATION_SET = set(PUNCTUATION) +PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) class SignatureCalculationTransform(AbstractTableTransform): @@ -184,13 +158,6 @@ def __init__(self, config: dict[str, Any]): self.num_segments = config.get(num_segments_key, num_segments_default) self.num_bands = config.get(num_bands_key, num_bands_default) self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) - # Calculate optimal parameters for bands calculation - # self.num_bands, self.num_rows = _optimal_minhashlsh_param( - # threshold=self.jaccard_similarity_threshold, - # num_perm=self.num_permutations, - # false_positive_weight=0.5, - # false_negative_weight=0.5, - # ) # use this dataframe to store the minhashes and size for each document self.all_minhashes: pl.DataFrame = None # use this dataframe to store the band hashes for each document @@ -224,8 +191,8 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab # generate minhash values minhashes = df.map_rows( - lambda text: mm_min_hash.minhash2_nosalt( - *self._generate_word_shingles(text, window_size=self.word_shingle_size) + lambda row: mm_min_hash.minhash2_nosalt( + *self._generate_word_shingles(row, window_size=self.word_shingle_size) ) ) # rename columns, cast minhashes to list(uint32) @@ -374,10 +341,22 @@ def write_band_signatures(self): return [], metadata # define shingles generation function - def _generate_word_shingles(self, text: str, window_size: 
int = 5, delimiter: str = " ") -> tuple[list, int, int]:
-        words = text[0].split()
-        document_id = text[1]
-        doc_len = len(text[0])
+    def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]:
+        text = row[0]
+        # lower case
+        text = text.lower()
+        # replace numbers with '0'
+        text = NUMBERS_PATTERN.sub("0", text)
+        # convert punctuation to spaces
+        text = text.translate(PUNCTUATION_TRANS)
+        # collapse consecutive spaces, newlines and tabs, and strip whitespace at both ends
+        text = WHITESPACE_PATTERN.sub(" ", text.strip())
+        # diacritics/unicode normalization
+        text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
+        text = text.strip()
+        words = text.split()
+        document_id = row[1]
+        doc_len = len(row[0])
         word_count = len(words)
         k_shingles = []
         for i in range(0, max(1, word_count - window_size + 1)):
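The preprocessing added above makes the shingles insensitive to case, digits, punctuation and diacritics before hashing. A small self-contained illustration of the same sequence of steps (the punctuation handling is simplified here; the transform uses the much larger PUNCTUATION translation table defined earlier):

    import re
    import unicodedata

    numbers = re.compile(r"\d+(\.\d+)?")
    whitespace = re.compile(r"\s+")

    def normalize(text: str) -> str:
        text = text.lower()
        text = numbers.sub("0", text)          # digits collapse to '0'
        text = re.sub(r"[^\w\s]", " ", text)   # simplified punctuation-to-space
        text = whitespace.sub(" ", text.strip())
        # NFD-decompose, then drop combining marks to strip diacritics
        text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
        return text.strip()

    words = normalize("Café #42, Déjà-vu!").split()  # -> ['cafe', '0', 'deja', 'vu']
    shingles = [" ".join(words[i : i + 2]) for i in range(max(1, len(words) - 1))]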
From 9362803f99fa422437031263474e97365d61d9f3 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 00:51:22 -0400
Subject: [PATCH 026/105] Added python test data

Signed-off-by: Constantin M Adam
---
 .../python/test-data/input/data_1/df1.parquet | Bin 0 -> 3093 bytes
 .../python/test-data/input/data_2/df2.parquet | Bin 0 -> 1397 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet

diff --git a/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..c9220bf39c8dd2127707be44ba210363df6aa1a3
GIT binary patch
literal 3093
(binary data omitted)

diff --git a/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..23fac4c726f017c26b7fe64153fe5e0b459d60b1
GIT binary patch
literal 1397
(binary data omitted)

Date: Mon, 14 Oct 2024 00:52:07 -0400
Subject: [PATCH 027/105] Added project admin tools

Signed-off-by: Constantin M Adam
---
 .../universal/fdedup/python/.dockerignore    |  1 +
 transforms/universal/fdedup/python/Makefile  | 64 +++++++++++++++++++
 transforms/universal/fdedup/transform.config |  5 +-
 3 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 transforms/universal/fdedup/python/.dockerignore
 create mode 100644 transforms/universal/fdedup/python/Makefile

diff --git a/transforms/universal/fdedup/python/.dockerignore b/transforms/universal/fdedup/python/.dockerignore
new file mode 100644
index 000000000..f7275bbbd
--- /dev/null
+++ b/transforms/universal/fdedup/python/.dockerignore
@@ -0,0 +1 @@
+venv/
diff --git a/transforms/universal/fdedup/python/Makefile b/transforms/universal/fdedup/python/Makefile
new file mode 100644
index 000000000..05f6bf5ca
--- /dev/null
+++ b/transforms/universal/fdedup/python/Makefile
@@ -0,0 +1,64 @@
+# Define the root of the local git clone for the common rules to be able
+# to know where they are running from.
+REPOROOT=../../../..
+
+# Set this, before including .make.defaults, to
+# 1 if requirements reference the latest code in the data processing library
+# in this repo (that is not yet published to pypi). This is the default setting.
+# 0 if the transforms DPK dependencies are on wheels published to
+# pypi (e.g. data-prep-toolkit=0.2.1)
+#USE_REPO_LIB_SRC=1
+
+# Include a library of common .transform.* targets which most
+# transforms should be able to reuse. However, feel free
+# to override/redefine the rules below.
+include $(REPOROOT)/transforms/.make.transforms
+
+# Include the common configuration for this transform
+include ../transform.config
+
+venv::	.transforms.python-venv
+
+test::	.transforms.python-test
+
+clean::	.transforms.clean
+
+image::	.transforms.python-image
+
+test-src::	.transforms.test-src
+
+setup::	.transforms.setup
+
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-python
+
+setup:: .transforms.setup
+
+# The distribution version is the same as the image version.
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions
+
+build-dist:: .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+test-image:: .transforms.python-test-image
+
+run-cli-sample: .transforms.run-cli-python-sample
+
+run-local-sample: .transforms.run-local-sample
+
+run-local-python-sample: .transforms.run-local-python-sample
+
+#run-s3-ray-sample: .transforms.run-s3-ray-sample
+
+minio-start: .minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/fdedup/transform.config b/transforms/universal/fdedup/transform.config
index 774716e15..ffaeb9f45 100644
--- a/transforms/universal/fdedup/transform.config
+++ b/transforms/universal/fdedup/transform.config
@@ -14,5 +14,6 @@ TRANSFORM_NAME=fdedup
 #
 # If you change the version numbers, be sure to run "make set-versions" to
 # update version numbers across the transform (e.g., pyproject.toml).
-FDEDUP_RAY_VERSION=$(DPK_VERSION)
-
+FDEDUP_PYTHON_VERSION=$(DPK_VERSION)
+FDEDUP_RAY_VERSION=$(FDEDUP_PYTHON_VERSION)
+FDEDUP_SPARK_VERSION=$(FDEDUP_PYTHON_VERSION)
From 4dac838b2d941117f40bce371574aec268d09206 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 02:40:11 -0400
Subject: [PATCH 028/105] Bug fix

Signed-off-by: Constantin M Adam
---
 .../universal/fdedup/python/src/cluster_analysis_transform.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
index 221b50512..2a5ec3e6b 100644
--- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
@@ -240,7 +240,7 @@ def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]:
         sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"]))
 
         # Extracting int_id_column values into a list
-        doc_list = list(set([item["int_id_column"] for item in sorted_document_data]))
+        doc_list = [item["int_id_column"] for item in sorted_document_data]
 
         # Creating a dictionary with int_id_column as key and minhashes as value
         doc_minhashes = {item["int_id_column"]: item["minhashes"] for item in sorted_document_data}
 
From fbc2b58e255edc758a9d4016d49dd57715c3db93 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 02:41:49 -0400
Subject: [PATCH 029/105] Add op modes for data cleaning: filter (non)dupl and
 annotate

Signed-off-by: Constantin M Adam
---
 .../python/src/data_cleaning_transform.py     | 38 +++++++++---
 .../src/data_cleaning_transform_python.py     |  5 +-
 .../fdedup/python/src/fuzzy_dedup_python.py   | 60 +++++++++++++++----
 3 files changed, 83 insertions(+), 20 deletions(-)

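The three cleanup modes introduced below map directly onto three polars join strategies against the consolidated duplicate list. A minimal sketch of the idea, detached from the transform plumbing (data and column name are illustrative):

    import polars as pl

    docs = pl.DataFrame({"int_id_column": [1, 2, 3, 4], "text": ["a", "b", "c", "d"]})
    dupes = pl.DataFrame({"int_id_column": [2, 4]})

    # filter_duplicates: keep only documents NOT on the duplicate list
    cleaned = docs.join(dupes, on="int_id_column", how="anti")
    # filter_non_duplicates: keep only the duplicates themselves
    duplicates = docs.join(dupes, on="int_id_column", how="inner")
    # annotate: keep everything, mark duplicates with "d"
    annotated = docs.join(
        dupes.with_columns(pl.lit("d").alias("duplicate")), on="int_id_column", how="left"
    ).with_columns(pl.col("duplicate").fill_null(""))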
diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py
index 05b18cc8b..8e17b757f 100644
--- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py
+++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py
@@ -29,12 +29,16 @@
 """ This key holds the name of the column storing the unique ID assigned to each document"""
 duplicate_list_location_key = "duplicate_list_location"
 """ This key holds the location of the list of duplicate documents marked for removal"""
+operation_mode_key = "operation_mode"
+""" This key holds the operation mode: 'filter_duplicates', 'filter_non_duplicates', or 'annotate'"""
 
 # command line arguments
 document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}"
 """ Name of the column storing the unique ID assigned to each document"""
 duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}"
 """ Location of the list of duplicate documents marked for removal"""
+operation_mode_cli_param = f"{cli_prefix}{operation_mode_key}"
+""" Operation mode, can be one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate'"""
 
 captured_arg_keys = [
     document_id_column_key,
@@ -44,8 +48,10 @@
 # defaults
 document_id_column_default = "int_id_column"
 """ Default name of the column storing the unique ID assigned to each document"""
-duplicate_list_location_default = None
+duplicate_list_location_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet")
 """ Default location of the list of duplicate documents marked for removal"""
+operation_mode_default = "filter_duplicates"
+""" Default value for operation mode, will filter out all the duplicate documents"""
 
 
 class DataCleaningTransform(AbstractTableTransform):
@@ -72,6 +78,7 @@ def __init__(self, config: dict[str, Any]):
         self.logger = get_logger(__name__)
         self.document_id_column = config.get(document_id_column_key, document_id_column_default)
         self.duplicate_list_location = config.get(duplicate_list_location_key, duplicate_list_location_default)
+        self.operation_mode = config.get(operation_mode_key, operation_mode_default)
         contents = config.get("df")
         self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents))
         self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows")
@@ -88,19 +95,27 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
         self.docs_to_remove_df = self.docs_to_remove_df.select(
             pl.col(self.document_id_column).cast(input_doc_id_type)
         )
-        filtered_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti")
-        filtered_table = filtered_df.to_arrow()
+        if self.operation_mode == "filter_duplicates":
+            result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti")
+        elif self.operation_mode == "filter_non_duplicates":
+            result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="inner")
+        else:  # self.operation_mode == "annotate"
+            duplicates_df = self.docs_to_remove_df.with_columns(pl.lit("d").alias("duplicate"))
+            result_df = input_df.join(duplicates_df, on=self.document_id_column, how="left").with_columns(
+                pl.col("duplicate").fill_null("")
+            )
+        result_table = result_df.to_arrow()
         metadata = {
             "input_files": 1,
             "input_docs": table.num_rows,
             "input_bytes": table.nbytes,
             "output_files": 1,
-            "output_docs": filtered_table.num_rows,
-            "output_bytes": filtered_table.nbytes,
-            "filtered_docs": (table.num_rows - 
filtered_table.num_rows), - "filtered_bytes": (table.nbytes - filtered_table.nbytes), + "output_docs": result_table.num_rows, + "output_bytes": result_table.nbytes, + "filtered_docs": (table.num_rows - result_table.num_rows), + "filtered_bytes": (table.nbytes - result_table.nbytes), } - return [filtered_table], metadata + return [result_table], metadata class DataCleaningTransformConfiguration(TransformConfiguration): @@ -133,10 +148,15 @@ def add_input_params(self, parser: ArgumentParser) -> None: parser.add_argument( f"--{duplicate_list_location_cli_param}", type=str, - required=True, default=duplicate_list_location_default, help="location of duplicate document list that are marked for removal", ) + parser.add_argument( + f"--{operation_mode_cli_param}", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + default=operation_mode_default, + help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index c0b5fefd6..e5c1e5025 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -10,6 +10,7 @@ # limitations under the License. ################################################################################ +import os from typing import Any from data_cleaning_transform import DataCleaningTransformConfiguration @@ -51,8 +52,10 @@ def get_transform_config( :param files - list of files to process :return: dictionary of transform init params """ - duplicate_list_location = self.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index ca64f336f..c05fe326e 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -57,6 +57,7 @@ "fdclean": [ data_cleaning_transform.document_id_column_key, data_cleaning_transform.duplicate_list_location_key, + data_cleaning_transform.operation_mode_key, ], } @@ -66,10 +67,6 @@ def __init__(self, global_params: argparse.Namespace = None): self.global_params = global_params self.logger = get_logger(__name__) - def execute_service(self, service_logic, service_params): - # Call the generic service logic - service_logic(service_params) - def orchestrate(self): service_list = self.global_params.services.split(",") for service in service_list: @@ -107,7 +104,14 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: output_folder = in_args_dict["output_folder"] elif service_name == "fdclean": input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + operation_mode = in_args_dict.get("operation_mode", "filter_duplicates") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == 
"filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + output_folder = os.path.join(in_args_dict["output_folder"], output_subfolder) else: self.logger.error(f"Unknown service name: {service_name}") data_io = { @@ -145,12 +149,48 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") parser.add_argument( - "--contents_column", type=str, default="text", help="Name of the column that holds document text" + "--operation_mode", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + required=False, + help="operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) + parser.add_argument( + "--contents_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument( + "--document_id_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument( + "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation" + ) + parser.add_argument( + "--num_bands", type=int, required=False, help="number of bands to use for band hash calculation" + ) + parser.add_argument( + "--num_minhashes_per_band", type=int, required=False, help="number of minhashes to use in each band" + ) + parser.add_argument( + "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" + ) + parser.add_argument( + "--jaccard_similarity_threshold", + type=float, + required=False, + help="jaccard similarity threshold above which two documents are similar", + ) + parser.add_argument( + "--num_segments", + type=int, + required=False, + help="the number of segments dividing the hashing space for each band (for scalability)", + ) + parser.add_argument( + "--duplicate_list_location", + type=str, + required=False, + help="path to the file with all the duplicate document ids", ) - parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") - parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") - parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") - parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") # Single argument for service execution parser.add_argument( From 828ec41b4a0727f008566a3ebf7a0c400ee5c5ac Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 08:07:06 -0400 Subject: [PATCH 030/105] Python and spark transforms for cluster analysis Signed-off-by: Constantin M Adam --- .../src/cluster_analysis_transform_python.py | 1 + .../src/cluster_analysis_transform_spark.py | 38 +++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py index 8ff6dbf2b..c35c5a711 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -72,4 +72,5 @@ def __init__(self): if __name__ == "__main__": launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) 
logger.info("Launching fuzzy dedup cluster analysis python transform") + # Launch python to process the input launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py index afb8c51b7..30f9dd317 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -10,9 +10,17 @@ # limitations under the License. ################################################################################ -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +import os + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess from data_processing.utils import get_logger from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, SparkTransformLauncher, SparkTransformRuntimeConfiguration, ) @@ -21,6 +29,27 @@ logger = get_logger(__name__) +class ClusterAnalysisSparkRuntime(DefaultSparkTransformRuntime): + """ + Cluster analysis runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params["num_bands"] + segments = self.params["num_segments"] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration): """ Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis @@ -31,12 +60,15 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisSparkRuntime, + ) if __name__ == "__main__": # create launcher launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) - logger.info("Launching fuzzy dedup signature calculation transform") + logger.info("Launching fuzzy dedup cluster analysis spark transform") # Launch the spark worker(s) to process the input launcher.launch() From bc6b81cd231a328f3fe32bfe26b0d40529d2ee57 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 11:00:28 -0400 Subject: [PATCH 031/105] Sync spark Makefile with dpk Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Makefile | 84 ++++++++++++---------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile index d30013da8..7eb132fbd 100644 --- a/transforms/universal/fdedup/spark/Makefile +++ b/transforms/universal/fdedup/spark/Makefile @@ -1,45 +1,57 @@ -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). 
This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free -# to override/redefine the rules below. +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -# This is included in the image name, if defined -TRANSFORM_NAME=fd-sig-calc - -DOCKER_IMAGE_NAME=pyspark-base -DOCKER_IMAGE_VERSION=latest -DOCKER_FILE=Dockerfile -REGISTRY_HOST=docker.io -REGISTRY_PATH= -DOCKER=docker -PYTHON=python - -venv: requirements.txt - @# Help: Create the virtual environment using requirements.txt - $(PYTHON) -m venv venv - @source venv/bin/activate; \ - pip install --upgrade pip; \ - pip install wheel; \ - pip install -r requirements.txt; +# Include the common configuration for this transform +include ../transform.config + +venv:: .transforms.spark-venv + +test:: .transforms.spark-test + +clean:: .transforms.clean image:: .transforms.spark-image -image-direct: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. - @# Help: Create the docker image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - $(DOCKER) build -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -f $(DOCKER_FILE) . - -publish-docker: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. - @# Help: Publish image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) to $(REGISTRY_HOST) container registry - $(DOCKER) logout $(REGISTRY_HOST) - $(DOCKER) login $(REGISTRY_HOST) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)' - $(DOCKER) push $(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - -publish-ibm: - ibmcloud login -q -u "$(IBM_CLOUD_USER)" -apikey "$(IBM_CLOUD_API_KEY)" - ibmcloud cr login --client docker - $(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - $(DOCKER) push $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - # ibmcloud cr image-list | grep $(DOCKER_IMAGE_NAME) +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-spark + +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +test-image:: .transforms.spark-test-image + +run-cli-sample: .transforms.run-cli-spark-sample + +run-local-sample: .transforms.run-local-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image From 4d486d35a36039783df84ce666ab03cd21c0cf59 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 11:01:59 -0400 Subject: [PATCH 032/105] Spark orchestration for fuzzy dedup Signed-off-by: Constantin M Adam --- .../src/cluster_analysis_transform_spark.py | 1 + .../src/data_cleaning_transform_spark.py | 9 +- .../fdedup/spark/src/fuzzy_dedup_spark.py | 207 +++--------------- 3 files changed, 34 insertions(+), 183 deletions(-) diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py index 30f9dd317..5522d67de 100644 --- 
a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -11,6 +11,7 @@ ################################################################################ import os +from typing import Any from cluster_analysis_transform import ( ClusterAnalysisTransformConfiguration, diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 03976bac8..29890d05f 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -10,6 +10,7 @@ # limitations under the License. ################################################################################ +import os from typing import Any from data_cleaning_transform import DataCleaningTransformConfiguration @@ -51,8 +52,10 @@ def get_transform_config( :param files - list of files to process :return: dictionary of transform init params """ - duplicate_list_location = self.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) @@ -86,8 +89,10 @@ def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[s :param data_access_factory - data access factory class being used by the RayOrchestrator. :return: dictionary of parameters to be broadcast """ - duplicate_list_location = self.transform_config.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.transform_config.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py index 6d0e090e4..5217f2f7b 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -1,28 +1,15 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - import argparse -import logging import os import sys -from typing import Union -import polars as pl from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration -from data_processing.utils import ParamsUtils +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing_spark.runtime.spark import SparkTransformLauncher -from file_copy_util import FileCopyUtil -from file_copy_util_spark import FileCopySpark +from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) from signature_calc_transform_spark import ( SignatureCalculationSparkTransformConfiguration, ) @@ -34,172 +21,30 @@ "url": os.getenv("AWS_ENDPOINT_URL"), } -args_map = { - "minhash": [ - "document_id_column", - "contents_column", - "seed", - "num_permutations", - "num_bands", - "num_minhashes_per_band", - "jaccard_similarity_threshold", - "word_shingle_size", - "num_segments", - ], - "copyutil": [ - "subfolder_name", - "data_type", - "num_bands", - "num_segments", - "parallelization", - "use_s3", - ], - "cluster": [ - "jaccard_similarity_threshold", - ], - "fdclean": [ - "document_id_column", - "duplicate_list_location", - ], -} - - -def get_arguments(in_args: argparse.Namespace, module_name: str) -> Union[list, dict]: - sys_argv = ["python"] - in_args_dict = vars(in_args) - if in_args.use_s3: - sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) - all_module_arguments = args_map.get(module_name, []) - passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} - if module_name == "copyutil": - copy_util_config = {k: v for k, v in passed_args.items()} - copy_util_config["root_folder"] = in_args_dict["output_folder"] - return copy_util_config - else: - for k, v in passed_args.items(): - sys_argv.append(f"--{module_name}_{k}") - sys_argv.append(str(v)) - if module_name == "minhash": - input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"]) - elif module_name == "cluster": - input_folder = os.path.join(in_args_dict["output_folder"], "bands_consolidated") - output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") - elif module_name == "fdclean": - if f"--{module_name}_duplicate_list_location" not in sys_argv: - sys_argv.append(f"--{module_name}_duplicate_list_location") - sys_argv.append( - os.path.join( - in_args_dict["output_folder"], - "docs_to_remove_consolidated", - "docs_to_remove_consolidated.parquet", - ) - ) - input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") - else: - logging.error(f"Unknown module name: {module_name}") - data_io = { - "input_folder": input_folder, - "output_folder": output_folder, - } - if in_args.use_s3: - sys_argv.append("--data_s3_config") - else: - sys_argv.append("--data_local_config") - sys_argv.append(ParamsUtils.convert_to_ast(data_io)) - return sys_argv +class SparkServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_folder", type=str, required=True, help="path to read 
the input files") - parser.add_argument("--output_folder", type=str, required=True, help="path to write the output files") - parser.add_argument( - "--use_s3", type=bool, required=False, default=False, help="if true, use S3, if false use local FS" - ) - parser.add_argument( - "--contents_column", type=str, required=False, help="name of the column that stores document text" - ) - parser.add_argument( - "--document_id_column", type=str, required=False, help="name of the column that stores document text" - ) - parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") - parser.add_argument( - "--num_permutations", type=int, required=True, help="number of permutations to use for minhash calculation" - ) - parser.add_argument( - "--num_bands", type=int, required=True, help="number of bands to use for band hash calculation" - ) - parser.add_argument( - "--num_minhashes_per_band", type=int, required=True, help="number of minhashes to use in each band" - ) - parser.add_argument( - "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" - ) - parser.add_argument( - "--jaccard_similarity_threshold", - type=float, - required=False, - help="jaccard similarity threshold above which two documents are similar", - ) - parser.add_argument( - "--num_segments", - type=int, - required=True, - help="number of segments to divide each band hash interval (to improve scalability)", - ) - parser.add_argument("--parallelization", type=int, required=False, default=-1, help="spark parallelization") - parser.add_argument( - "--duplicate_list_location", - type=str, - required=False, - help="path to the file with all the duplicate document ids", - ) - return parser.parse_args() + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + elif service_short_name == "cluster": + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + status = launcher.launch() + return status if __name__ == "__main__": - # configure logging - logging.basicConfig( - format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=logging.INFO, - ) - args = parse_arguments() - sys.argv = get_arguments(args, "minhash") - # create launcher - launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) - # Launch the spark worker(s) to process the input - status = launcher.launch() - logging.info(f"Signature calculation concluded with status {status}") - - fcs_config = get_arguments(args, "copyutil") - - root_folder = fcs_config["root_folder"] - parallelization = fcs_config["parallelization"] - fcs = FileCopySpark(root_folder, fcs_config["num_bands"], fcs_config["num_segments"], args.use_s3) - data_access_factory = fcs.create_data_access_factory(root_folder, args.use_s3) - app_config = {"root_folder": root_folder} - execution_config = {"parallelization": parallelization} if parallelization > 0 else {} - status = fcs.orchestrate(app_config, execution_config, data_access_factory, 
data_type="bands")
-    logging.info(f"Consolidate bands concluded with status {status}")
-
-    sys.argv = get_arguments(args, "cluster")
-    launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration())
-    # Launch the spark worker(s) to process the input
-    status = launcher.launch()
-    logging.info(f"Cluster analysis concluded with status {status}")
-
-    stats = {}
-    fcu_config = get_arguments(args, "copyutil")
-    fcu = FileCopyUtil(data_access_factory=data_access_factory, config=fcu_config, stats=stats)
-    fcu.copy_data(subfolder_name="docs_to_remove", data_type="docs_to_remove")
-    sys.argv = get_arguments(args, "fdclean")
-    # create launcher
-    launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration())
-    # Launch the spark worker(s) to process the input
-    status = launcher.launch()
-    logging.info(f"Data cleanup concluded with status {status}")
+    # Parse command line arguments
+    args = parse_args()
+    # Initialize the orchestrator
+    orchestrator = SparkServiceOrchestrator(global_params=args)
+    # Launch spark fuzzy dedup execution
+    orchestrator.orchestrate()
From 19e0844bd93f52b9e02277a70065221d981bf477 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 11:03:02 -0400
Subject: [PATCH 033/105] Bug fix

Signed-off-by: Constantin M Adam
---
 transforms/universal/fdedup/python/src/fuzzy_dedup_python.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
index c05fe326e..acb1be3bb 100644
--- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
+++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
@@ -76,7 +76,7 @@
             self.logger.error(err_msg)
             raise ValueError(err_msg)
         service_short_name = SERVICE_DICT[service]
-        service_params = self.get_arguments(args, service_short_name)
+        service_params = self.get_arguments(self.global_params, service_short_name)
         self.logger.info(f"Got parameters for {service}")
         status = self.execute_service(service_short_name, service_params)
         if status == 0:
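Both the Python and Spark drivers now share one pattern: an orchestrator walks the requested services in order, rebuilds sys.argv for each one, hands it to the matching launcher, and stops at the first non-zero status. A stripped-down sketch of that dispatch loop (toy names; the real classes are the ServiceOrchestrator subclasses above):

    import sys

    class ToyOrchestrator:
        def __init__(self, services: list[str]):
            self.services = services

        def execute_service(self, name: str, params: list[str]) -> int:
            sys.argv = params  # each launcher reads its configuration from sys.argv
            print(f"launching {name} with {params}")
            return 0           # a real launcher returns its exit status here

        def orchestrate(self) -> None:
            for name in self.services:
                if self.execute_service(name, ["python", f"--{name}_dummy_arg", "value"]) != 0:
                    raise SystemExit(f"service {name} failed")

    ToyOrchestrator(["minhash", "cluster", "fdlist", "fdclean"]).orchestrate()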
zjMJ}(Ijpp)fIxHE=5veqo-lY!lm9mj!_MvBZUfcdrWI{z> zWmSDs=W)Nvm?S6KH_g|19cCn_x#kD;Y{#FN4}M4eYmnT~0*+{SIGKTE7jTA(t4N?} z=me0Wwl>7+lP`}bfh+BfnBE;ZuMmq_m=d*m>;=;lXl1`HuAok{(+qao=G<9hPcvB9 z)inVTd?}c(jK>o-BXoXA56zxk+}C+6e(+UzuYRyeC94E5LQ|RH_ilC?OO6O^Fpp6W z_YG{3P#w@VD3qIRc8h_wzinzic~gWOZisSC&!uh4H^3syd-g}ulC-P_R&6ic(mi)>`c^eUZ?n`G`)#CoQN-rZgY!0y89tn(=Y@LKQtlqKe~M?5N{giBF}Z%NU$=Zx5xR)!*+07M7n^)4?he61>nlz&|mBZarW~ix%!d< zcZsb@1+guN$^kLB4F}hlFLoDZb(~vuwc${Gt(6e%I-g|a z(AE?39;oB=m-%7yyb44F3;`2hLpM;~rPu#htBlx<5D`tW)-<3$*-C2!a#W_?6>UE;?FkV$0SNCOPj`p_XHTryel#c7c; zrh6ABAs|4KBRW7xqJB*=bne^Wns4K{QkQ03^HcW(x3!@hb3^6cqhs?Ax5wQ33fdAE zKK*n{6<=-`>6}1g=P-5@nF&xpj71Q~-@}B(fdx1VeqNaR6EIU5ADNEwlAjJeql^dV zv`4!S2b{VR^}$6&nT_-9k>^h@(?V<=Kl%1Mp_fq-!{kUWVQ%%hD#2Y&W>4UqX~Pv4tDOPOBGsonNn9H^EZ* z?t^vT_=X(D_ewiXk1t)YyCJE^yDy66R!~oa+uwDjb89FuQPD^jzBJ)6dFgmP9hM(z z`zTswEJwn@lLvX8qG$)F>C2f53fx&c6524wZN1tSuvXIbO}${? zDi8^Dm;UQrfbq`x64ckZoCOh%l2bkTqnn+?9$6FLvWof5a>CZ%Oww(yLZfdLRXEN66=Um z+hWVXB)ewqnc3W|nfIf}kwyQlR*02d+x)T)wXgHpwb&bA9`{SWi61X~G%?Ig-dHR+ z_dZI#txXu4=3>-{lGJNm#iyV}z4@#yf8-9Y;9ZN%gO8nvDi|0XtnPh1q8Wxfa`BA& z#?k4r2#f68@v|p#Lp=qiLz!=>gqR#PqaVzw&8Uqjo(lCMs|8tK{mEE9l#~15i5lSt zy5>O#nyz^!Q41TV>=L`{l5XpN20SzCn*Fu#9jd)8hmkQ%7+Q;!ehd`I*UqQ&Ei^%Q z!i?LFm0LR9TrQuZZgF+!+mp*r7BsCNFjJf$ml@lB#f0@^P%`s)U zLronFW99v4;6f&Zrtx^Tnw124gMrhM5|ZREkR{=u^ksx)OLy|61aCd`x<~9 zr!x~6j@jT(AA$*Lt%EAP_*P5qdA?xW-(p?*C zwR^y`yDg~2u1C0u{;}y=$WATUvZ%RWbd@vSO>%fcj;2OKG{ba~_fJ%uznE%v0dP}8 zA9k8C&(A!xPm~DE8-O6QxTX{n+YP7?E6O^u9%2jQO_AbbRb(UHqNqja9i;y5@NprQ zq!B+kP;oMV01^R#8zbmPimv5=;M;_=m+1qi8rkfcUP00HYv?o`Zb!r;(Z&Ci^wIFJDQ4(blT=Y8(u@2>9roSM5 zDAw56&;k?aeBRZ^kv|ozzz}sSEM0UMo%%)kJcPJ6-(AwjVgs?Kuvq#?#2z-H?*|7n z_Vut>_dPqhiq&RxSZzjU7|IA5$WVVR?%HYN>2rES6Gj~RXuRitmSKm*;?(I*3Uq(^ z7Q=V^0A5tVVv`jsbxJa`ZA|R9Ze50F3B= G1^gQ$Q;6pP literal 0 HcmV?d00001 From 5e4022cd8289baa46ac09036f91387f67d01f16a Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 12:46:47 -0400 Subject: [PATCH 035/105] Setting input test data for ray Signed-off-by: Constantin M Adam --- .../fdedup/ray/test-data/input/df1.parquet | Bin 0 -> 4111 bytes .../fdedup/ray/test-data/input/sample1.parquet | Bin 36563 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 transforms/universal/fdedup/ray/test-data/input/df1.parquet delete mode 100644 transforms/universal/fdedup/ray/test-data/input/sample1.parquet diff --git a/transforms/universal/fdedup/ray/test-data/input/df1.parquet b/transforms/universal/fdedup/ray/test-data/input/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2584725bbf8e96cb852fc2702f8014fe689c7116 GIT binary patch literal 4111 zcmaJ_2{@Ep`+sI^6N4Btp|LY4vJGjHjK&x#jD5{cb}=F3A`$5n2di6a=70 zidTVm3H~Y85E>99L13yzJ}F#^D*ADmuI_RCy|)!dA_WNf9rXh@nJqoNeXlTBn09pulc~MaCszeqR%*qCM}ZB{?MuDm;H z?K@$HPYB7Ej()n*E}5|0UgR8Aq)lOAtw1(@HZ_Wy`ucqdkIGDfuIoC!1)ZWV6V4D74nOQ2^`Yu zRk;JPSg?2%`5}4JdNzWXd42tu54OA7NR~DI@1n%YB+f8zr&}%QNkh=|N7~Xy2FrQ_ z|Jss_5iA`zOC6@nHmyX|thyFb%nk)!R-q@J+))aZxPDYYZH{vDb9J$PXZgpXic)<& ziGi1|vhX{&+>QA=>^c@4MV&R7CcM%plLb(+BBnyL!#wmTVM|$w%oplASXUxhD|vk6cr~TE``v1b(LGk~(q@`(kSJMxkLt{+wo|D{;z?ed^*@l@ zH}y5dT}b{-{eCVEm-}(9<@_g`F@#|=gDcjw`W^TXzw zSZipt`{sGp+UI9M!&Uwx#IJV@^hlwj<>O|B zfp;t(e|22-=U(u)sH=#~?jP{YRA?&eD!?@u>^NtV-h`2a2)4BOI8d@i7eVcbbj@n5ioLk*9s8-%KjN}tw?>haERm-h8gV?Gn+kHi!GiZq zjMJ}(Ijpp)fIxHE=5veqo-lY!lm9mj!_MvBZUfcdrWI{z> zWmSDs=W)Nvm?S6KH_g|19cCn_x#kD;Y{#FN4}M4eYmnT~0*+{SIGKTE7jTA(t4N?} z=me0Wwl>7+lP`}bfh+BfnBE;ZuMmq_m=d*m>;=;lXl1`HuAok{(+qao=G<9hPcvB9 
From 5e4022cd8289baa46ac09036f91387f67d01f16a Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 12:46:47 -0400
Subject: [PATCH 035/105] Setting input test data for ray

Signed-off-by: Constantin M Adam
---
 .../fdedup/ray/test-data/input/df1.parquet     | Bin 0 -> 4111 bytes
 .../fdedup/ray/test-data/input/sample1.parquet | Bin 36563 -> 0 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 transforms/universal/fdedup/ray/test-data/input/df1.parquet
 delete mode 100644 transforms/universal/fdedup/ray/test-data/input/sample1.parquet

diff --git a/transforms/universal/fdedup/ray/test-data/input/df1.parquet b/transforms/universal/fdedup/ray/test-data/input/df1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..2584725bbf8e96cb852fc2702f8014fe689c7116
GIT binary patch
literal 4111
(binary data omitted)

diff --git a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet b/transforms/universal/fdedup/ray/test-data/input/sample1.parquet
deleted file mode 100644
index 58387d07daf4381a020444fc5dee676b1360ebb2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 36563
(binary data omitted)
From c14bdaa471f2338bbf88390f6c0d94176ea792b8 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 12:48:43 -0400
Subject: [PATCH 036/105] Bug fix

Signed-off-by: Constantin M Adam
---
 .../fdedup/spark/src/cluster_analysis_transform_spark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py
index 5522d67de..feeb3241e 100644
--- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py
+++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py
@@ -45,8 +45,8 @@ def get_folders(self, data_access: DataAccess) -> list[str]:
         :param data_access - data access object
         :return: list of folder paths
         """
-        bands = self.params["num_bands"]
-        segments = self.params["num_segments"]
+        bands = self.params[num_bands_key]
+        segments = self.params[num_segments_key]
         folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)]
         return folders
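The two-line fix above swaps hard-coded strings for the num_bands_key / num_segments_key constants that cluster_analysis_transform exports (the Ray runtime in the next patch imports the same constants), so a renamed or prefixed parameter key cannot silently diverge between the transform and its runtimes. Below is a minimal, self-contained sketch of the folder enumeration; the concrete key values are an assumption made only for illustration:

    import os

    # Illustrative stand-ins for the key constants exported by
    # cluster_analysis_transform; the real values are assumed here.
    num_bands_key = "num_bands"
    num_segments_key = "num_segments"

    def get_folders(params: dict) -> list[str]:
        # One folder per (band, segment) pair, matching the band=<b>/segment=<s>
        # layout that the signature-calculation step writes.
        bands = params[num_bands_key]
        segments = params[num_segments_key]
        return [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)]

    # Example: 2 bands x 2 segments -> 4 folders to process.
    print(get_folders({num_bands_key: 2, num_segments_key: 2}))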
From 1215ac5ab9f1c8c04e55252bc25aee305707d620 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 12:49:15 -0400
Subject: [PATCH 037/105] Ray orchestration for fuzzy dedup

Signed-off-by: Constantin M Adam
---
 .../ray/src/cluster_analysis_transform_ray.py | 48 ++++++++++++---
 .../ray/src/data_cleaning_transform_ray.py    | 10 +++-
 .../fdedup/ray/src/fuzzy_dedup_ray.py         | 60 +++++++++++++++++++
 3 files changed, 107 insertions(+), 11 deletions(-)
 create mode 100644 transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py

diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
index 970686e13..a0e8e7de2 100644
--- a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
+++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
@@ -10,9 +10,19 @@
 # limitations under the License.
################################################################################ -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess from data_processing.utils import CLIArgumentProvider, get_logger -from data_processing_ray.runtime.ray.runtime_configuration import ( +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, RayTransformRuntimeConfiguration, ) @@ -20,11 +30,31 @@ logger = get_logger(__name__) +class ClusterAnalysisRayRuntime(DefaultRayTransformRuntime): + """ + Cluster analysis runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration): """ - Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. - NOOP does not use a RayRuntime class so the superclass only needs the base - python-only configuration. + Implements the RayTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the RayTransformLauncher. """ def __init__(self): @@ -32,11 +62,13 @@ def __init__(self): Initialization :param base_configuration - base configuration class """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisRayRuntime, + ) if __name__ == "__main__": - # launcher = NOOPRayLauncher() launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) - logger.info("Launching transform") + logger.info("Launching fuzzy dedup cluster analysis ray transform") launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index 831a6c9c2..e83960c24 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -10,6 +10,7 @@ # limitations under the License. 
 ################################################################################

+import os
 from typing import Any

 import ray
@@ -88,8 +89,11 @@ def get_transform_config(
         :param files - list of files to remove
         :return: dictionary of filter init params
         """
-        duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default)
         data_access = data_access_factory.create_data_access()
+        duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default)
+        duplicate_list_location = os.path.abspath(
+            os.path.join(data_access.output_folder, "..", duplicate_list_location)
+        )
         if duplicate_list_location.startswith("s3://"):
             _, duplicate_list_location = duplicate_list_location.split("://")
         duplicate_list, retries = data_access.get_file(duplicate_list_location)
@@ -117,6 +121,6 @@ def __init__(self):

 if __name__ == "__main__":
     # launcher = NOOPRayLauncher()
-    launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration())
-    logger.info("Launching transform")
+    launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration())
+    logger.info("Launching transform")
     launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py
new file mode 100644
index 000000000..0b9be33ca
--- /dev/null
+++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py
@@ -0,0 +1,60 @@
+import argparse
+import os
+import sys
+
+from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration
+from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from fuzzy_dedup_python import ServiceOrchestrator, parse_args
+from get_duplicate_list_transform_python import (
+    GetDuplicateListPythonTransformConfiguration,
+)
+from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration
+
+
+s3_creds = {
+    "access_key": os.getenv("AWS_ACCESS_KEY_ID"),
+    "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+    "url": os.getenv("AWS_ENDPOINT_URL"),
+}
+
+
+ray_worker_options = {"num_cpus": 0.8}
+ray_params = {
+    # where to run
+    "run_locally": True,
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(ray_worker_options),
+    "runtime_num_workers": 3,
+}
+
+ray_params_argv = ParamsUtils.dict_to_req(ray_params)
+
+
+class RayServiceOrchestrator(ServiceOrchestrator):
+    def __init__(self, global_params: argparse.Namespace = None):
+        super().__init__(global_params=global_params)
+
+    def execute_service(self, service_short_name: str, params: list) -> int:
+        sys.argv = params if service_short_name == "fdlist" else ray_params_argv + params[1:]
+        if service_short_name == "minhash":
+            launcher = RayTransformLauncher(runtime_config=SignatureCalculationRayTransformConfiguration())
+        elif service_short_name == "cluster":
+            launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration())
+        elif service_short_name == "fdlist":
+            launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration())
+        elif service_short_name == "fdclean":
+            launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration())
+        status = launcher.launch()
+        return status
+
+
+if __name__ == "__main__":
+    # Parse command line arguments
+    args = parse_args()
+    # Initialize the orchestrator
+    orchestrator = RayServiceOrchestrator(global_params=args)
+    # Launch ray fuzzy dedup execution
+    orchestrator.orchestrate()
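The orchestrator above runs the four-step pipeline in-process: for each step it rewrites sys.argv with that step's CLI parameters (prepending the shared Ray options for the Ray-backed steps) and hands control to the matching launcher; only the duplicate-list step ("fdlist") stays on the pure-Python launcher. The same dispatch can also be written table-driven; the sketch below is illustrative only, with placeholder launcher callables standing in for the real configuration classes:

    import sys

    # Hypothetical stand-ins for the four launcher factories used above.
    def launch_minhash() -> int: return 0
    def launch_cluster() -> int: return 0
    def launch_fdlist() -> int: return 0
    def launch_fdclean() -> int: return 0

    LAUNCHERS = {
        "minhash": launch_minhash,
        "cluster": launch_cluster,
        "fdlist": launch_fdlist,
        "fdclean": launch_fdclean,
    }

    def execute_service(service_short_name: str, params: list, ray_argv: list) -> int:
        # fdlist runs pure Python, so it keeps its own argv; the Ray steps get
        # the shared Ray options prepended (params[0] is the program name).
        sys.argv = params if service_short_name == "fdlist" else ray_argv + params[1:]
        return LAUNCHERS[service_short_name]()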
From caf79a30b1c24892e1262009d57b29a271993c73 Mon Sep 17 00:00:00 2001
From: nelson
Date: Fri, 18 Oct 2024 09:41:01 -0400
Subject: [PATCH 038/105] Added python test with expected data files

Signed-off-by: nelson
---
 .../docs_to_remove/band_0_segment_0.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_0_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_10_segment_0.parquet  | Bin 0 -> 1505 bytes
 .../docs_to_remove/band_10_segment_1.parquet  | Bin 0 -> 905 bytes
 .../docs_to_remove/band_11_segment_0.parquet  | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_11_segment_1.parquet  | Bin 0 -> 905 bytes
 .../docs_to_remove/band_12_segment_0.parquet  | Bin 0 -> 905 bytes
 .../docs_to_remove/band_12_segment_1.parquet  | Bin 0 -> 1505 bytes
 .../docs_to_remove/band_13_segment_0.parquet  | Bin 0 -> 905 bytes
 .../docs_to_remove/band_13_segment_1.parquet  | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_1_segment_0.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_1_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_2_segment_0.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_2_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_3_segment_0.parquet   | Bin 0 -> 1505 bytes
 .../docs_to_remove/band_3_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_4_segment_0.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_4_segment_1.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_5_segment_0.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_5_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_6_segment_0.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_6_segment_1.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_7_segment_0.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_7_segment_1.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_8_segment_0.parquet   | Bin 0 -> 1510 bytes
 .../docs_to_remove/band_8_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_9_segment_0.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_9_segment_1.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/metadata.json              | 58 ++++++++++++
 .../data_cleaning/cleaned/df1.parquet         | Bin 0 -> 14986 bytes
 .../data_cleaning/cleaned/metadata.json       | 59 +++++++++++++
 .../bands/band=0/segment=0/df1.parquet        | Bin 0 -> 2753 bytes
 .../bands/band=0/segment=1/df1.parquet        | Bin 0 -> 3122 bytes
 .../bands/band=1/segment=0/df1.parquet        | Bin 0 -> 2862 bytes
 .../bands/band=1/segment=1/df1.parquet        | Bin 0 -> 2537 bytes
 .../bands/band=10/segment=0/df1.parquet       | Bin 0 -> 3305 bytes
 .../bands/band=10/segment=1/df1.parquet       | Bin 0 -> 2537 bytes
 .../bands/band=11/segment=0/df1.parquet       | Bin 0 -> 3450 bytes
 .../bands/band=11/segment=1/df1.parquet       | Bin 0 -> 1354 bytes
 .../bands/band=12/segment=0/df1.parquet       | Bin 0 -> 1354 bytes
 .../bands/band=12/segment=1/df1.parquet       | Bin 0 -> 3442 bytes
 .../bands/band=13/segment=0/df1.parquet       | Bin 0 -> 2537 bytes
 .../bands/band=13/segment=1/df1.parquet       | Bin 0 -> 3413 bytes
 .../bands/band=2/segment=0/df1.parquet        | Bin 0 -> 3177 bytes
 .../bands/band=2/segment=1/df1.parquet        | Bin 0 -> 2758 bytes
 .../bands/band=3/segment=0/df1.parquet        | Bin 0 -> 2745 bytes
 .../bands/band=3/segment=1/df1.parquet        | Bin 0 -> 3122 bytes
 .../bands/band=4/segment=0/df1.parquet        | Bin 0 -> 2537 bytes
 .../bands/band=4/segment=1/df1.parquet        | Bin 0 -> 3413 bytes
 .../bands/band=5/segment=0/df1.parquet        | Bin 0 -> 2753 bytes
 .../bands/band=5/segment=1/df1.parquet        | Bin 0 -> 3122 bytes
 .../bands/band=6/segment=0/df1.parquet        | Bin 0 -> 1354 bytes
 .../bands/band=6/segment=1/df1.parquet        | Bin 0 -> 3450 bytes
 .../bands/band=7/segment=0/df1.parquet        | Bin 0 -> 2667 bytes
 .../bands/band=7/segment=1/df1.parquet        | Bin 0 -> 3289 bytes
 .../bands/band=8/segment=0/df1.parquet        | Bin 0 -> 2845 bytes
 .../bands/band=8/segment=1/df1.parquet        | Bin 0 -> 2537 bytes
 .../bands/band=9/segment=0/df1.parquet        | Bin 0 -> 2537 bytes
 .../bands/band=9/segment=1/df1.parquet        | Bin 0 -> 3314 bytes
 .../expected/signature_calc/metadata.json     | 62 +++++++++++++
 .../test_cluster_analysis_transform_python.py | 46 ++++++++++
 .../test_data_cleaning_transform_python.py    | 49 +++++++++++
 .../test_signature_calc_transform_python.py   | 83 ++++++++++++++++++
 63 files changed, 357 insertions(+)
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py
 create mode 100644 transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py
 create mode 100644 transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py
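The expected-output tree added here mirrors the pipeline stages: signature_calc writes one parquet per band=<b>/segment=<s> folder, cluster_analysis writes the matching docs_to_remove files, and data_cleaning writes the cleaned table plus metadata. A test can then walk the expected tree and compare tables pairwise; the sketch below with pyarrow is illustrative only and is independent of whatever test utilities the added test files actually use:

    import os
    import pyarrow.parquet as pq

    def parquet_trees_match(produced_root: str, expected_root: str) -> bool:
        # Compare every expected parquet file against its produced counterpart,
        # keyed by path relative to the tree root.
        for dirpath, _, filenames in os.walk(expected_root):
            for name in filenames:
                if not name.endswith(".parquet"):
                    continue
                rel = os.path.relpath(os.path.join(dirpath, name), expected_root)
                produced_path = os.path.join(produced_root, rel)
                if not os.path.isfile(produced_path):
                    return False
                expected_table = pq.read_table(os.path.join(expected_root, rel))
                if not pq.read_table(produced_path).equals(expected_table):
                    return False
        return True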
[... per-file "diff --git" headers and base85-encoded GIT binary patch data for the new parquet test files omitted ...]
z>3R+EYTg9b`El@SK8>Os{b9JTL{sN+STae21;6e@bWJWMZt=vKOgUD!enoC{BxpR< z3D>%~;NXFt5LOLEU}GC9;=PdEuomh~L0H52VpZ2h%zV~>n;3#Cu_AZl{Wr02xGU6O z#-ifx_oSet3$_a0&{x=@+E0d-?c3ouGX^P-<=9cx8`>&4w$$iw{JJBKUXrV|RGpC`ZVjZ4MJA!4g8?ibe95+^QI3#@{JaI%E0^L~RWaW)pq#rmsa(jf?kc}~WE%;}-zU^o z$mKHMOs-e>Hd*%34B5(qKVrwQ6r#8_n6vNC@g?G3#CrXLB%(nJXMH@ZEj* zWm7yH(xXw>-Jkn${2t6Zv5&tzG8~l$CUI?hW#rD%d=5_Cr;)(YdZXPHOaeN zC8X^q9?*S(z_*6*?$hlVh}`GJ2fUxSpZ^`|1a@UAfjv-4KNF9RbmE_$Bg9Vj;1*r) z=LN+e5RMmQ@(v2Sj-BisIy*XbadvSPm2N7vdsj`j?mawudU|>LXnXng?$g)Lzh6LL zQ1Bc5LqZ1(92EBE;345}MM#lRL!)D2Y0N+NGZ?>r=CQ#iuJ*)vjK%_Oo@Lf3beU#=1?Lx70U$x%I1U+Z%W6{Cd~!J$su{ zn!nk%|G>dRhg*&uJ$C%W$y2A#oIQ8`!o^FMuUu{Y_S#>y=TsVzv*LMNdtkKF=NlNd z=bD7Ve3Q9DZecD1O^T5FdF3u=cNn6HGvc8zGSJu5lT%31)Wmr>p( zp;Fjn7|OqVN|j$k`MX_Thw_fkF~2a^kY~0wEGy1mbJwnLr#Szz_1E}(Rf6iO5UAv) z?8&)$ohZ=0=EJ?v9(^=PTU;V&6GjN9YF}khZWV6dhM3&%^%pr&o-~+g3o6T9uolWu z54Jn;ai{h@n0<@mKPC&Mpw_69{>Q!hPwJ`vmlv!3vtE+&wo)l*70N0%S3s>%*-NRZ z!_&g74@Zs5nJ2|aQl1o*XE}?c0>juUN|NH)O!u~#?kr0a zv%m~V!{6x}5HOG~qLbs;KB|;%=}=0y?NdjMV*3O( zSW~xzZjEOOJ&QE!(n5^sW0H*-V=SptelTizbbn zW=HwyQe>(1o-DyKEh-~fDrr@+Bxyajq@J#iDWQHWc`|)!pQ^H}u}VVaQKT!LT9Rw_ p&n_zR*A*8VOFgFL7Mt>o29Kc7enI^L)kzxQ`3Fq`kV5}0{0G#3JWBuo literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0c68b087a629133e6561c3090078ae686a5e13e GIT binary patch literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b*Q7FE6@^QNEq`gZ>R`3g|aCYT4}MgtPeo9DQ+{t zg+(-)VKl;`G2&~^u>=7v>X@RNuf5o^M7C^G=hP*bP1&Nm_v^|-<=-+-+I!DE?{n@w zCtp5asN+~JgbOopn1fINiOSfGOTsm~-{61?<>VSd58=d|Sd{=#k^oOJi~)c&0+SUZ zghAbkR2-{}2XTl}AuTgG%%xV7+sa`%mNy}JGg2q2CF5CAYDg%+VIWHg*#I64jU_Y+ z4~C|Y1UMs@oP_%@l_>ukE~eq$V}`?xl9`?5-YSd3Q))JOOfnlmZ06E(r`=oS5EAG> z@*bqViPgujYQ)b7Hjc+frd1!ry%S{M29Qwn9_F?WWo*cUg@3O{?bZ+3#Bscx@*G-^FUthLQxhkuZoA`!kuAG&-FT+LKcD{^vwChmU%uc4Q?>>6vvnx!BZ(7idbv1b3(2!3i!U(YD}(8yyo$J+NJDRRxMf&+ zg$kAylYr-Uw}@=ay$c9Y!T@4}rqtIjn-`^cs^RUjQY*p ztZB2i=Iz;B*;{#J>7izO#=dVI9orqJuHRkV(^{RS3Eyyfm#fd7)N)~uC+9%P2mSkN z&fZw%8#q*Fh}m}jYWu3s9bK2z=O(B)>~G7gvF}`3WpmILwYXWk&Fu-Y*e$dN$b%n1 z@=H?rk5YBBO#PdPHtpzK9{H-r%#nWD!2}*|)&NH#31GB!0{94(D35N|{?PH7b;5A# z+AWR>kAL3y+x7HBb@N2ht`A!u$LFIGQ#QtfLhLTLTB=MOOU6MPxX!A3^O4jvq`G|F9ITY%GO%UeEJ<0r^wN<@s7( zU%_Yad~i#^ORNU5ULIMFUS_^O^Qt*Pq3H!p5;@6^@Qu8_)_+ezuvCl4@Iod1NO7+s83X;1+S|Dc#?xG+ddzW2Y5Ge~N$U=lz5V=&56%h-?N|Pe&u5@DoH6*A6BhP{j zG}u5PikK)x1cSXqY{Zw?D?VG~8Ka42?p;xN>L2g>-uH4on3*$k%I}=H+}|aH4Rv8C zM#&6NF#-m90!Swo#%8 zr>#`fX$1h?YzRDx5J7GPMKh8!l*|nTMlz{hbd)-FNNWMT&5R=wqQ| zju;y-1mM94PXJi=Vg{-(1`ewwg+LDhsVDRT8lQ3^&`q367qn7Q=ZRE=UfW1;1@G_T z!`BrM1bG7#2dOs|*c-%ZVlqxPTW$guf-9k{M;2H(SECb^a%A050pdl&!MfoDXm<|)^#cji z_m~LFrpaOPAK!sraU4wDZUrYN381uXC%V=RLCx{*aM9QVnw$CnwayomHBTTf&>DiP zHh`?c1=bVxu&!4P%z0G>H(@B84`7IEpAUh;QDTsN8vuFj*HKc2G3=mhASF)^iX3TZ z*|iHC=lDa&a{=rwZ~#St0JaxL!I8^`aOeyTXG)Z4MEM4=NjnBFX8D8l9cMTjT?Ch3 zsv$c)42tv0;8o^9SQ)StN=JCYwbcZAk#igR<(q?5r7JvoJRde+umc-)Pt>vgDKd+_ ziPG&6Xw}x>za$t^tHu&F2|KBR@|W;i)c~~5skpnyDhM|B*-T9}D+QZLwp2)22dWJ` z25keEQWaBLyZ3cINq!aR050cmLJD<^Xjr}n{cd-haMiR6vrnIdmU%y;qZ{raS5qxf zqKSpyYd4a5_MUXbs?$)l|2eV(6*2e+B`OazK%<-Mp+S*~4&=5H^J+;b%$`8Bj`4&0 z?oSBm_5iZ9Ngv#Ig`t{_EpR!1IlL-JCHpw%5FP^u(TPR91nmc_kxODDRgxzVjF~&E zTd}rRw`7Y;^xEc!Aa+7@>GKhU{s4l^VCo>i`WYN?z0vK?-A`#T7{I{Ujqt1@5DX&x zAlbr+csRKh79MRR&yDef{HAHdlR6rm8F>giJZ!1my`MmhIFqQ%^92jvAgDiOPKpa( zAln7Iz|ijnT%A${q=%B!FkT?a$fx#BT_$+GG!1UG%!kznchYOhCJ-exE~w6~mi%PP 
zpX9{jw@J^&K0@ofOk$_=1>xLD?(}})13Fn5K^}%Q7`!S5a_$bNd*;pr$367|;|mBq zc}fCAZ35-6p`_VNeHin{LUcRm07~zvhhiH&=)I}q2-k*XhqGpM z`H@#>o9PkKaog8qT~Gj;)fNCN((*~uNk5Um@h0=xy1@%G$n8);+&>*S zXz&n^p`OFMSZ^O+KmUN?fg^%cBZEhUj1C<$cHH=|@QBC>6DLicGBqkXCN^$bd_rQ< z^yCz^Mmr-lEj=SMD|_Z=vu4kko0B_l{({f*zW6eK;iAO_UllI-dTG%=mMvegvUt_% zHEY+El&;_KP1(jxo40H&uh_P|vg+F%JF9oq?EdciJ$q~G>Kpbo?r&;7@I%YNLx+zX zJ$C%W$x}a`Zas7M-1!S_KVAIi&%a!{ti8hJqqX|oXJ4I*wBl>mxn!MoAfj-%Z>^Di9jJ{Ro)(#jv|z&}D-cuZ^IR~+~W z#%+8B6vkTp4vZGZ`}b2GtN9h9J9WP`>@(NGGn`P~2}zM}E@ucG#SQOD5PB(zYL4#W z(x~Lbv~*!yavVnAM&FK{`+}$~Pn}s?!G=dBky?h^6iI z{66VnJ6@!sPFs%j-rvPWZ%Gf+_72OBk)Gij3QUeu$EWM4{34QELW+{!idG9ud+&$; zNYW40DJyUTsUR&TAuc6~p)ljy67PIC*veJgO}sGogBv03+H6LkGw#aDoZZ`NUcKC|M;usZ~DaezdX2q+mj?|mrFzn zk)*X3epAZiQhhcwbX3@2-Ge1vPdRhq#(J|XI~xCT62!7utam)?&$98XPdvZpjSWU} zD<75(py6EM0jWdcb*c0@y$%a10T2Wz&B0BZZapO3B1UKkP=b`KHOviiP=}}>BnuzdVO=LJV}Jc9Mk(%lmemh-|kEZ<3;6JMlmpD#Tk#Xke*;p^o5aeVB` zU5(BXq?X#K)EOCZ=}s|Pty5HLswT@aGcGkPQKPnWaqsWaUnx_`0p9#TJp~Yg|DpN| DjS{VmT~_Iw+{XIF61pDuM{kz$k~;?h64b@P{*we3R_%x8MEk z?k_2pN=*VQ#0qg%GC>qTq}gBo>h$ktooYUGY0Ds!{ak+mgeXCiAn-yY`SQASh*t%5 zL{(4=077Fi8F_>l-taEc39L2^_(-i7T4Q%Rs_X5(dI4*AO&ij@g$#KH)#E&A6KF01 z1~NpF34riMA<0U@(-cLPfly>-iGUIpn5ec|5bx(Hs)7cC3lRO zI54!c_mYo~aCmB)8l7%`wZrbWM>LQ{gQL3E<7#SjE9HENG+!aZCdP0Y>;6T^M;7^( zO4y^a3eRDs&JFC1Xq>aQ2)w%{<9+qJ+0{G#gx_|hqW2Rw^->Kfgf|Ai!Oo1?260t- zws`vu)RF%YT*z3*wpX9+H6r;m^L+krFrE7z8rhHVp^f{|tzjSI6wl?5`Dac;_p-0h zhp+sEQu@hwtH%MiI$mQ!<1@MT&1YcO;oE2kSn-$-jA%!G1S&b&35N_`bZqHaysU$P zHS?$8vt`-vbNY2G?#yMj9SMi@y%lKRYu)fo>qhu%wU-&3yabywMsp48`}3EN??I-9 zE_Ulmo-bQGE^L$BKg>1oYwqQv*Fc?!xNWx!aCjIhWDKoCF_?*<1@tF6D7i zOIsjee<$DfJVMuhC3nsKIvB@|Wnx;wq3rinNdICtTr50>{86FE(PieoOg5smmFW<= zF&en|Dm*N64_AAlk}K#GVf~&dXng_)!#|JVcAWYPz1{B=lkoNn%)$$~aKKxR4;JPk za?>wxLI;x|v~l+zrE=^dOAm<96&%}#Ow6C3A~b;erG>29I962-2s13lhp0lsdWA>y z?i1NJDq1aQ`ia_@{<;AJ2gME^GBi$a7#2T##K?riq-3Kh<*C%P^idh3$2@Hwn>lX0 zWXYP4os*lFUr=bBSTw1)q_k{u`IL&Ow#sSKt7cTswAa)+oU`ic8)naOHM%_@3$ki% zxR;YzpFdLub<;fJyIGJU=eH~vwa_91EaGH%MvxO069-F(gQd00oDniC-$G>iBr>cZ z3K<u@Y0PY}GstZu;B< zAD7cz=MUiLNu>D{8E&c#`_%++9`^p2n2#z@yE+X--d9`iY_tn3fmuC%0Il^p>(1!$ z1aKh@i#@w6@B!tXcu4i`_g}yQZ#7eER#eV{B044!f@h+SA^u~jeug(ysRI>}D$mGGn-Lbf1ZT3$~10y+drS5gn;Gl%S1{Pv18k8Ns^r*f)NI*C^j zsIOph&wNO;2tAMwZKXwaFn&qC(r?miLc!Kc(geE9b+|0Ko0p^!a`Vo5S5)e*T(vNQ+INlBJk5gkeCWWY!%GO(|dZfneG zB5{;DsXg&eOzF`CG(n7&?cTXfPJd#pEGOE%UQcuEJg3*!;Bm*A(vwU{My*u`aPI@1 LdYLcb{w};tD~p(Xb0!sq_I-!@+{FVLTDDLDfmvCSH$jHZhG!^-w zb5Y+ap|yDu{MYos_m}MPXrT{oxAa0*Z&$K@gAohHyTWH#3yk0S0D&%e(3wN!4|&Mg z{%>4ttAji@92-Wf@%eox%vd3jvLbsfUmcG9TZ3?PWh^}P0eCR1jht!dizmv9@ywx(99} zr|LDtt9cV#=f}aT`4kFw^n>A^5>1`QV#y>87X0!xqN{T-af>HTkCUUe^(%6{BSGWI zPPp2|1qb%`gs^G|f*K#9Jl+e*4Qrs@6pYoJA69j3#LQ<6xPd{q94m6yKX?lZhq*%i zWh~0?d{6R=yI`x}4Sl&Cs{CbW*}fhAGh>kQM2;Pmy`Zg>V@tIT$F4cz$ORcL)C7^C zb!*^lI)SG%V&HWv5EnD6aIMXNQga%r%h%%Bw8K~yyAicRBXE5MN1m45M4yUo@T?EP z{ReZf{)#WW4bJ4z>Ys>v_6=h8BPcX@A!boBM*K35d}^xTE9=_uxS=2UCZM{L!ZQKu zd#)EIy4S*O+_!wn+DD|d|6<{0!a-8%c#V7<-wVN)Z$K}c;F_22BxYwjqR;yqS9A`$|AmJU>SFJQkjrf)m47y@HP^hw^yhs zm&;{-<2c`lZL+K*8M2iJe#DNX6r#8}khAZ{@x|gE#CrXVVq(`HmdjR+2_}!Ryd@p}#L=sqL? 
zO=r6CK6AIjG5RUKo6rD$U=VK<2SZU@A?%vCME+#43IA-F12?^q`)utPuBI`V?D5^r z*ByIC#-6;%N9^s{!K-{4x7qC&|Ngizng8Zb_=2Ex{wPccU!H}s+i^1IS<~Uavq|3N z3Ly`F;sM<{1id|ocb{(0K;#}T-skfOgbpl@JM zaL60|Lc`wdKOp?AfrBF69xO#h4T+A4jf)?ekeHM_EM<7=h<8SgN*kS?F=p(z@e?NM zGPAOCCgo1f%b!x9Hy8`27MaY&(@ILGzuRfXdo$lJn>BmR+z-k>{HS8y`~{UCFI@D= z;;KI_(JuXTS?}uQD?VGfs-|}Jn$OpMv2Oi_jdhzgZ>ewia_d*ywm0th`p-Le?cURr z()`Wdeftj_Jk)ad$kAiRPn{-cyR@MS2y^QiU z36;Vo!%+U^Q>y$T%HQewI+S;Oj`;;ShFr6?VOeqhs=IcDJH`2zt-r?Ss}fXKhC(Gb zWlhe}>qLR}H6QMU_UNNY+~P8rHerNvs`gbDR`mj{1g+NB3QlunF!|9h%f+Uqlk-1WgB;`s`xt6m?Dv-ohQIZtTX1ce{bZ1$b zm<47?8uo6Vz`*`=5uF^z_E9BljAjEhElG)Nh)ic&3ttLi`ze_r$-$Xvql&X*224pG zojRHE^(i#I>G^c9*o(3lp+Qix+OsSfubsXrBQ4aB9h)oZn7+k5O@~stZJ#=FB-;;V zgEe(a=+=0q(6dOhE-loUK04W$G1`(k^*7v_-_Up3$3u%n>4kV$DI*KBV@H`8LyAvi z@)n^IoB!eQmU`1-Q?iv_l7_HlYyqo5VMfe={LGM2T9bZEh!(ko)t`oFy%c&DY0;#S z)9ffeU5YHR-jgL*rbT5WOC_yJmL#p`mekYrF~!u6B~PYL?Ne2DHC9QeJPLJ1Q;T!V q0a=BG0lK0hV~NMKoFY@6(clpr);G9skUCKVJpZ6c08;3`h5rB{LO@#p literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4748c07ab9f5c77e8fc8c141ac748fc7bb24c158 GIT binary patch literal 1354 zcmb_cU2fAr5MEp>P^lFXV9Q1p%1bN|>Q5?yDB^)}XwnpuCJ}AoyhKiHC28UxVh0x< zH~}X>Ja7OGz!^9R$6#hHm$(v@P+8gQot>F)zWF9f3p!;UJ!bC)Wuz~!2PxM7nZj#IL}>w9Itl$ zb1es`ey%X35RE3TXGj`fH73Z{La9DH*~53H0XyBSIh`IR5x}XXhCn z4w{E#R}Mq#*RFBU;keD5xzY2X*jPq)k4yQ--%cJZ(ck+By3i~a_-7eW*O literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9d247f8cae7a335a4ecb364c654ae9a66b3a8c00 GIT binary patch literal 3450 zcmc&%cUV-%7C&>#vP%)vz3VQnf&vRD$bv%DpmIS#UxQHe&M zMpOh_L@Y!FeM+$gD{907@g<5awy5~<8B3y>dsi%p{_(!=eJ}ID%$%9ierJaJ8^YNL zcZOnon1LEb%s@K;+iAzA?3oqz= z##q10ST1hek&DqA9T`UP*Hye}<|LpXNHQ+~R87=d7)}=P`nXJ8LRv;_yf#B?k&GwF z@v(9G)J$EP0PHs;YeuR%Qq@c-|BIL(h1+P!T({IPPe}psFD6g!lL7N4LW<`c*xtxO zW6gMQS=SwITr`I#iyYu?tuw@RlcG%}dRRC{3Jxo3L4VBwz^y$zZbLx+gJg&<{~a#8 zDg{wuUnm)=fc4E*Fm)A+a?;I-6h&V+v_lQY3j?96&J`Yqy+UWpdcabp9649p19jgG z#QG5MnMpx?nLS7xZK1u?2hOEf!QyByn6SVP1~;a_=xiI1EN>4Dq1$0Yn=I(;whNu! 
zu0(bNcZc*?1)^{A~~T7bL*gZC&9^j0lPwccSa22rBB# z;BwpcaQILcpw_L|+fu$}q)Rvcn%k%&Ue<6b1`Ocuq7s0jyEu6S&3CGUUaK1>5 zf=kzdz2OwRoEiXjciiB@xTSFQRT^YxL_$IC26#Q`D69zF48_6y;rc29z0A1{-Sav@ z*Xu8$V`3hZl_z_555ncQum@w}}keN&s1lm1= z6CO8B@7(>2mVyQJo^}A9Z3zO4D1S)p>`FWuQwj4=9w09c?GJf}#uF`7G&(={7a(3l>7tj&VNN+%`oDy0v1LGdt4TfM1LDj)YTwvpq8CdH z@N?~KSaozKy?VnaqNu_hRXJ9YJxhKkN7vmZ`yc3HVwXFK*y+|_G9$)|K4|igPE|*d z$H4%7SH?lk-9dD_nUld~Z?&jx141oNNdWCPpdQeVw4Q7ZLw}o(ZigH}8SNw}{(v`q z*-edTrx?Pq|1LW2STr46Er+GMMj+=4)^zEK*Jx{p6Qs-5ugIy5fv_)qI8hZ6h`u)D zksV@wAhRv6B3<%N#KebR5ckLDP`d|*Bk^r-g0tL*I_UHS{)ClDn7twwrb+znyDMA= zImQ2R;MPFdD)2zQ`W~SO8k!$qp-D`Ma{zyWDOzMAmYABETePvXYTK^8l#z9iE37*z zJ9W0P?b6lG-a+N)hi)#-zwYx{pUY=f`_3qR6bMJor2l%kQegpjj0tW>JhiC?e zhJ_D_7&>hDh{%yq(W6GkjENnq9Tyj$Fg`IUIb}ksE=|u(OgCg?PRh!jJZ0*%=`(U> z&YC@EZtfTJ^5!pCnE&OX#a}I1`j2JHzg|(Wa@Fd>HATg1*R9{M@taL0n@hKB-L}2# z+Z{W1l~?TE^UuBeDyyn%_8&NS= zx^k6k;^Wbrx!JU9GY}`eeuEFzg9GJF6%NvEb)f}kb?fKbC3k!|fV(tDg&%6msqbND z_p!5I&Of+#Ny5RyRd{zfHiJhr2b(K7FmJ{YB%JcG5zfC+6SUAx@Pux%|Ib?_ojCKS z7y->GEq{F`0O&Ww>TcEZn*E)w!Ha-vAjT=G>N_bJ!pMJvJ|qr_E;)_o!UO}Anrz50 zNk~oLNd+@VlUUZQkj?F&s+XzG2t@n-fq#@H)Qnn#S(#@=xwy(4Xyl#1^A~Q7Z$^BT zKHzUbPtcHyTXlIh#MXaW#o`-ICkS#Qp?YEaDKfNtOO2@sX^9y|p1+J_SCHzCjjGTF z)8Sv;ejRcU!bpd*mE+7-f5peSM z*grPqkH)A8a$8QDJ~1;P!!?fMT(#-x`YhW?3F(GpeVVPiR}c3dYK2A#@b(9)DS&YN GU)LYZD$;EL literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..78e853246b125d8723fcba4db0657d7509e2c93c GIT binary patch literal 2667 zcmc&$dr(wW82`?_cb8oSLEtV6ZHf<8G~Oc!3=wyE!-{|@EQGkrt~}hA>>_A}SlQqV z`5G-49H7x05mYj45KvG0D+o%dp<;bS0j^6~HBGL94j_aOJu=f|lF?R*;*>0_)03{IJ}G6|zEE82F0pGO5U} z4FF_YM+x#(G*JV`8IDZqmZ zo&X+Ck->X}gtmz&j!97}vP+~QyRHPM{p2oo@K4$lNxOj=P21N=Ybq!+=#18Ut=g(~ zD#lE)How56FEbeVLI#lJERwIM<)hy82Ei>9W+&M7;Y#a@jtJ)nL-X-Oe4f*f(0RyY3w9 z?Y)Plf|82-Gzc{(JE4rD-Ec@=ijGyCr`B}R@J(P4+K>|lH^N3JX=@_gc*GIJTiu|2 zM;{DVZ-oanrSz1*)l^h?grIoy3&Ly1_oLwA9;RWfP$=-tqh>_!7Zmj82(}%)2`SxD zXuOv~InJi&GPVm6r`>_UkRQigxjIMS4o(ohst4}uP6j7cJm@F+Q#TiN!nzYZ^q2Fa zq58-YYNShmKA+nUoA)n72PTg|yZdr#&)PVc6qgF!XI`W?`w773t&hPu{!jQJe=pEc zL9~gD0Z~~sb8zt%;k`{3__^;*nE8d5pniKU)zBV{x@L6J&4UlnYo{*L(LGZfrmbC0 zwRjBDm3d(Tzm|=(K1f9mfCVD!3SjlsB!O$y3h?Xb7Eb&cp^@Kdczq}lf?j!<_FCZx z?^&v8kGu zIL4lW~BMeK>&Q8xu96 z-Suf=4f9KzSmRjUZR`)xfP0;ZICIi4l_$eAAo{**Sblz@jHPUVkTmvXi4Da@0uDM0 zqc65t9dvpfd3E@S_0p0mN6GQ=^4baV8XkBa?Z#8zkqhgtWGVg;iDW|rJ1{*74|a)E zWakJx*!hRM*mx8k7?59de-=DAkAj6>XDqbZkgfwsdXfB;n|zxa28buUKO^ST3iGT9 z1*y-JeB`3=szbg{$FnDbGu2BCKm}K7fJ3ry90R2q>fx> z=De&(+r^SLF@c0aIR(dY8Td~qmE+1eMIo2KafMuLA-|`<3Udh;%W=tM#%ueGS3GMW z9*84u-pZE(1H&;BpO!?{vE^ipCj)lPaY`~MRAkA^xgfG$stHL8)?_U#(ELU-g za@#ra$r3B9!||Q88SzEh_W9CPhJ-R4hp&^^WB=Hc9F0vAq~2zA>9R7N)xW@O_E(pd jn##SG>q;%fCZl(7SU_+b<3x-7&A&Ddi)?Lfgg`!fe zxPrx^pimX;C`%C)_Z2tlI67*rTiuSHj@4G@zL%m>^^ZAc&h$IU-M;<%?zz7tmW}ga zD8`9#RpSZl0BnQ$Z7B=TMm;?6LyYz5Ne_3mGr-s}VlPI(*y681<_{jiPM$>ADGdPT zP6QqW2$2H$b|hmcsUHYzr4sXGjW#tYLu1Hb2(dt1j>KD#JVGw~kR_=X7YjoG9&GUh zfcHwKzZ$#eLX=Pl%ux`D86453iIWyFk~eWmgq<9T5d9$|#<9P9ikTL!eCz~4x`Mo# zdL6unRjMu}FDFfFOiI-lHCCB;k(rv5qRY<9(emkBLgLFvUP{VO60-lo<$IHSX3fmE zQ8QOaKF=nzr;TLz=TjleZvoVFu+Y&w8N4?3gB#~9;c2lO+-dWKl)fTVxmgEGCW^pq zRU7E8xdABb;juLVMfWoyvF>*`_p%BE>4C6$tQ5X}WD7IauxL)MC6Of!gx2j!IJ`~; z&N<%jc=$_nsnLt$7) z7K|_G4dyHD;bKe;R9NSOqhcpIQ6obx&D9`U904xPCqTbz0B9eGp~-qYte7l=CBJX`3^goFIU*jveT_1%mqH-QZGpd)U|72dMQ!L0SI<3L{-0rgkGpt9_uH@PPF_ 
z>tWWbTDS>=;DU-Fu7CCk6b~1H^jj4a-o1gc^18!z$`x`7&7s7LhPJvo@R}74vCjpt ztH=}NMFQAXs(~X{ZQ#&Z8qRJ|qNu8k;A%JqFJ^>;%WVamOD=({FSSr$jEB;~3V1c` zAgoetfwHJzxW0x!FXr5WehYhnbB!-NdORO0FS>)P)($-_{{=au-b6+Z1bVFtgfESO z+}cq@eZ~%|sOlyByLJHj-n+D$&^a0^`&3es9Lm6Tf*TcE@f0;g9)pg65t}badlAbB|!U zdi5Eo-TNFlgPI6DphQ)XR%pb&CTN!DqWyE*iFpkq6c>ym+DC@Leg7wfWSfdCYqbRb zx_DH-sSU0!TnVp=a>+i5IYdyvKsvLer{M0voyaG%h1yUk5R9BX*rHV5(<0mP3cYUM zLlAi*y6kxrVL5;x^O#0Zxjcg-zCT*rzVk6H0xJlZ*#gh1Bf%;m46+@)iH8##VA0VQ z^8CnPSlBw5c+yCtv%?NSP>>t7tM?PA7fmB-3WtK@&}e8n)r%Arzd&va>cA%K1zbz2 z1u{rU>X;!Q%v(t9nY2Rie7OOBYMT#h4(_1WR*WMy)cc@D_XhH#&A*f5kKZDLTl$!} z6iy>{C@z}Kp5RaKHG4p3D-+1WV1U5YDKO_w1Z_8WI(Y4F5_G?a(34;AyQ299l!FJ6 z4%01R(a>dH8R+X+^ zvv%G34Q1sUzpmKyP37h-Rn=R!)zp5weaFta`d#1sefOS*#-`@)TlTi@+kc?#;Gx4u zjvhOH;^e8*XWGx6JAdI~#}AkOarvKDuIm5A`J?rg-DX~!jr5}HH#ld$dXUJmkG^Zy z(R#^`6*qsnwRua39^f{ur(#>X>qp(e*6w0!_s;%|t=;FWJ-BjJzO1K=AxwA^(e=qT z&&kws*pp_UvNH`vv$X6q?m5SAcR4L?V8qWv@+L-J%H!JGt$FVQtz?w1QApTm2{Z~y z1aO*fiw6$8-~B(Llxza+yz z^sf02IDKE9(jq^Q2n;D1X*n8(!tm=xyoKU!GW9n5AsC3kkC1dNHX|?zcg0Cg?oAsQ zf)S{LIBiP8w}eA_0(U`*?c&6a95ThpEPnR$@Mwd92PaXRkVahLRRM&WLu`s|o%5+Dy8KYX5?KDLil zxz?B@LE@y>D?TwT7?w6t79rcUeRzf!7}0lfY|69o{9|J3{k D8Fh$h literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0d95d679984cbe1a72e4410ce8b15f3ba58a4945 GIT binary patch literal 2845 zcmc&$e{@sT9sl0@-b-FeN=wO0lM*lm5{kC;hqROuN?%G_+NLFKfs$e^X%pH&(maxs zw#uO3a77WdM>cdj#T^+N=r{v$M@qGbLq$DK*$>#xHgU(n#546AJR9Ps``#Av-a_{}#?~nWaar3!x%Z>RGFQrNbmqaB<0}w6qa)wu*X_0#+kkTY$wj`ohBWWx} zkR>K$rfVwz&{-*tA|WE!pC%TGHy45?)ojwWd%JyYUEXMyM2({HF=E_J?8SERcAm`H z7z#)fa7e`o0QoYh*o7gm(5yTMIs&{-TLuD38EN7ZGcib}>%p4Dgv2CrS9Oe-z|FfT zOq!`CKq>bj*!S|+GRR8yg*$pe{_a?t&l~e52XGPawRMDpJ)v$jr;EgRp4f*t`^(hw zUmSVMOfQSlR-H>4j`v}d|9k<#&c0RjbW;WVt>_XpJy*r;c|{LJ&o`4Jdr!jqgS+7C9T6@wcQbVq&ldti zQ#2P}J52HeWBl+ojizbSqNLr*l%(LaKM6lL@hKQ`h_L5M4b{)3T#s}Ns%Bh`Hf&r4Dc;gscK2QzGo(c#~%b}m%I|AF^9OHh|R0@Nytf7~V3FKFGuY<#3<6pSr z5*#sXpg-NV7^W?*h0$M5)k~>Bnm}^Dor7iFm<5m6W3-Jru@U zB%RPoKjE{b0qyl)uzEdAJZmJ<2b|S_6Zh4-gp>;KSwAuZT3%cagmS!~(TcjH3Hs!T zDXEjvCL1JUy2+d|#WHo;9oEe0GiKWCv$AI2c~^GMoZP(pg1hGy7R@W3U$Vfluyj$G z>?~hgQCYR5x~A4uSHINV(AczW`HJS1o|aYjwBFmc+S}gY^RMab3anih40VS=krZ*g zJ|ZZ5G*+sBp=Sg7?v<3(zW)2>-S1QYHVO)SOH#5op@YrnV4!1*KUsmTPoc~<2Zskx zp?E|A{kPGnPT{w4Ha9#-&cSyw-F$RVI0xGox+O*b5C$tK`iFOvEGTl5)Qz=|)YVTt z2Rkdzfrz2l_Zch_jYHUy@c^I;1-coW`J;R=5RGa5K|ccrM1ldWaqlGK6BF$}Pq7cF zP~LXUT$n$+FTO9Sf;1>Tt!(6ubP*x>XqR+YSRgl#{ zX{nsQw!NSaYciK9VUc@T{V~LNQ*h6zT4Lmw9m+kjvp3!+HCSeaIF7vcAV3%jr;*9B(8N?zL|4N1}mnw>7_LPX3%cv&#Z- N?E|AcfE)i9{~HhQtA+po literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..81083b2ee90f8208676313b318811207bed04c6d GIT binary patch literal 2537 zcmc&$drVVT82_D1DeVTxqqk{Ybxfe5A_XT#*=D#!TfhoPraa=*LMu?{ZKcIBjgxF6 zb4t`07dxjD=i+OCY~tdgPThi!I3H6t+5VAX%ydqpnOQd7aOd2vJQV(0<_Vnpo$vYk zzH`1WEf5ORco`qXt1LXnBL-0D-W9nQ@kizf9()Y1PNw`QUe3#PQ;?(x$~1ix0A(CU zca9Ja`evfzW!iMeqqG`jnN_qcth9P6c}~r#*Aum$=%?xx!+FvsQz+m$&@GC7fXAT` z%46$sXc|RO(od-=+$Xuz_+PjjgWDU@2RA9g<|?mq+C}d|o7HQLsG>xbZDF~qrp_r! 
z2^=KqV?=+4(|^h7`s4Id!$cf1t%fjub(n?kCQ{Llh1@?%Oz$j#BmFJ3b_wV@vKYzR zC*az5;kdsx5jQ)>p?tiG?A+zTnguE(Zt8^Vr$oRIi+hnAnr>I2bpPMD+`9+ziVW;3 z*5aePQCPlNAS-LbIfpg_$J`zn7eEOTh-6hKxI$0NE`A6s}pwY>?q% zTLP40;}EOL#J7$ptSg;@>eX48-Q~di`sbm1FBVtwTCpS2htY=5$k|pMi9fOzs@2(u zKXMlC1Ct@%QR8srd~95-!-v=RJVVg&&9nG1Iu>6Ye*vc)_uIKHSur^->?LIRaf4N2+b- zCBWIJ#D?Thbnf4eUmxHguD0N#Lg>Yks!v9 z`^~?Tk+$o^n?&Fisg#DiPJ zlehkS<5;shJ9*@~Qyz zbUYVm%k07LT61f4>Em3D_iOr<56X~30aCLckKzZ;qVS4>_rvp}Qd6`+u zG}0Zg3CPh1!MjKtN&Uw$|5D(`lZBeo>a>>sc=SG{(fGf-RQ=O>N$QIcYEG|EKZsBP zTAe0bC@jn`$_P9xO|#6vR=Qvm1Yr*SXKR)q_yl8xU=oB1A*({-8L7f-Mr8>?4*Sw` z@Jmk;>0ut2BO!m;1cPBJC1&PjvvZb@?K9a%T?>MRZN?IIl;lDxJ6D%Y%1bLNnpbBt zO{*>`F05p7X930cx`S?#zbwfK1wmK9pF}d*AbU?~QMza|R|r;SFNLS+&{YyV7tWo_ z&be$0q%J`Z;HjoOqt{xL?kXwHbCniLsnd9K0{P{>Iv5Wv7F`4Iuu>Y`HuF3$69_pL zrY|Y^g7N3%NcE=0rYl%_L72fva}BG3yVNwzQ8uZb)?_kMqDA(x`csI3OQAfYWJ!_J z?5Mm%F!}=bB!OjCR7a3188DKP4DhAYOPr=U8b`{L*;D`2lwD0g6Ph@;wRTCJ-8-?| n?Vf0@t#$e0mfCAQRW30uZAwa7N~+eP1N#1OSOyf(A9?=(tu<#2 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..28e2ab00c7a1b435bea1614c8f8f03dd7c991c90 GIT binary patch literal 2537 zcmc&0Yfw~W^nBmF`#{-5;%>XRB5JH;yq1KJA@nY9ToLdAizzPb%7V-8whusS6f0+_ zAs>y-U^+shnVOO&%^;wlhVOjPls3~OH9dT!HiJ`&ll6V~wmek+HF*ZUd%p8N=R4=j z=JNSEj^zTlAOnXv2n3L8mU_IeZw#3*g99#*Q$`Sa04L?7ns^A8`*^AW3;_H>F*z|p z7_?1D!?Ef(kOrt#ekCTmx!7uQTRE)6%JoRufV2r(`AC-35hN7gFpwjFd;lH|jUhA& zkA|j_`#6J`l7#y>l_>ukE~eo=WCp?wSC}28-b#zzQ*1VQOp0=Xl$(o79X4;JT}Yr6 zDZ7yNHr5`&nn6E<*a#k*icuz}rE+%tM=^m*pJtQSD{&CgKDp2deZ zwW9k|&f!?c4Oz{l^U%HeJ9Os5n<(}Gbei4Iq zXDLwLu?{$_b)n<6J$Q8+18Zv*;-0x_a4WtKtM+CvjYs_<{$K%W+0hMG>o&pP^)6;& z^eUW`FkMpq{y6E46Q3bnc_+JVjZ`{!<&04c&T*q`A>T{39=iiUkx0_mpN;*eVaCg~ zL&oHLa60y*EB*iceZYIdRb?l)f z8>Rgl-0(~HoAAMj1ClM<7vgO#I@CU;jd|(gznE9g{mLYFPLxevQ-u#iUzV+06ffzL z-Imy5jLa!;L*nLASam&95?H$&B3e77V=p7r_XlY&(?<|9;}s@&xj${nYlypzBIv6Q zjEwL^DwYI{kA4W&VhYwPT^3Y+u5;)SZ zju{wTo5fz{5%-NauBXOo8^)4xecbv8KA)7Bx;_q6Qg^A&8 zgP7)d13mOcdKLJ5481QaiOtcK6fE$X)2A;p7Uo+izcQD^x4eQJg1r>M2nm9KXiq>X zZ|3SipRS9G|(bi;!O)^d#uZs z1%aoaTP6{7YXd+pcLI+BM3Bot-i~B6B@G0=y;LGhSL-yBbJV6Bnh*=b^+?==nA>UeQr!9TeRz;sDxngUq9&aO~7fge^zJqCuI0igR; z3@x@3VR@De7C-wAqE=A#F9eN8d&9*(j?lh$AW-W@fTHmcl*YJ0 zeBDNnZuNr=geR=)+X%B?)WHoH4(F9LadrL&uw=9dq~9u`^zJp3SI`HxQ|_QG6+(qK z4>}qezxHu1b5RBcseT@+;02A+4Ksy z{9Fe`<|J5ES_Lnr?}wGjZ=fxYqi^gNJir^Myg+u5&<-H~fyA zH8+si6M;eJ2GL97VeGSos4-^;RbKrZ{!tfz_V}#oEpUy4%>y@6Q=BWoJ-Eyff*$Q}3ZX^f#789Yt!+5zBeff9y*CW5&Hfnt-pU?A5BL;=l^D+*k^47Nh z2C*#?sC*JjSO*Yf0o@Erw?E*J|MgzC@4U|wfei$I+y;MajRBjKD9G>cL;NM2~ATnA*R zg4EODASfuKc28N(f3nO3H#_En*I8%YmsJyp^^Jb0c~BEsedq<6c=Q$-);7?}t#mrE z&FK*NNm>xkd)wz^z9NM@2qp+wodLynM)Dj=W`Osu7Ji=#2tE3pgjt@i zONu5CyN%-rkq_cE$0^YlrZUng?MJf6?lRKm{!C2!^;6NA3iwIBT^OFdL@!-LI%#>JBf@y$XPm0I4 z5(sx4v@2#;>Y2eZ6_hH`QCW)zsE~yM0G}L*vfx{=RE>Q*%o~>z=mn_qOl*p=1An zgNF_uIeP5)iIb;JpE-N({DsaRFaBhSH`4%uOY#*`1N_W0z%NY(5MAaJkU`dE$>$2{ zf~&kP_iJ1-*Ksm7U2usuSl_@Y@C=HZw_Z!Yrb~323*M(obcc)U-vxrZ7k`qU>?@-Q z3k;3y`G^a(xjGh-GEG!|uE}hbnV-o%@c1<^u@cXB60fD@yJ&eMhj*_Z-dmq+7usFL zVipD``_N`<_J{#GS>(cmFN6aveg&WsLAQoP(5=Hl(wn>3Sm?rmDuGB_`Q85SLL|GF z_>`Zi%QjnZ_8=0UMDmOM;X)Ktj^9%QWAj+!Hccy;uFG37W48WvwX*cNQ%26F5TmnRqy3|1PY5F7SUA3u1v( zCRP29e{%dq`2T;ov47c365sAC7RUwSa%cSfNo5jiCNXhzQi$bZNj6ZvECn(V48x4U zUzXw+rjUuqW}+D;n~BWk_9C#tD0UUeFfr_m`R*BW+|tB)U>z}|XAbf84aSS8_>n9h 
zS;&r2?0{V}jEWs1QdpLgGYXcUn(iO(m!34fKodP|YD#iq4lCCt;QVIf;eqoP$vMGE zV6yn*maMFYy(u**P^VF5GiuhJ3y;gel+w*7jvL4FvFuQj>A z^{L5R={VjPOMS5)^~?u13)5@)u&s list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.0, + } + launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + basedir + "/expected/signature_calc/bands", + basedir + "/expected/cluster_analysis/docs_to_remove", + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py new file mode 100644 index 000000000..fca5485b4 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "output", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + } + launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input/data_1", basedir + "/expected/data_cleaning/cleaned")] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py new file mode 100644 index 000000000..07710b74d --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -0,0 +1,83 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py
new file mode 100644
index 000000000..07710b74d
--- /dev/null
+++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py
@@ -0,0 +1,83 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing.utils import ParamsUtils
+from signature_calc_transform_python import (
+    SignatureCalculationPythonTransformConfiguration,
+)
+
+
+class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    # # create parameters
+    # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
+    # output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+    # local_conf = {"input_folder": input_folder, "output_folder": output_folder}
+    # code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+    # params = {
+    #     # Data access. Only required parameters are specified
+    #     "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    #     # execution info
+    #     "runtime_pipeline_id": "pipeline_id",
+    #     "runtime_job_id": "job_id",
+    #     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    #     "minhash_num_permutations": 112,
+    #     "minhash_num_bands": 14,
+    #     "minhash_num_segments": 2,
+    # }
+    print("====")
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        config = {
+            "minhash_num_permutations": 112,
+            "minhash_num_bands": 14,
+            "minhash_num_segments": 2,
+            # # When running in ray, our Runtime's get_transform_config() method will load the domains using
+            # # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration.
+            # # columns used
+            # "fdedup_doc_column": "contents",
+            # "fdedup_id_column": "int_id_column",
+            # "fdedup_cluster_column": "cluster",
+            # # infrastructure
+            # "fdedup_bucket_cpu": 0.5,
+            # "fdedup_doc_cpu": 0.5,
+            # "fdedup_mhash_cpu": 0.5,
+            # "fdedup_num_doc_actors": 1,
+            # "fdedup_num_bucket_actors": 1,
+            # "fdedup_num_minhash_actors": 1,
+            # "fdedup_num_preprocessors": 1,
+            # # fuzzy parameters
+            # "fdedup_num_permutations": 64,
+            # "fdedup_threshold": 0.8,
+            # "fdedup_shingles_size": 5,
+            # "fdedup_delimiters": " ",
+            # # Random delay between reads
+            # "fdedup_random_delay_limit": 5,
+            # # snapshotting
+            # "fdedup_snapshot_delay": 1,
+            # "fdedup_use_doc_snapshot": False,
+            # "fdedup_use_bucket_snapshot": False,
+        }
+        launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration())
+        fixtures = [(launcher, config, basedir + "/input/data_1/", basedir + "/expected/signature_calc/")]
+        return fixtures
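[Editor's note: the three minhash settings in the fixture above are coupled by the usual LSH banding identity num_permutations = num_bands * rows_per_band, so 112 permutations over 14 bands means 8 minhashes per band; minhash_num_segments only shards each band's hash space into separate output files (the band=N/segment=M directories in the expected data). A consistency check one might keep next to such fixtures (illustrative helper, not part of this patch):

  def minhash_rows_per_band(num_permutations: int, num_bands: int) -> int:
      """Rows per band of an LSH banding scheme; fail fast if the signature cannot split evenly."""
      if num_permutations % num_bands != 0:
          raise ValueError("minhash_num_permutations must be a multiple of minhash_num_bands")
      return num_permutations // num_bands

  # The fixture above: 112 permutations over 14 bands -> 8 minhashes per band.
  assert minhash_rows_per_band(112, 14) == 8
]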
From 8fd9676f36d33e9c304309c956468a207a0eff52 Mon Sep 17 00:00:00 2001
From: nelson
Date: Fri, 18 Oct 2024 11:24:20 -0400
Subject: [PATCH 039/105] Added python tests and expected outputs for the tests

Signed-off-by: nelson
---
 .../src/cluster_analysis_local_python.py | 2 +-
 .../python/src/cluster_analysis_transform.py | 15 +++++
 .../src/get_duplicate_list_transform.py | 16 +++++
 ...t_duplicate_list_transform_local_python.py | 44 +++++++++++++
 .../cleaned => cleaned/data_1}/df1.parquet | Bin 14986 -> 14933 bytes
 .../expected/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes
 .../test-data/expected/cleaned/metadata.json | 59 ++++++++++++++++++
 .../docs_to_remove/band_0_segment_0.parquet | Bin 1497 -> 1513 bytes
 .../docs_to_remove/band_0_segment_1.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_10_segment_1.parquet | Bin 905 -> 1523 bytes
 .../docs_to_remove/band_11_segment_0.parquet | Bin 1497 -> 1523 bytes
 .../docs_to_remove/band_12_segment_1.parquet | Bin 1505 -> 1532 bytes
 .../docs_to_remove/band_13_segment_1.parquet | Bin 1497 -> 1526 bytes
 .../docs_to_remove/band_1_segment_0.parquet | Bin 1497 -> 1523 bytes
 .../docs_to_remove/band_1_segment_1.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_2_segment_1.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_3_segment_0.parquet | Bin 1505 -> 1510 bytes
 .../docs_to_remove/band_4_segment_1.parquet | Bin 1497 -> 1513 bytes
 .../docs_to_remove/band_5_segment_0.parquet | Bin 1497 -> 1513 bytes
 .../docs_to_remove/band_6_segment_1.parquet | Bin 1497 -> 1513 bytes
 .../docs_to_remove/band_7_segment_0.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_7_segment_1.parquet | Bin 1497 -> 1505 bytes
 .../docs_to_remove/band_8_segment_0.parquet | Bin 1510 -> 1530 bytes
 .../docs_to_remove/band_8_segment_1.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_9_segment_0.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/metadata.json | 36 +++++------
 .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes
 .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes
 .../data_cleaning/cleaned/metadata.json | 46 +++++++------
 .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes
 .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes
 .../expected/get_list_transform/metadata.json | 48 ++++++++++++++
 .../python/test-data/expected/metadata.json | 49 +++++++++++++++
 .../bands/band=0/segment=0/data_2/df2.parquet | Bin 0 -> 3984 bytes
 .../bands/band=0/segment=0/df1.parquet | Bin 2753 -> 0 bytes
 .../bands/band=0/segment=1/data_2/df2.parquet | Bin 0 -> 4763 bytes
 .../bands/band=0/segment=1/df1.parquet | Bin 3122 -> 0 bytes
 .../bands/band=1/segment=0/data_2/df2.parquet | Bin 0 -> 3695 bytes
 .../bands/band=1/segment=0/df1.parquet | Bin 2862 -> 0 bytes
 .../bands/band=1/segment=1/data_2/df2.parquet | Bin 0 -> 3684 bytes
 .../bands/band=1/segment=1/df1.parquet | Bin 2537 -> 0 bytes
 .../{df1.parquet => data_2/df2.parquet} | Bin
 .../band=10/segment=1/data_2/df2.parquet | Bin 0 -> 4466 bytes
 .../bands/band=10/segment=1/df1.parquet | Bin 2537 -> 0 bytes
 .../band=11/segment=0/data_2/df2.parquet | Bin 0 -> 4906 bytes
 .../bands/band=11/segment=0/df1.parquet | Bin 3450 -> 0 bytes
 .../band=11/segment=1/data_2/df2.parquet | Bin 0 -> 3317 bytes
 .../bands/band=11/segment=1/df1.parquet | Bin 1354 -> 0 bytes
 .../band=12/segment=0/data_2/df2.parquet | Bin 0 -> 3138 bytes
 .../bands/band=12/segment=0/df1.parquet | Bin 1354 -> 0 bytes
 .../band=12/segment=1/data_2/df2.parquet | Bin 0 -> 5020 bytes
 .../bands/band=12/segment=1/df1.parquet | Bin 3442 -> 0 bytes
 .../band=13/segment=0/data_2/df2.parquet | Bin 0 -> 3138 bytes
 .../bands/band=13/segment=0/df1.parquet | Bin 2537 -> 0 bytes
 .../band=13/segment=1/data_2/df2.parquet | Bin 0 -> 5244 bytes
 .../bands/band=13/segment=1/df1.parquet | Bin 3413 -> 0 bytes
 .../bands/band=2/segment=0/data_2/df2.parquet | Bin 0 -> 4782 bytes
 .../bands/band=2/segment=0/df1.parquet | Bin 3177 -> 0 bytes
 .../bands/band=2/segment=1/data_2/df2.parquet | Bin 0 -> 3988 bytes
 .../bands/band=2/segment=1/df1.parquet | Bin 2758 -> 0 bytes
 .../bands/band=3/segment=0/data_2/df2.parquet | Bin 0 -> 4323 bytes
 .../bands/band=3/segment=0/df1.parquet | Bin 2745 -> 0 bytes
 .../bands/band=3/segment=1/data_2/df2.parquet | Bin 0 -> 4341 bytes
 .../bands/band=3/segment=1/df1.parquet | Bin 3122 -> 0 bytes
 .../bands/band=4/segment=0/data_2/df2.parquet | Bin 0 -> 4035 bytes
 .../bands/band=4/segment=0/df1.parquet | Bin 2537 -> 0 bytes
 .../bands/band=4/segment=1/data_2/df2.parquet | Bin 0 -> 4860 bytes
 .../bands/band=4/segment=1/df1.parquet | Bin 3413 -> 0 bytes
 .../bands/band=5/segment=0/data_2/df2.parquet | Bin 0 -> 3554 bytes
 .../bands/band=5/segment=0/df1.parquet | Bin 2753 -> 0 bytes
 .../bands/band=5/segment=1/data_2/df2.parquet | Bin 0 -> 4872 bytes
 .../bands/band=5/segment=1/df1.parquet | Bin 3122 -> 0 bytes
 .../bands/band=6/segment=0/data_2/df2.parquet | Bin 0 -> 3553 bytes
 .../bands/band=6/segment=0/df1.parquet | Bin 1354 -> 0 bytes
 .../bands/band=6/segment=1/data_2/df2.parquet | Bin 0 -> 4311 bytes
 .../bands/band=6/segment=1/df1.parquet | Bin 3450 -> 0 bytes
 .../bands/band=7/segment=0/data_2/df2.parquet | Bin 0 -> 3765 bytes
 .../bands/band=7/segment=0/df1.parquet | Bin 2667 -> 0 bytes
 .../bands/band=7/segment=1/data_2/df2.parquet | Bin 0 -> 4158 bytes
 .../bands/band=7/segment=1/df1.parquet | Bin 3289 -> 0 bytes
 .../bands/band=8/segment=0/data_2/df2.parquet | Bin 0 -> 3781 bytes
 .../bands/band=8/segment=0/df1.parquet | Bin 2845 -> 0 bytes
 .../bands/band=8/segment=1/data_2/df2.parquet | Bin 0 -> 3997 bytes
 .../bands/band=8/segment=1/df1.parquet | Bin 2537 -> 0 bytes
 .../bands/band=9/segment=0/data_2/df2.parquet | Bin 0 -> 4018 bytes
 .../bands/band=9/segment=0/df1.parquet | Bin 2537 -> 0 bytes
 .../bands/band=9/segment=1/data_2/df2.parquet | Bin 0 -> 4326 bytes
 .../bands/band=9/segment=1/df1.parquet | Bin 3314 -> 0 bytes
 .../expected/signature_calc/metadata.json | 54 ++++++----------
 .../test_cluster_analysis_transform_python.py | 4 +-
 .../test_data_cleaning_transform_python.py | 6 +-
 ...est_get_duplicate_list_transform_python.py | 45 +++++++++++++
 .../test_signature_calc_transform_python.py | 45 +------------
 93 files changed, 345 insertions(+), 124 deletions(-)
 create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py
 rename transforms/universal/fdedup/python/test-data/expected/{data_cleaning/cleaned => cleaned/data_1}/df1.parquet (79%)
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet
 rename transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/{df1.parquet => data_2/df2.parquet} (100%)
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py

diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
index 7c162b1b1..915cdcd1e 100644
--- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
@@ -37,7 +37,7 @@
     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
     "cluster_num_bands": 14,
     "cluster_num_segments": 2,
-    "cluster_jaccard_similarity_threshold": 0.0,
+    "cluster_jaccard_similarity_threshold": 0.7,
 }
 if __name__ == "__main__":
     # Set the simulated command line args
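[Editor's note: raising cluster_jaccard_similarity_threshold from 0.0 to 0.7 means the local example now reports as duplicates only pairs whose shingle sets overlap strongly, in line with the FineWeb-derived default cited in cluster_analysis_transform.py below. For intuition, a plain Jaccard computation on raw shingle sets (a standalone sketch; the transform itself estimates this from minhash signatures):

  def jaccard(a: set, b: set) -> float:
      """Jaccard similarity |a & b| / |a | b| of two shingle sets."""
      return len(a & b) / len(a | b) if (a or b) else 1.0

  # With a 0.7 threshold the first pair is no longer clustered, the second still is.
  assert jaccard({"a", "b", "c", "d"}, {"a", "b", "c", "e"}) == 0.6  # below threshold
  assert jaccard({"a", "b", "c", "d"}, {"a", "b", "c", "d", "e"}) == 0.8  # above threshold
]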
diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
index 2a5ec3e6b..412fc1fa8 100644
--- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
@@ -33,6 +33,8 @@
 """ This key holds the number of segments dividing the hashing space for each band"""
 jaccard_similarity_threshold_key = "jaccard_similarity_threshold"
 """ This key holds the Jaccard similarity threshold above which two documents are duplicates"""
+sort_output_key = "sort_output"
+""" This key holds the flag that enables sorting of the output"""
 
 # command line arguments
 num_bands_cli_param = f"{cli_prefix}{num_bands_key}"
@@ -41,11 +43,14 @@
 """ Jaccard similarity threshold above which two documents are duplicates"""
 num_segments_cli_param = f"{cli_prefix}{num_segments_key}"
 """ The number of segments dividing the hashing space for each band"""
+sort_output_cli_param = f"{cli_prefix}{sort_output_key}"
+""" Sort the output"""
 
 captured_arg_keys = [
     num_bands_key,
     num_segments_key,
     jaccard_similarity_threshold_key,
+    sort_output_key,
 ]
 
 # defaults
@@ -55,6 +60,7 @@
 """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)"""
 num_segments_default = 1
 """ Default number of segments dividing the hashing space for each band"""
+sort_output_default = False
 
 
 class ClusterAnalysisTransform(AbstractFolderTransform):
@@ -98,6 +104,7 @@ def __init__(self, config: dict[str, Any]):
         self.jaccard_similarity_threshold = config.get(
             jaccard_similarity_threshold_key, jaccard_similarity_threshold_default
         )
+        self.sort_output = config.get(sort_output_key, sort_output_default)
         self.data_access = config.get("data_access")
         self.logger = get_logger(__name__)
 
@@ -225,6 +232,8 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An
             "jaccard_clusters": num_clusters,
             "jaccard_duplicate_docs": sum_cdocs,
         }
+        if self.sort_output:
+            filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc")
         return filtered_jaccard_dataframe, jaccard_stats
 
     def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]:
@@ -308,6 +317,12 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=num_segments_default,
             help="The number of segments dividing the hashing space for each band",
         )
+        parser.add_argument(
+            f"--{sort_output_cli_param}",
+            type=bool,
+            default=sort_output_default,
+            help="Sort the output by the first_doc column",
+        )
 
     def apply_input_params(self, args: Namespace) -> bool:
         """
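[Editor's note: the sort_output flag added above (and mirrored in get_duplicate_list_transform.py below) exists for reproducibility: the analysis does not otherwise guarantee a stable row order, so sorting by first_doc gives the golden-file tests a deterministic output to compare against. A standalone polars illustration of the behavior the flag switches on (made-up data; column names other than first_doc are assumptions):

  import polars as pl

  df = pl.DataFrame({"first_doc": [7, 2, 5], "docs_to_remove_count": [3, 1, 2]})
  sort_output = True  # mirrors the effect of the flag inside analyze_clusters()
  if sort_output:
      df = df.sort(by="first_doc")  # deterministic order for golden-file comparison
  assert df["first_doc"].to_list() == [2, 5, 7]
]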
"consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, "consolidated_rows": len(consolidated_dataframe), } + if self.sort_output: + consolidated_dataframe = consolidated_dataframe.sort(by="docs_to_remove") + return consolidated_dataframe, consolidation_stats @@ -155,6 +165,12 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=consolidated_filename_default, help="The name of the file with the consolidated list of duplicates", ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py new file mode 100644 index 000000000..be90b3073 --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected/cluster_analysis")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} + +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py
new file mode 100644
index 000000000..be90b3073
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py
@@ -0,0 +1,44 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from get_duplicate_list_transform_python import (
+    GetDuplicateListPythonTransformConfiguration,
+)
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected/cluster_analysis"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+}
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    print(sys.argv)
+    # create launcher
+    launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration())
+    # Launch the pure python runtime to process the input
+    launcher.launch()
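[Editor's note: running the launcher above consolidates every docs_to_remove band file into a single parquet holding one docs_to_remove column of document ids. A quick way to inspect the result (a sketch; the relative path matches the transform's defaults and the 663-byte expected file added in this commit):

  import polars as pl

  dupes = pl.read_parquet(
      "test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
  )
  print(dupes.height, "document ids marked for removal")
  print(dupes.head())
]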
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
index 57642d19925240e1a86b323222b420617a75e7d4..0e089dee3ccc91c9decf65449e3ae930f1d1717b 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet
index 06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9..bf131f43cbf10180944b4906799b7d6288c54724 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet
index 57642d19925240e1a86b323222b420617a75e7d4..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet
index e303e5ea14abd679e479c2fe54ba994402d60350..2838dd9727770220dd6b3f3ae9f0d4bdaefd8ec2 100644
[GIT binary patch data omitted]
[GIT binary patch data for additional docs_to_remove band segment files omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet
index a4ad5fbf82bf959f34c9a25974d34cb91aeff037..8e1fe121e25cb51ec8c26b1aea7ee00463f9400b 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet
index 57642d19925240e1a86b323222b420617a75e7d4..3d1f158e9e79bac193f88f94d2b548b79827778b 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet
index 57642d19925240e1a86b323222b420617a75e7d4..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
index 26d0c0905..c08326355 100644
--- a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
+++ b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
@@ -5,8 +5,8 @@
         "job name": "cluster",
         "job type": "pure python",
         "job id": "job_id",
-        "start_time": "2024-10-18 08:17:19",
-        "end_time": "2024-10-18 08:17:19",
+        "start_time": "2024-10-18 10:32:15",
+        "end_time": "2024-10-18 10:32:15",
         "status": "success"
     },
     "code": {
@@ -15,7 +15,7 @@
         "path": "path"
     },
     "job_input_params": {
-        "jaccard_similarity_threshold": 0.0,
+        "jaccard_similarity_threshold": 0.7,
         "num_bands": 14,
         "num_segments": 2,
         "checkpointing": false,
@@ -25,34 +25,34 @@
         "num_processors": 0
     },
     "execution_stats": {
-        "cpus": 71.6,
+        "cpus": 91.7,
         "gpus": 0,
-        "memory": 24.71,
+        "memory": 24.01,
         "object_store": 0,
         "execution time, min": 0.001
     },
     "job_output_stats": {
         "result_files": 28,
-        "result_size": 33665,
-        "processing_time": 0.052,
+        "result_size": 38040,
+        "processing_time": 0.061,
         "input_files": 28,
-        "input_bytes": 78286,
-        "input_rows": 70,
+        "input_bytes": 115324,
+        "input_rows": 168,
         "consolidated_files": 28,
-        "consolidated_bytes": 33600,
-        "consolidated_rows": 70,
-        "groupby_clusters": 14,
-        "cluster_duplicate_docs": 33,
-        "jaccard_clusters": 14,
-        "jaccard_duplicate_docs": 19,
-        "num_duplicate_documents": 19
+        "consolidated_bytes": 80640,
+        "consolidated_rows": 168,
+        "groupby_clusters": 35,
+        "cluster_duplicate_docs": 79,
+        "jaccard_clusters": 35,
+        "jaccard_duplicate_docs": 44,
+        "num_duplicate_documents": 44
     },
     "source": {
-        "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_sec2/bands",
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands",
         "type": "path"
     },
     "target": {
-        "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_sec2/docs_to_remove",
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove",
         "type": "path"
     }
 }
diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..d67b5bcf87dc622ba80e30d210bc1b9b4429fbbd
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..267e78385aa2df9afa5ad324b6090abc65a6912d
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..34b15a76c75091485eac702e1c9f8c6b28c3b30c
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json
new file mode 100644
index 000000000..d4cd3e362
--- /dev/null
+++ b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json
@@ -0,0 +1,48 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdlist",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-18 10:49:10",
+        "end_time": "2024-10-18 10:49:10",
+        "status": "success"
+    },
+    "code": null,
+    "job_input_params": {
+        "docs_to_remove": "docs_to_remove",
+        "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"],
+        "num_processors": 0
+    },
+    "execution_stats": {
+        "cpus": 101.1,
+        "gpus": 0,
+        "memory": 24.02,
+        "object_store": 0,
+        "execution time, min": 0.0
+    },
+    "job_output_stats": {
+        "result_files": 1,
+        "result_size": 663,
+        "processing_time": 0.007,
+        "input_files": 28,
+        "input_bytes": 38040,
+        "input_rows": 44,
+        "consolidated_files": 1,
+        "consolidated_bytes": 64,
+        "consolidated_rows": 8
+    },
+    "source": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis",
+        "type": "path"
+    },
+    "target": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2",
+        "type": "path"
+    }
+}
diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json
new file mode 100644
index 000000000..bf26b5228
--- /dev/null
+++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json
@@ -0,0 +1,49 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdlist",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-18 11:20:38",
+        "end_time": "2024-10-18 11:20:38",
+        "status": "success"
+    },
+    "code": null,
+    "job_input_params": {
+        "docs_to_remove": "docs_to_remove",
+        "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "sort_output": false,
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"],
+        "num_processors": 0
+    },
+    "execution_stats": {
+        "cpus": 136.2,
+        "gpus": 0,
+        "memory": 23.89,
+        "object_store": 0,
+        "execution time, min": 0.0
+    },
+    "job_output_stats": {
+        "result_files": 1,
+        "result_size": 663,
+        "processing_time": 0.021,
+        "input_files": 28,
+        "input_bytes": 38040,
+        "input_rows": 44,
+        "consolidated_files": 1,
+        "consolidated_bytes": 64,
+        "consolidated_rows": 8
+    },
+    "source": {
+        "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis",
+        "type": "path"
+    },
+    "target": {
+        "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected",
+        "type": "path"
+    }
+}
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet
deleted file mode 100644
index a9ea0990f6152fddc195c2471bf0d3bfdbfc49ec..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet
deleted file mode 100644
index dd4f930793d5e38efb9536223e4aa1ec1aefe431..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet
deleted file mode 100644
index 26c4d1bf6cf98bc32a1045d139db4a38b76b8904..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet
similarity index 100%
rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet
rename to transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..1a46cb40fef1eae7a5d0207565c796ee63a4a4fe
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
deleted file mode 100644
index 1a9169d9f5dab747b86df507d1750efb57844174..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..f82d9dacab54b6721a1079f37ecf1cbdf3e4d8dc
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet
deleted file mode 100644
index 4748c07ab9f5c77e8fc8c141ac748fc7bb24c158..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..842ce2caacab4c58384e71071de8b40ef03e2e4b
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..fcb03c17aa1da69311db0ff5f9542aafcde37756
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet
deleted file mode 100644
index 3c53b83a00add2b8dc0dbafc83e88f7ca6fca968..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..84c399e67f3a1f307e88c5325c849ac627b14610
GIT binary patch
literal 3138
zbmVAr%dz7pPPU#pedcW2r{~UJxOl1k@|COCu7B3idE@ghZhm=7`%tOjv9-RB?^-Bj zxmkI%)*H=iPL|nXXUs9uB1$S|3!~_8R@`>h9(2`Kihbn8WWKU2c4`2H;jV3o%Kaz*Jmw|3AgawO#k-&_ z6%!A(JMmHz`ySN3==j&cf`V0PRJ#9hFaM2t?EmG7)&5pblA_#A!D^+7J#MZ5Dvi=n zNJxlH9BqBDlv`O}N&!NsAPDi~OKE~26bqplLWCe>2w@rGStt>Rp{p=Kh@vyu+h(#8 zmu6~#8WLjX4fgdNNfzM~$J2dSF&)F{KuimQjt-%Ex)u3?AKhoC4xQ+qnmDyEJz`Y0 zep*5%<>$r`d~;`!L9`bp8X+JES?!5S%Il(UPD%{OOOMPD3{+oqPr@Ojx9td-kPC&Q@_NpdmGKY?hy6mk|SY7*om zI>Ofrp~cpF62Vd}A|r^Iv?_@~TF=GM^|=v+#E%#!)hG6eD!m%3Bq%*hhJx9JMvG6H p$>d`wD9A7NC^8n9v-0yi`~!#h5Ajp!Gyu;(&_sS$$B};z{{p%vL_q)m literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet deleted file mode 100644 index dde573d07cc0c2130eb43113befd5c673b2f0ef0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b@-s716qhMB#il$H&g_iLfI4wt+ZI$t`9)BD0`UT z#sV76FdAXe81XgdSb~6pI;QC6YcIAekuBTQIdutUQ?}^t{krl{`M1oI_TF>P`<#2v z$(P3$#BeMZ#D(ZN%t0`KMD90sxkkqzL%&{#sF z@L*^PiJvoy$w{~mQ;G7w;bI!@J!T}_2${)I?yWN0JtZci$0)NB#A+%jci6mDb|HZd zB=1G)n^=7et495dViS0LWLk9r+}i!zjku!|qrw7KkjnC{`ADvYtM;7Kn4k~^}wCISs*i{fo*yezO|wgHk|2UzFLwD^~YD@p)L{nV$mts zba)y1_?aPS52?ln)~CYs)EwwOKaJTGDT44fKY~eVf58tW2Z2e_G7fGbNWAszkzT+s>Epj`kSTHK&tl>%Wg){_F~q+%qF!*7|C^Z)!iYc6q!ga$gf; z(;Ar5;D*Gl<*@ErrYN{>HAHrFizk1B(9j$}PhG#ewzt)p85!DmdY7x;me_J(k0<*;@dpF@ ztIyt8(=&LeRv*3X{MGg~pWC}GtIthRaoE?End9HNw90CyEoyeNHmlncV78fQ50D2x zh~$@~@*kz@W|{go5pCMBxjgb!kDDU{w1Wve+^l|%LgL40>-h0eDp4NWtOLOlHS462 z*0q`K6&~Nb@wMye$?E3Gq+K7jK7r3iC8lhM1BKXKZZTIGIhKs2a6D$Zg{$=Yrz|A1 z6XLL9e04doPk8(^p>m^jCj(ZjPomnaLbwAu0WtC5zl-otVn2-97aTvBEXc7^rPTk& zqxUh5#{cD|svp-&l3$X^v05S@mW2Q)Rf<5qpkQ%fqVHiTT3HQ+I9|u|d_MV8$l>`K zURS}V^Lz!LQX$ZEL|_rcQg}X#?&R#>$w@%n)B-icFJAMkMw390wA@U3PN|`58eNEK zp4ZbwXP`%c&ui&9#?DCLdQ zcNZ7N*-aT0ypie)?nycX4E}S$(xvpAO&4G4Lg>DDRuP)cV=Rnw7>aTo#YIBu#9zKI zzw8&s$3u!mU`#x;lscy=W0{9C_$)n@7ldm3{`0ehdXr)k@Ry$FQz@EZqcw0Ar!S~1 zi>)O!nNNjCk!xuENr=8nAvB$!Ng^lN5x#-f)%fm72$pIQ8D6NQPf1A9M;B5zRHb`~ zA0bbwPwW#_dNn>tP=q;+u2;NfPgJ?n8D(_295rFpW|!OQu!qION5@2Km3kGx$Pc>7 KCr}>w$omJ>jBkek diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79a6f24b35cf797574a2db8d32609d38de32a0f9 GIT binary patch literal 5244 zcmc&&c~n!^*5Bt2$xR@TKyFMR%!8mPgMgwq1epazL=hAv3^|k%uTkEa&_Trv#@BQ0*pPNH|2~mVP z(*&JPtAc1f8VmqN+?wBe_9GwOz$1;jeli?I8EGnaxGl>J?%laV^I5}L&hX1)>xTCv zXuRrGH#R6z$X6SzH>Vdh%XJ>pK+9=!2U<)EX%5YiyMrC|eosoh*9HJRSBz=J2q*K7 zmg@o7uEpduA#($cP$uQYC{yERCn_@&X-p!P{E8$iG1~%UOZ}5SnFEsm&=@d{km&%J zc@d%$gP2Izj6w(mJOnt8O8_aLfWtgG%%jUZ0_M>ZP@Y1@j1^IT$NDm-t^deHO1>x4P`(+w@I!g>T+ZrJ4o(+JbAw1W^K>aKUB9Fa+t8e!M zCt)OfIb8-NKMG;d=L)neLl>JX8woW>oZ!NCKd?+00?#MCMVAi`h0Ss)vOjGN#8XG0 z(k8&TViDj@`F9%E=kD^Ou za%6Sp0EpN6gVmW!kY3>msozWBwBAg}pCgA)Umpjbf_R8NWC^WN9N5)Sj=J~=G%Y%C zQ)CFWH5NeZ@CGN%3&;(yf{BNVL3Y3yiZDCaVXT2A?+(L57!B9`Xsm11C|Ey946?8M zAot0AG&f5GM+j?3$>qUj2ND{N9Rr6Yz7X;&2P)L|V5{c9p#mi|-O-1}D4XCJ z7lUdjx~?Ou(|t z1%7WEA%hsq7G>r&@vUhKZ1II1JMymJ{mzZ{g)(S9EqrfevLk z0rpz#C8CXXfpwG(5wg1noeF4%juD%P1GC$8206Ck>jLb-`T9dhA)2u>TTY_Bfi0Lz z+7s@AwpM6ZaSL56eu`WMq+>hN;-K$TDbBMSLmnt>gTvLoB1;IuM%Fu_{Q&|rx%M=i zvCTkr%iFOPr*K%mU%F?m?f^=hkvgvJdd+iS(9)ieSzXoEut{9$0SAn0^FVN)j 
zfd9CAI4K4Jj97dQemM{Tf(Rc-HXVZfFzXboxp)r0Hf0Rt)y%#TN!PI*KoW5*=i`TzGHXk1YeVJ&KIhj>nec7H zO8ESIIr+uz8Q4yZGpZVR3Lp068+>NVBYey`3$9geHdgL;1?hh zMi$1xvTlFUVEICDIC+{Qx`EJ(pK$=?9&j2v8aG;~3sYXNL60WXA(a6S#hvpcD>J5I zq+Jv?aLiFMwlR|2d~_PJziLG8Z+eFg4rsz14sOJ&CitO69e%JiGY=mS)rl|Azk^be zzQN{yzZ!cwXBklu9D=Avo>;nmB^r>Dh83TfL%gP2$rHzdNo9Gmu;gHwc*4Bt=*#R| zn5nHLcW06vdfh6+6~h)22hyUjoR@y+%B$O=A67Q~joxI)OQ znu2K_4o1J6O(A5LZ$ZfDTnIgWm2AF$3biR;!t}HGc=aFws%f8&wd=hnsl;)gC6 zzxNWv_qC#5y5`s1yyPYOp? ztQYQ*j3euO<2c4{dFZ;&R5I-ge_2@fHasB_(A`g?^~M{l$FR>*upzfF@KyQijvD62 zdvL0RA7NO?pDmlkZyM!KlioAU9@YrSP4)G{%uESCMdu{mt@fe^ovjrLhj~-?(AS(u zn+e#9WD<0~>Lv58B;t8Nk=%HnwOj4dkne5vG@}C3{S<-@E>~!R0;w|L3KpW!FWdeWYL5ezyNAuRCuSQBQE^pAJ z@6nQ{H(^JXS0=oULZ0&m2EYnXM#-RuzbEziZ_1-rV3_g`!cPkn<*$ z;|cYEzgkiU=N|c?onI8_A0LoG9ju7LBa(U0uD>5GF;wH@4kL{Joh@m6;V3q#?VuRK z>-cUxVovPQAXH`l27Ow)hFTl856iBb$GJlqVKICw{8Fw8uJ1U2+LB^m&zlh4o8Jz? z?zq>ZE3%c?#g#IhwML#KzbKDuzBx*-_Q4#aBR+*z3XqCdXQ5p3zD z1p7~wkxCtN7pBACevQy^5_d1+j}^1LjuwD`PH zvgY1-1uYYCahYV>cz1w&KInAx45*p-ftQ9wYSwJ*Q;T%m)ttyc3*P8rMiDQ5yjtxZ7G$>ehv20K*OjgPs3BTZo;v8WRt9o+D+76~ z)%80T@>v{G+nv-RzG!8#w~$&_?`~uO4KJH5M{4Vz`te*YTe?V*Oil4d7gHEp0uw&uRvc%#w?nX&_3bT#o?%#jhXkJ<6&qm4RP431i6`k3} zsg>QwjnmHb{cN0G!y7K2-yqA8XEfQLkY`@o;bm7Co9Mg}w_n$Cb^sMrw>Eh>2!_AiTcIKEZ?dv{aw(L#cOXjuVk1$_B8Z9ti zX`=m1B?Zdgf1s3sZ)1q|7{YeDhn&XxiJbq3G>4axl**F2_)H=>DO1IbPmX6P5W{z> zMUv7HlDp2fN9=4*b6J-6=WFI~NzHOZ&kAC$C6?WdqpEa)L_|DBGW@PXKoLsB6GALQ z>|z*VH~TlX_L#k~7-2594@jz%HP?!idauJ0yT7h7O1}|Tj3KVB*Yz1%_uBKHXr2D= zBriEWH9^%+W;K{(D`wj@%J$ol46*ww#6JN4T@7SvH;{5NV-w?3lr+ImUK{LB)Y(QJ zbjWZVQyp-_q#qsx&FNSEkax0re;g5w(VQSp)|!xNkBep37=!Ts*ldg#|K_ZJE%3i5 z3lgzRE(`ik^ZNf0i~g^?)&H|sNpfJUL~Kh*wvP}4$mCL8MQG@x@R9v*mb8d)WW`P4 zrBEm)GY=~h6p9>$SAxPqs%_qrr}j zBbb5DM1OWZK8J06*oJYfPz14!R|GrN?kk+w`MemHiOwd z7S3fFt-tYFCrl8G^!w8e*}V_!Gb6*@Qsev*6iU`!3(us(NW}Yj=+vp~JdkbrQ`d&x zAJ05y%}b>WcT0nTPy__A`dX!& z_xO_owDo3+&B*)GD-_=B&@Y*-L3*UG$J`i~1x!s`StX{(Ic)uz5c>~>S@Y6bGLbXc zG5Zk;ublpK(gw>~F**uuCHsxEN%pU6Q;$gT&0=u0d9wD5f5wzOntn}?nx`u>=4ZvL thQy|)4^d`hq~(}r$7f_FrKOrXyAO39>Ld%21N{C6X9(suIp%lA{{vSbUhDt> literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet deleted file mode 100644 index cd2748f7d0eb609ee249493c7051b096c5ef985f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3413 zcmc&%cUV-%7C&>#vLFb^-gOsOMaqH-vJfGPh+Gg>U`512QN)!+1lb}Bf+d0)5>%p* zrx6=yFm{Oo<|#!5gS|v-#FyABK6{DLL^JoUs66$L_kHhsIUmf-nK|Wm&Rp*A62(S) zF%+Xw!!k6~@3}b)XQKBOuMq0)fV-oCr)4=h6nPMA&*F5u(>NVqC%d zyZG>R1q4A}1^GVeO$BaN8huihHd&XMs8MICty1wKRg;*cPtVfo_?lXf_&SnTk@Aa# z?7v9)!AcKUGfmbi<{`-!d)4gm6*Bb8Ns#6<8@4pF&|Ehj+*kL6E2k~t@q8D!)z}@9 zdWg{4b$XaLMg%TP8bN=_1;En|9<(Q*@J=ekSN;j7UTy$EiXW_tmBP38ZD86m7UgDG z5@}LDsNbr911pr^sP%vcLtdie<-MRtCPCe6oq@XT2||4Y1m;oDRPGGst`1-)3WO7B zHn1Sx7beaPhQZBgFgm9zm@l@2vyoe%tbI0g@vJ~cx5$uF-9`}29RyBwN5QbYH|Xw) zp|<^KSTtS+^Zxh&LW+}N>}E$emLP!A=56S*1%j$0?ckiP9W>N;18U_!P*gpF{4gho zEME=Mjb5;daD$bds^GI%iWt%pW2G>Gw*=Z@GffvTR{1G2sOJLMUf3Pvg#y@ItcHUZtYQC28cvodQ22(`;A}h$FQ$cp(@jq}HLeIQ zywpKXW(*YPm%*#a`(TN3J(PwAz~yBGdXakrdKPpB$1UD)|G_L+d)5`4bsf;-RZo$< z<{HX$LtxN3LFj@=$S5C9R881M6>fM5zm@k!yF7~92^}L~ZMU`5So>0NPH>^3${wTY zu*1;YXCbvQv8i1T&tv3QVco&&%r($bhl#qyJJ9d0M+k3yi&@TzW6(JB7j$U#ZRFk2 zK$Pe;@O$+d(%db8-njGxl<#?t96?3+{ir}2!mQBHhFYkTXP~`#O~lM<66WWOBASMU zz#ZR5gk-alEUmW$-^v(NwWbj+6fA~Ug&AZw&s@U4PhUE&Uj_0?-A$F`3k1Vv 
z46rCRbh1eAa-Lq%a1TTth%S8|PFVIP$SkG?lupm!p!Zdao3}otMPLPertgMl8^gdV zE(FrMco6r-RKuJ@yUEkT0-&INJn^W8MkfdF2Y-JTYJ1m5P$im7Y{?%8T?R%#?eWf} zX#NZ2GP@G2Lten8#Bw106{MaC1YuSIwR7wu!SjVixZXGmmhIa{FE1NKlvH`48rN#_ zlXZWRqmSGm19o>abIPAgZ1X&8HY358-eY!`PFKW{2fzq^OOqh?)*!k=-V|`(Q7f=L zi_oK|B!K!mPz>l#+E1~BVSmg)HzM|;%ns&Av)iBEnK6Q(-4Y1bfC@Tke>`1OF%oq@ zWlwK7_zG?6c#w48^fg%%p+wV~mGF(Rfb5v?Gnr$30clfzB_`edg19|Cm)br!3JGuc z69(&@sH0X-;5S!;8B;89XY$P6zH0f$KtgeU6j&~i4kA2}FMmQPfQI{bu##s+2`2%5 z>r%A9OlWS=&eE#AwT*2DI}s!9D3RKCl6CIV)uEfCle3H5)vbGvp6(vKJQZHvpY-kX$XhcKyb>^+5)Jo3w$7s<$<$ZsBMxINI%#|HjeoaMA7}gY2FO{C#AA2ebhm z;=qqEZrh6>w>4Nk#%OW8Cr^2-oiEzltof~OS7!q}!wF@rkmLoOWej1WxIt|R!d#oG zdCC zVrhOozfXGDju(lr)s`c@_jj?;Thdzt@~sXZBR%Uk6quf@OUX1*`8gy$kK`p?A{k$qC7eAVS`wf9g2TBiD21mHYkM+W!V%qIECK} z!Ulu6RWQqjaWmFiXRPr{BNu@S#15I-)6=sLUW7yr;`HEbZVcfD9GYcS+z=GUsr)&s z;Pi>(yd%BF#f;3-g!Y{n7aKi+JJ&{Gd{ff!z{d;bgJ2L?O!4?7ch<(=7$4)K(i1&gr)iFN$xY$U2d@Nr&&NtLlU-BBXme$s@>_Bd*Oy_!Fh!5?XHqJW-_oO#x zgqxhr^&dktT?*a{;yp2PTpd1-V}r6y_r!zcys!<+chcm<7irq(OOMlrX5l=1ot!_8 zk6pQ|FfG(=;Iv7oW*J7_uT{nyN{fBXJ@&YJy=lgQbGNASFO zevUjY55@o(H#@%FRrYIjmd%K#?Xi>L&tew-bhaqo+Du9a#WgNb6yAD~oX4)=XLV1rqWIfjO|` z1$>T=cbMPGK<}tP9;Pg-dLjA*{SNeHJd&>mTgCM_^zTqJ;pOm0(F`z(^6E$zrhWaS*s4`2d$cRsbttCLDAXK*>{iShr7v5+k`-uwW*%)*8dv zJTYj78p5+hAJN5{$&fDOqlwMhKs+)6D%=4qHW1KXqYa!18lb3T0e!*pu+x1itWL9n z1%tuhu~r>8nTl|AVKtP>#(|nqJ-SdWL|QGCpp<3{S}hkKqR|Y(eo=;ISr5oqC4{Z- zkAii!KX@I|gkDb;6b{y*VJQR+=cVDgf+DoFjt3%tJ{UKQK(d_{EUYO8L8S>4VS121 zt^wjd)xcet0|R0nHvH`@*s(|n1c$|t{P0H<9HRiWgf@gGb0FP-gpMOez#!fRoZhma zaoa@D-Nu4L+1}87OAfj&k#MQN7}-}8gLc$8c(=|5wC)?hWuJ7o^)U?AM!P_Eaw&X@ z?Sw3GITYH@h2eb|dY5<)CZ(!^X0<6ieU=0Vu1)~$urcUG(XU9w?=FhgLl6p&jWP_2S#@ zCW6VpT?i%4VJ(^8q3AIjDUOYAlwh$oY;?Z^yZS4Iek!b z`YqA~M{MR9V^m=$gBG_nLyK-CYTwY0Z9IX)bt7Z+cC(sK&GvSco#wywfPnn-kM zK^LUgFGG#$Bha7}i&ZDj2etVQ(0ox9*V%Lg$l3?Ox6*9MUIabhMf|@3rz$>~xA&>L-@VTePv$c|Av8u7%c!K9t(x7S=9%Af< zpM@xxnd1wI4{XUX8@>jE@0wW(R}mWd6^C_$VlbXJ2Uq!;3p>^>$BsuX#gq&Y+2kNb z#m80=?|HrCu_Fse@468AlB#MYht;m=VC)S{O;?j$5Tu9R_X=>)l=VbqxF;6(N{lYO z8&Y_Z)Wu2Z^Wq#vk7Y8(b+GLdhpdDt|v3_)A}-~wci!%m-AJWvxN$picFGg5 z&D#L_$yF@JXb&o4B!T1faEA05%|uyp6BlH00`IOGV83($(#vut{M-=E?%Q<*nx=JQ zExY^h+PNBXI@_}d)5X{Ek_onW@B8oMXEyGTFI2W5+im<<0v21_n0Kw%l1$#GE@PN~C^3|6;l8IA9|oBuF2ANv2*3#8VyJ z**zP}fjh60@S8st8xrfGhJ7}2hKn~6)W`r%T~pz(hmYV_t9vo!x^POhybOfTWx&1U zJl=E7h!usOCoSxCh(i|%{M`&v;ac$=&X$N2eT}5T(#g=#ABTCJxlS#xi;>d({wCZ}i=7a$qydZGT+Lp! 
zSDmz8CC0pCy@|<9*-As7x>%eKS20`I1o^bH*sSV=h$|$eT;Gt`;9$^o=9>RhlE`F}9j*u;Q2I@;J zv1^{X#5YO{SfxtVD8M;^q^}yCvb?v_2D~EfA2%g}DLtQHe#B@gWou~B9scoO2z^7s z3o~Z5h)Zx10i*a9MH@Ea)EGV2(?uIMvE?;wvn`5~lBM~kfrF=uH*J>FuuSEPO1|Bq zsOMbn+FFvlb*!<^*=$j1%Jy;7BkvEkmTumuZjnR@Ecd7G*0N34b8FkbWzPhs65AZh zvb4-euJwV}+RCNMVJv%HRTL+WBJ@lFlvbh9n#TkQ8vt~i}xcS(0){_J>+XMb7Gxy&)&GSmKw z{<3P1mL#6TfPs5s$j!D)hpUrUIDPZxEoqQl>wP9oFe}9%r{4ekz8ULShWi?VE?2rW zr5NTO4gJ4I&mD~}lVA{oOCk?3k=qi0x>f1MNq8E6yKT!jaZTRwsQdjD%e!jwo8q1h zcjt-^7qqN<{rtiAU55)#B?2xbw5=^_-9#zsdz`H;Zr?1gWuIqTSJJVSKPC9a*}BrM zof=kYLc99?-Fx(wRC;vRmz~Qp_U*~DJ96Mc?)0dKH@c4;yjWpdX`Qc-z~9qDN2>soAly|+2di@TR7fD2?NI`V1&P{qX~LYw{dU#M zkDDffc9q2Mi=L4xGLa%38m1-!m8R$*_<$0%J8kmCM+;i&SB=OdY6sBCr|u2DzA0EW zk??RpiDZIscQu`G-BH!a3BYwYX6%bTP*oR>(Hn|27Yo;HlFAe$Jlz;wv*bj~Hp4-T(-uyXMP!_eXg5rD&%3KjKKEG8#E_0D2!nT{!v6oP*gPAKg6H8 zUDH?TBChhFq4M5Ix&u18NoW-oFOjnU z$dmkEdz6&#Oi`BArIa_AC;Mt*F0vGfM2qP!BMu@_oX9dj zWFryM0LCdH= z$&lIm!amB~#XQVU93b*$>?Pr8IJCHZo;xpH%FOMVW)!+4_0fD*(`%N|-Y(|hZmtW% z-CZTn>3EAr@!5a%Wj=JVX!%n{-L)Fy_GBf-ho)Bb5w=4eJWfv*wa9l0jPKibeYBErx+GBP|)Bi27M dDkwZm!({4YlgY*cM0k%vy)U9N>K}8?oasHu+wOkvJMWjo z@^QW#!?|$oT0DUxfJ2bBbIn&zYd3^~Ef8W^Z>Pc8a3i zL0QAR4BpdDZ%oO}PBmC&>U9>KT?Src=x3%FvvRWyLOQpQ^fpqK(aH;y;=geD)+7t< zxfS+W?l3LnS=j4I0UiG7T*&la3bpM#w71L#@2$h(`|GywWR(XzX!C-UVG>le!w4&9 zNWf!58yLUy08lx>V|xlp9%ew|zJI~Bu4)ja1;UQ0a`^I*11#Lcqk%7-3TaE$dFgFJ1{?~Kx~YLv4sp=sdERLp)TMg84H&)9bk200L)nt z3gg=|VS4^xuvzZ}H)CpHn|&S(Qq`mLwF>0cQUj6|5#ZKx9!&d3g5mE{XttjY>t-up z<*(mBSXnB}+U*J#l0;C^z8Br?jiBLNAGp=e3635a0?d|ipl;}ZqDVK0soM(j8egcS zJYma#23YjG4(`DixS`>wyUX5(RTCs2|5^h@KYougbNj&_#vQVYY@pPeg|>bBz>3SKL#mir&K2OE-H5xD&a8mI^$sM%9saXyVakXi?^%V})0!#f>!FQmN6yfDTHwTSHeIv4w{1ZE&Y} zJv=YTp@*mnsGw1!*^JTwq90GxBj1cxro2cbVm*_np~3ZR%E?4_^U{3cu7uf?YxwWDWA6exA_?%g?mZ*Cz!-@sZh7 zM-z*#jz0;d^;6LP!5z>bnMc(Yje|ksqM`ZXK-y#JKIj|v48EIL2Xv5{HgaP@oLkHs zoV8BWxz-GSZCe6f*PPiewoRkT8+=jI&_=rY^m8=*+H%a(3uO$su$cZvx%9;o-E|3}f$XD)cwNad;>mz0nqft~ z#5TSA*xK3mb?E2lB;lm}Wpd{Mih+X$y9{x4bN5gV_4FDx+}me_O6}|SyOI6@?~NKA z_2x zE8Ac+&CM}ea_8mc&tK4I;YW);E+|~QWa+Y^Pd+VPzG7v`XRB7PSzG$~I_3J`Zx~j# zanl!@x0F{@ZvArG_OGgT?5wWYwY#?N>pgqx_ciSQ=Gy}Y8=IPATMo4zK63Qf@wO8u zPn|w<_T2dk7cX7Da`oEv8#mkkaO;m&e+x_?zO7s@H^HA+6a2Z+1d2N^1eAy53XZbE zKt#`%)FwN_Kp-L2%w%PlExl5+QpqcepVUfLdQ>8FNCB|SA;$1xZVJ*d}E)kC4(?KBnxBluIYo>k^VR^gtw zMdV)9pQAXDHi&35GNBg)c`59nyC)vb*uD#~FF5|sWI-yDE9Bb$G35V7ApC#1vA@|( zk}i`=MM|->v%dsDu8`UCad8vk1FaWJR>`Oc`ST$>&rif(LeV^*$A_fx;XI$lho%W< zAy{BMS%va^B$@HPd&WCqX(kqkA%4Pw;VRWAyao91~f8A1}sR^an$ zvY(Ue7vq~8KP6WmK6*~V)VOrQ&yL0M&CkSxU@ufKf`h0nCe-p5UzO!iS^u%<4AZjEOSJ`1tv;{A;YQ)7&YQ-##AzlqlTqCV^y4=xs_ z*Ww|i3^D07Q!IqRM{0?@Ae7hbe`2IiZ(M9l-KFRGab&5Ay}e?Wq6^IRwW@x>$#A6LUwpA_9Ns;^s#-c zN?K!;1euFTmoqmv)#8(4GWqCoa*TN{^HOun8AgMPZ@>uO5o)P%$Vdir5fE zrHC{^nu%hgs$d0IBUaX+f^MSG>{_C6-y4F7o8*uEzWp}$o4NO%dr$kFdp_oehuqVW zAsIWy!JVNQ7z|*za(T^**yrnePfWR&bgVbyQOb~p1?LXQf}d`xo03}~*;L8^qs>Un z7$Kv{@ED$+4VcgaIud%o3IK#7F*hSj^nk1r=`o~qJn%H75z>n14`(;w z!Tv!pkqJ@JDqwGs_!*L2AY{+5-hUDE58*b5nRc-|bA?hN&J+E(pK{8a0dbYHaa~w7 znNj*PysR9DPL0_mqK8j|gToGz3w8Fx$oWR3N8yjC-t|1(n3zSD`FDy;%`Xv~Tt|WB z^#>3|p2sJ597V5^$PrUqGtpCIx%=-lcMzo`UumMD2g$ zhSkPlB7r#$&Ldtyn^ljnw{HTa4eBs){r=5r^QXOEZ!!$T?S)b6}{1Y--TMHVFKf&MqD}k`JAYvFh zpcB%_V+*(Q-ef7^VQUKPK2uHYDV&S<)mftB#`T1C@jGJPg-68H)5G{9QWJ5t`8WLJ z`8HGo|9dLZ!k0J)N|>Az1grX7sKKk3f!Wa$yg}a}^x|J^vjr4^#b;9pon>m^^)?;t zRHhL_=64ZB*{$$sS~FUzaSz207NDr`J9x?W>u{!@4G4Bno%$$Dh8Lfy+`Fk+6c4qErhRDj_@sWiTb*BIu%eIsaa57 zp*=0a2NfsY#zSSp`To)c!nc>Cgxq>1Sr#)NC%tq=SAV)YsKD5TX#3QKp&aHJY}ak& zyCrL=?S9{l2ASj_OYJP4(EAA;b(a>byndW6^BRU8G-^_Fb47xl(8Ws4~T_e0- 
zY)8e4s&H0Ii>CJ$1r;eN03)-d{6s1ds^9O#b5}I-*J)5Z-SKHCsd^c@lr~OX^N}ee zP4eU|x$lI0_U%CaU+#g;T?U|3iiHLq!)WuYOrfaxI9}}c1L}60K##oh2!#pFgcoXT z_;a?b(ToxuCF&lGMz2mqk4UpBF8suP z@MfIn73OWzJI) z7brV7Bl6H?lCXHOU8gI5iXc699cm@&`KB|!M_)X-346CY5&8x{126h1f6eeSP!SP_ zt1>@D^*c4yZ99%ogFYV%t({3YW|lvi?wTN!-MRxkLlcc-z3OoM`U?J{od%TSB4-?s z7(iBj$*5f(F@^6C;!0i<6!BwUccJUYR%5;F25psB z5fW+FqZIXUxSD^T*vT_cPu12zJL0zyD-Ya3?aFN4=p&wBxA0qDzFM8=z=IqxE1e)} zu+;-=;T+WW!*d+Ge2YQzs`W08LM9%SI9z+LlYh0I&N8PiSw`ddkIrI(OgBe7#IWhiAA;^!*}c~bCfBP(>%q+k_b zT4I;CdJRDjHt}i5TeFt0IoxHJU4BZcNMaRsqalCodJRKGnnYgkWxAG$XNgZ^LF&dK z7J=t>$qUzQ(w!LBd!w;%eWrn33N2L}NXr`Gvem@5=|K86V~+xt97WNF9ix3}!n&G@ zHfEbIY)X^bAI!+H4(lxOZ9ceZ&jjdhk=o^u6Xsl@7CGF`%kX|La66bmDjkI)-s)Ut}4HI zz2*qFT$k#C){PSD#kbE@7oN>DwBMlTT63Unn~7K1ytbO6^Vt?b?YXYC2QTiKsC<08 zt+x2`e!I0oeYd)jjzX7hrt{nDO0O1s6uRwoJ5qM7+^2T&o%SQ;-yB}pyg`53SBJW4 z0Y86*)HK+qQLI2+( z878hWkk#wUE~)6(Z(HgQ$%7p;@dLl3LoiSXZm>Kl5OFOC@U(y7c+yAAPD0;MH7S6E z9?;=<((g}M2fwgAtYUkWqyCI0X?!3}ks;Bc@%;?&F%mySvRo6{Ib)V6{oehb1pOnd z(v0yS;VFZ{LZSi~k|l~p_&0phNYDL{mK{qOVLX<6TyTchulzBOSzuNpeM~+(|AEmD`7^&J?V|bZ%!a zm%DN+D-W!!q}nPu51b?U%;lra%_p)O#~Ci1Y@fuSq-mHFnZnHnmhb8q5I2I9|hRn?m9tT)RmOqDU#XCl8*Ik1Duc6%$8uJ_3CTY}>P)rtWk^cu3wR8q z@h2U?^C2>1jf%wGQNwN_Dx@BbLh7-VldyZP5`ZURV;Xid`!wuuU!%RE#%i*;%8Uk= z!PiU%W@A}}-BM$^lt?KiyYE}Vm7OTI&=KfH-zgB-ZoZiM@7TNqh*GPixh z1=!W`01X5kPW&PoZOigS`A6E}pt>3zU3?KQX=7kz-Bf(BFb#gwbYjKM3})kDAJFV6 zMy*?p!Ij2!@VKd(35{BWlj0LN^V)ur8^>EvjQJ4T+#rz@E*$5*-qFw768r=A!I9e# z7>T%z4<=xrQJAUW4?xDCdvGfDC+}<5M{|MT3-JpM!M*KS;HysqOK>ES zU6>4whiBr>104D)=L94rg|T}FbV6(3T>NQ6Dg>uyL;IKgnZT6~QP|Qw;FtCgek|Js zOj0ys=f{Jzrjgx0<88@3H3l6R(#DMZ_-|&~*{e+Qp-`_u z4Ri7Cs7qc8OEp}F*DcNxt!GYx6B0L6z@qD!T)^Ua5V5aa()$uZoqv+6X7~uA$BkwB z&hvr7zgD1koQ+IS>9}iTuK*7?(B61gnc$1EKbRHoY_xmz3?qFDH0wBz=oWx5i^X}#2`$5n$I?A%(; z0d@Ty{_}{F`N$FdI`y&OjfJBZElwPxae!h;c0EB}$~%hQB$y2Zb6LgmE|}_j0V*E5 zIVQ9FW0hIWHp(|9Cu=b~U0x=Oi9TfVYC9PDdSCgkKI(g~shdPD>sqiEKFx0AXrgpP zyq zXF+Kl6Azv<3F}MnLnyxJ`1xc(&MH+(-GBUh@``x=f4Q;hS9O!*Cwt3TwN&2PI}kvr zQuqi31rv)C-4{!-iH)KXCujvh$R}Sa*@93jXe)(uL8uf`D#bM|G035#6hX+MJ(+uY zG82bRia>#ciSvg?Ma7drT5cwtr_|CmjWz^Y5OlQB>giO}3(<5wyF4~Gro4D^jWIo8 zwth-M71dkwNPP2Vkw(Nz5kW{0MBI4dkm|baca{{#*^C*Lf`RTA-IH{P=soj-Nt5V& z0&U!>i=n&YnN8NTE<>3)Jww$h_S}v!$Xrvk(`>he#ArsujEGk1Q~*yu(9Qznk$|AS5P$f(rqWT|uQ_Nl?~+2r5f!l}$j_1_Wz0EEbB2xD*7d zsGuk=c$MO<1*{70A_^*kMas2Ww_00q7xm5wm$rUw|M;Hgd%io#nVEOyeShyVlPA9< zP!Z%Hrp1BcnJTf47={8^ZgM&lWX{bGo_gdJ#2!z}nfWUJ;ZCpEnCX+;UitY5M9Oh* z&TsJPSsf|{u|#ZSE!HD1j+kTO43@mXJt=Qc8vq2BnDlyxqM%Y=eiNC9X_*sn^kh2PFhRvp4%?4vTLPQ`HTgJj$4pKpTmbs4xtwoE!Z*T9(Pe&E1bybMrSMTBgbK> zcymfLyr|z!@hzt?yEnAKzN0VEC{SUSc~fD;R6l6EFq|?h zdWFoFeFeJHUctZS?gPrzo=OqB18-pg{mr~$&darF@Z*VG`10fdX5+RG@a8%P)L>Ch zjoe8BE1S2M?%Im-Q+Nwg26PD2{FY>0xayWY&u>`%b@P$NhG zDnflv$z_YE0{i#fD5Fn=5Zu2UnU|h`+kVX`eJCGAA8}=_+1aDj;m*J>F#yJL4z`$5 z%S4?DXM7u_u&#DCT4&9`*vm%Dt}}1Zo?&Mw>pg3zPdmNf8}%&Q;OB)ZzMfC_i(8qm zzw&1y4kYST?x`{KO9(|fGrI8z`6%w@cuUmZDx(zRm(aUY=HSd{Ug*-R9{q>8r}+78 z^Z5Tj4@AYrC%C>EUE!B+dy&572IOG4mZKALpO@6b<1guG;O!0`i5kz@;}13o_-fm+ zoI2ea{9TVZUOLrKKUJ_Fum9*SFJz69Nt9NCx%DD$29p5?-mb?Ve0q$Vugh@ip1hzl z4}5}J3nqy4Zd*a-B#_6DqPl8l(Im zUhEw!xPSYYPV>-Ha3pXOw;&>NMD=?XF#hHP#$fzfRKaz`#>T5DZvI2?-joeid3!jj z^bdKdeHwn?y~%>(qq2rD%Y-n5qNy``I=D|lfSIRsfuHJ9xNU2$K*Pi{`0&~`YX1~7 zUGr6?wBw9xRHcPC)!KhZ&*fl|-WG{F)9e||F?K3I9n)qpDI2|IGw+sCv2lQUSI!&a zF|-KdFOslLHwMpiZ{d66z4$llD|P;bE^@-n{cv9*1A>1%VG1tAQ3a}S?%AAb5Wats zj-EOU_jp;Nx-UI-ZD!=qyuJ>K*JZ@#-uQ~TTGNUp2U2*$tE<4|kqE*oTd1>LcAPY- zg>m;ar}tiu~z#HQf2@O_^!)y>LWE z1ikMwvGBrZH}2FJANmTvikteP3w3;xjZI3A8aiKI1s}KXH}f?gkE9z;Qs2(3qQX|( 
zK{JQbyeY54m^F>}P=8q}on$Zq;yiYugy{#U*yBKz{$LB|7nkEuhfd)?hpBn*hZ~`= zL_W0Z?m|lq3Muz}2d>mbp<>IrsGQkQXS4L-R0oQEZoFPp& z=Ock(J<1iu!=;KF)OwDoD9_Lcm85?`E!ln@olPs{j5`s=EC zM_o-|yv}TN_t!@_>7@WNr-&p=ip8kw*?s)>qkO)9kC6KHR4N+YyAhcND{ zpR>(y8j20lFy!m6MNfQM$6A~{3;UFKeqnOGUfJywWGb)S%x)jU;c$fV5Rp4Wz~w3aiO(w zT}@RhOYP=0705hx6mA$F*IpggykphIiO|b>=WB!o&F)2f4ehbhQv=eXa^k5wOsp}< zu~@y?_*3;Ar3T;^qPCP^iZu4m3EsIa1P_c1s`WK`o@)qHCDHlJQ3JH_OGXpW5BaQ& z`jO_BrFp{P%3|-fQJ$+z+;R#z<#n0URZ#P6!@M~9cZ#u8PhedLM4;U4Q%U5@k5Z*p5E@xL}|q!i>vTy{0z%CZ#tmn*N>Z}FI|`-E8KmZ2<0V`6=Sg#6NHs;vfub<3~p zMXz0ZomY@Zjivi;Oj1kEmp8~XzvSrt&{$|RQT@}MUq_!_GsuY7_51 zGH)q!jBVc=-zRU~LMbJ-GkOuJ>pz*T6RM zx&4jfZ_=ozX_P#FoQW6@5K8ZN$rnEi)(y6JwHN(+7BTA(r$b`5_zmMQg$^|BU}P zUjHg>S)mh1IcZUGF-Z|(n$T@?{0G)+l;0)5tlsaZBF18l z%9XXIrP>o>nLQ>R22aY|h~S&C_*%#RGg*-6$xLLb|Kpz&|0Op6|8kH2Yd1+!Y%kH1 z^CUlyH3X2ENQH`^py?ql0~brWm9}G>lR~LbC}xm{ZGH+xra~F3@Kh*b6&|tLH6>B- zXJ;M?g%3Ltd2l3>cACZ_ut19Gi^tj7O(K(N0p6_ak;(Sc*p5Ie6e_k;hOwe{Ut!P6 z36YKg4v`_V7e;$dP6!JPievYa0!e(IEFhg0&qE7Bf*@@GPdjDz-my;$4{=J4_KHC3fD0FP literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet deleted file mode 100644 index ce4390a418031143882c26e15a5cf27824bf0a3c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2745 zcmc&$drVVT82`?>4+?@H^wwLfib@MQ9t8!_RrHpZKt&!R7G+py1+=u+wuq?nITu{q zreNYjTxGdSGbI(Mx`Pg*l+**{We=J$HFYR|8-*>*} zIls18DoqhsAwp1F1S~)#fJ&Eks8k>Hd|9>tLZqNc5O^Zy1zwj9@v4AK?41EXa117# zM~DYQJJJcPHVya)tr%QmbJ?rwZQgnTYk18%q}hZF83xspENK%+C?H@UEP@OGWNj3Z zEW|wxO=KAeMkbaBC~^LQYO4kDL7bv0AQ4r}eH{&n{n1?_42e}@$C!x)LmS&0e0-SQ zUE9>?aQUk3HlHo5flM0g)wOPCQ=>~s=OWTvLWWkx@E+Fv1DB6Ya;Hkzud)a)_*?J_ zdm{?xzFY*Jj%oN{{XTZ>uDkGi#~Acp;--G8;f1hc*ba7f^j3(i(zC_eZlKQmPvJtw zdUkj9*?yyvPcbj%kA#$S-$En%DSmh3A#^M5Bu;f-3tD*Q6!fgPgg$xW2b4Nc##`KW zxYfCx369U?cE5H8I^MsHhJyu<|GjrJd0g+K^K{HU) zMi0^r9x^m$*zggtdP7|N$Wc!vBqk*rQ&LBdNlPD_F>d_RrU{u7CrRe4?3~=Z$@v9^ zmMKM3i%Uw&rj<{xm|?A)IjicK>e;rMTDxOTU46scdCo?c8)QLN%@6f(GVAqa%Ajsq zKx{V)a>T-xMPnD6Wq>7|49^O3!ct;j88NWDc7-EMhLvqZrcWYl6;a6OltK3#HM-K} zHe8O>3D^%Wll#8V;Kl|Qg&K#KbvAf?K@O*bzFzWF+d0j({+e^4hD-eotqP1iT(QT# z#~2~jLa~T|PK3!p5rWW#`y=T>A6yhw0g0#z=u*6Sbe9;pkChlPyy2$sB)AFh1s|uw zRp;~L=f_BM8X2yt4Lj8Ya2~b(1fP#fP`fq_MBZCl?`X6MEP+`)eh97gI?LXuNhEhj z!(!hm3%p;sFCJ382lXdEB)r8$PqU)Z3ySENcnI`FA5HAXQ2UDGzb6YCUaQku{^g&N z{|VgxlN#vBY*ZnNpf$mL#c!d?^)5QnO^P zlX4}gPRgoN_RK_K3Y}$1Qa&BYIxv!zGWAjm)Q~iF(NoFE8Dx@EG?}imnkmnrjF^@r z3uR_2T`F?PNZ0dfQj1b*DyBEtbH~lI&M2*?^2TBkU&~y=6nj~U5fTI;zddD2WqrKJQc99hYtaGpexQr| K`zt1YTK@tgIEX(0 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909 GIT binary patch literal 4341 zcmc&&X;f3!7T)I$$xQ%>l3)TT$Pkek1XMsuf-RpYYQp@b*jap*jBCfX$9ro8;Y&3?T@$CTd#X1XWGNr-##a~U*ak8G8Yg6 z2f+*v0VRMj09whpi^mVWOwzBOKJL|&T>m{MUw@Ije0LtNw8Ca?-;#A#&j>)k7YL1L zd!B$LU};!`J~g5fQX|#?z}3a*mO>cb3*ru>As|Fnz~YI7oB&y5;F1toY={6;rBqvx z>SZJj5mSE`N@PUm2m%b~hDUb*@|>rB$eBRELBQg$9YBfJGQu0~7?+Q-6;dM_A%%Xk zQKb?5d6icFh5(Zz=p;TM&{qnKS{5G>6dAW9P!=as3Z)03flHP}g~vxk$}#mL)$ftG z6&F9pH2#Y%{}Sl~bwR(nhu{rMen8KaUOi(uWvql1RvV$JO9EX@%fYC`5Wc&m2(Py2 z!IQR0u*`su4jqnyoW*?5+t~(D-|7LFs=$j;807vK3VtWv!Oi#OzzVj3!#*M?ea?e5 
zyCi5`j3Tx|WCP8|%;4f4XV8u?h8J_*qpzw>AYVg>CN=5;@zfNkD0gs3C7`2P7dR8O zK!xuB?JIb&)z2EjvK?Vo*9usaqzRk?6}UaS3J#1)gz=^|=vtKq(rKyyezptfG+l$} zI!lQBNfjDLErRXKH6Um3B+M)ff~7~b;krKy_H`Xc54i|xuPQ*#XccH_o&ZGg3^1#G z32Cl6FuS@0L>1<+AJd0o^;$?7s)k1}9Xg!_*uzb>uw@P(M4vlDTK{)wMf_+uM(9FB z8VB-?NN7880*q3e!1FB&>T)N6IF|)S3uSQWzA{|6LBfr_X2`9)1axDsz`z|jI4C0YE|%1O9op94L6=LOB8#!n z*xsl>Sou>r#?iMYD~j5o`s`bz4IY?Hs~IYHRYG%H8lg!XgU+RPVC(B~*pjpm`_cL( zCOqnl?`u|s+JkLyKeGUaa%1obrt7e$%hwU1`Rc6x^EJpk^bD~#jm0AM{jrJmHRQ4j ze&n8(XHYlG6H$G3n4%?y#|s+3S?3L0vgqYLelmsR)1S+!$!FkAg)1oe&V=ys#@MsP z^^kS>41Q~#J!Cd7$6hv&=*FxIkYBR^)oH$jXysF=Ds2XgpWzOTUyZ}{Hl6_GnFH|c zl4`*1%%P$R~MOXW9`!f^R zI%x@5<=9L3T7PTOsB#+~Zsv<$gjldCS_bQ$xR7H~SAo%&jjYkP5qkMA96(kEW}i&Q z)mABjuSyYjP;j06;>0Xcc08O{T3N++5A#8X6YgN+#oFv8qQ%_7>mpn-Ihm-4^2ZWi zJEI!|_ePgabips#x(LV_i!`^Zx3S&UDl6_9dVogj7a?Yr z3g;TI6@D+!1Gg!Z?%h5V!ZqSvt+8e=*tVV*q3|VM`^XTzIUPZWzPbaR)6-!7lb!gL z@9I&z>@@_306J@+gqk~ium?FyM>&q!f|We7z_`C$gP>opqc;!32&`<4=$`#GVwC>YoCXzuo&#vR$ElRgQsNIUPq4p)CAf(60xPNJ=84Mc&_-H zyU;s6VPf>WS}ZQ3ioJY?COLDtGbT%r5he|V{Ch(eSe$paIkz}v?C4*+QRk^tOk?L+ zzV*$`u(IQrR(P)}CM-IS-;c{DeX}2<`Qr%6{+%zmt+5{s7Df{hD&t|lZ3Y=O=QtkR z3iv?EIXKhjjP~v>R6aR2hN`Ud$9=;&(4kz8QdDwrhiZgze-M-E7i+LN?Uj7+KF76s z#b+(6@jwle-l3e9EGm2c5iH?c80$W%h6QkS@M~#t&{*>( z6cJi{y2IapjgUPFuZEb_R5rm`2^b}G*q^!{r^e{}oZY{EBb%q~Qs_{!Ax%MO5z=+G zWMhW1mNZi+DczK%qVILYr@1sOXRKMk#X`w}^v&v1V;*-kAIR9M>9BzkNe^b`>A2+U z`?eg+`fQ?SsY{WxEW5zarzWJkr7UNs>C%==k>jDJ@wyOszi-Q;QblrE^ zWm%M6E-+eHpUjzj=xUK=SWjctS=q;osD!C`_rob1Lp>PgBm6&+j!Q54?S+ zF!k7j=1mIf_PJA!KWxiYw_U3^t)}+>SRy<$HjW(>9>mZ& zn!Ba4RR?ud-{^|J))u$Q+3g>%*^m@`7;@% zt>q%5Mr;@|`sY>J=p!=v+~n`bsPYGD8XgoG95+m{X*$}E#P^KF>Bcl2{d4%g+4Qf_ zigK+$$ckMS5)>g55Hu6l!~Q_%^)x(=syNWUBftt1eq3?^YuNZ>w#?Z5ei#Bw!1AzT zya}QFU>MSm(L0PBke(VHUyF$^-}pblf+{7_5PAHMe{G2W8S4LkxxV6m>L#h?8>muZ zitgxdZIOmhQR3w_$J=K3VhQ&Xri@rgq!Ni_F8yZ2T_Q=8NP{I#5=pScF<8DPr442= zBS(qEmFemENKen?!&oK)6G$>=wV|o$RC+LTwhJ?NOk~=bOhbp3NIaNE>dQ>!dWjh` z4-2rEZ64sgAU@D(T9~hoR|unz@TBuwwSsQs@f_tr=p<+vjwc^7x{v&0{k*Lr1D%5< zGR9xNJq?Ez-;uf3r=K!&H>MeeE>C?ppD=n&8YlC%it_cD9p&dEhfeP|cNm}B{EzdY zi$%*H@?lCTjSh5P5XUGat{#lN+$eEm|GBR6deg?8*{ye^0C8{rgB z?<2>__|x%eSLSGjEkURiEsI$h9~5W2EIQg)784VdsFe^D6B`;8sby|$Vs2t4^3VYI N@PbAHz?1&N>Rji%PXJ zTG1+MZBcP|YXh-MXq_I-X@yFY6y^KBT??a_d$%#}Muh|Z$YkMf;FxhU6!!dGGAsT4M8 z0CpZ69R)&oPP>(8L_r+_xszIDm!&i0Oqi@QO%^$&pj<&k^lHUVDbq{1+ZynchDf#Jdhj;x68D2@m9tjC{<8Q&A8y z7xk?YTAL@qZ*5ut z+zWuhn23m30xmRo!>*SHTwEh?zQ75KGD0zBeiVka7GPXi57;er!PVq?tm|Bg?g87# z>3R+EYTg9b`El@SK8>Os{b9JTL{sN+STae21;6e@bWJWMZt=vKOgUD!enoC{BxpR< z3D>%~;NXFt5LOLEU}GC9;=PdEuomh~L0H52VpZ2h%zV~>n;3#Cu_AZl{Wr02xGU6O z#-ifx_oSet3$_a0&{x=@+E0d-?c3ouGX^P-<=9cx8`>&4w$$iw{JJBKUXrV|RGpC`ZVjZ4MJA!4g8?ibe95+^QI3#@{JaI%E0^L~RWaW)pq#rmsa(jf?kc}~WE%;}-zU^o z$mKHMOs-e>Hd*%34B5(qKVrwQ6r#8_n6vNC@g?G3#CrXLB%(nJXMH@ZEj* zWm7yH(xXw>-Jkn${2t6Zv5&tzG8~l$CUI?hW#rD%d=5_Cr;)(YdZXPHOaeN zC8X^q9?*S(z_*6*?$hlVh}`GJ2fUxSpZ^`|1a@UAfjv-4KNF9RbmE_$Bg9Vj;1*r) z=LN+e5RMmQ@(v2Sj-BisIy*XbadvSPm2N7vdsj`j?mawudU|>LXnXng?$g)Lzh6LL zQ1Bc5LqZ1(92EBE;345}MM#lRL!)D2Y0N+NGZ?>r=CQ#iuJ*)vjK%_Oo@Lf3beU#=1?Lx70U$x%I1U+Z%W6{Cd~!J$su{ zn!nk%|G>dRhg*&uJ$C%W$y2A#oIQ8`!o^FMuUu{Y_S#>y=TsVzv*LMNdtkKF=NlNd z=bD7Ve3Q9DZecD1O^T5FdF3u=cNn6HGvc8zGSJu5lT%31)Wmr>p( zp;Fjn7|OqVN|j$k`MX_Thw_fkF~2a^kY~0wEGy1mbJwnLr#Szz_1E}(Rf6iO5UAv) z?8&)$ohZ=0=EJ?v9(^=PTU;V&6GjN9YF}khZWV6dhM3&%^%pr&o-~+g3o6T9uolWu z54Jn;ai{h@n0<@mKPC&Mpw_69{>Q!hPwJ`vmlv!3vtE+&wo)l*70N0%S3s>%*-NRZ z!_&g74@Zs5nJ2|aQl1o*XE}?c0>juUN|NH)O!u~#?kr0a zv%m~V!{6x}5HOG~qLbs;KB|;%=}=0y?NdjMV*3O( zSW~xzZjEOOJ&QE!(n5^sW0H*-V=SptelTizbbn 
zW=HwyQe>(1o-DyKEh-~fDrr@+Bxyajq@J#iDWQHWc`|)!pQ^H}u}VVaQKT!LT9Rw_ p&n_zR*A*8VOFgFL7Mt>o29Kc7enI^L)kzxQ`3Fq`kV5}0{0G#3JWBuo diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1cc7b2c26517de57e1a2b4fe464578f4f87a314a GIT binary patch literal 4035 zcmc&%d011&7N41WZ*oJ3AQDIb1<`~>kVV0zh$QTbia~{75!r;u7Fo3_1r&`tAd0wy z8>neokpnB z0fkXQ^6y;nEvOs;7XfO~q&5JQIZytOGmZlm1Z)=52@Ip8G-;?d+T^2bdE}6UN8-Qh z=uj2>^_CL;UI9{8(1U-dz?2gnAC?>&5tkGk9+DKokx>U(cyL&JOmb|TvZev3a}Nre zG2v5W@Lz=dWTn}<+?~2U+&hGFKD9CRCy_0kB``%5^+^Q4X(cbuQ@TTS<#xivzN(;Kc(fYU082F&6&DOBQlLKcr)`4$? z39u=>8)ejB;Hs51+!AOH9@`HhUb#EAuR$HKwv7PQ2V208if!Qi#sqA%)tAU&@)Sar ztIrr{t%5ePv-sXD27@pSLMDz?L|9uOvAgjJ@T%j1eQ&*x`b2~!b8CV7nAf1)_Kw&6(US!=U|q*q>^nb4plDcx25Sj;c}5$^t(pf<8V-VL z?G#jzYsmy= zSM-Qo2j-)_)i$u!qy{T#e-A%De;;!^JDNErD+QJ5wqt2Q_Jl=QHWp(o#m<35Fts2I zeA(|oXr(U$7N_eN!@q}c@D~g$?{x>()2Cs2%hW;PsYUoZZYOc-#0(;&GDfqgtU}v6 zIshI?xsFB&jhTC8rtn=S9}`VjftSVyq10FI@ba%WhCj({W3B00$T|iebMo|Cm|kfb z>O0@xgu_h>ppAA8L(Tso8GD0dt>~#GOZ`T}x^Jz~{M~F;!gw=AwMGTHc0+(xx@ZrV zvyY?g3;Id_Y%vkTD*^(GrOXr}1ysJ@j^;0KV6M?17}YP|;Hj0%V5edtNAtcpNSz{K zEV<fpfwSlv~ zyP1lR&{4&onScp*9up%b4fq7OL!zbx;Qa+f*@*SAMZZ`gM z7_mwn48w4&y{w1%(jO2D#Z$qf>Ud^X_IIFmQak!4rwcpoXrv+7l#koazKZ<>FEIiI z-l)jL0~?GXKt!{?3%E#0Xoes(aN;hC%^ zy!HdQGb+VI?pKYH)>beVZ8s!b7rCR5lo0&rYOealG1HhX5uW&WtV2xsn?Bg{bviQ0 zuh+J}vI#8dK5pbCm;iYNt=O44hcM~-2XM|voOJw`l*q0dfbVw5@z@cgK$Ps^(qGS@g-oXG&P@4z)pFKyhZ`mN#k)yLBj|=-=Jw)#p ztYOW#p^iOklf#iWcR@oxG3pBv;$LXbU>wkPg^`jp>Jyz?MRK!A3vN2|kFQglZaojD zhfh%fZlq0Z*l(90bR@uwVUXg|nK)|+KqSv;e|iQ+YMBPq@6TAp)HL?k;Z&HJ#pc;Y z_0|`zTB~6sR`5hctJjS%l@tdw6lHDDwGKVELv&!xCjH3?4|*F8tleVhlu7c%2NgME zJaSE?jR)6lHSsO-C=ef7zinJVRa9T&p$+*~3mX-D=ffKdCPZ}?OPdaF+BHdj>s*2J zkLE%Pg?DIOl+Swz20*S74uIg^j~c$%Rd%epUKj9E8kHWb}rX!ertKb ziO5Sup1a&CcAkvxt&&}9t=M%cUS8kMa#uws($Y^)PjAi9pK~;ObmDz2GjSUgJ86K5 zUA;M|*8Arh4cCUJsjqi0&Eup!J>|8Rig~YV*W6>xY2H&Fcq-xXz{_#H$3xq4Eu(+2xLUpV`>NQ7zgS*BE$kHLQ6`mj6`M!{Y77t!T0niA5sOZ0esTZ;>cf zB#sohiA0ei=Sbz9m{OQQpPWS^PkN{Lp`GHC$3)r#?MO81i*Z&~lc|I2Ob?oOPNi2@ zdZAp4L_YK)meQL%d|}T$=Zy7gg!X)K&RJQ+MK|5P!RPX~4|* zzyM|GRJ_@$`n=|RoDbD3ivE-j-Ab`M+vZJ(c~YnoZGA>qR0L z`sf}*_dp)#Hf3?B?NX{I6KN%?$*FYzsT5U1q3*;=ODc1!I!Z1TiBnZ`QYK4VQ92@J zCsjtuB31Xw(xtI($y6L=owPmWpE9LKqtXOCBY8-|lH`aa%P_gzG9)1(KGi5CA|X*0 cA7^A^Ki+1%HQ&bofDeD5j{5cLOZ|NRH{sIBfdBvi literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet deleted file mode 100644 index d0c68b087a629133e6561c3090078ae686a5e13e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b*Q7FE6@^QNEq`gZ>R`3g|aCYT4}MgtPeo9DQ+{t zg+(-)VKl;`G2&~^u>=7v>X@RNuf5o^M7C^G=hP*bP1&Nm_v^|-<=-+-+I!DE?{n@w zCtp5asN+~JgbOopn1fINiOSfGOTsm~-{61?<>VSd58=d|Sd{=#k^oOJi~)c&0+SUZ zghAbkR2-{}2XTl}AuTgG%%xV7+sa`%mNy}JGg2q2CF5CAYDg%+VIWHg*#I64jU_Y+ z4~C|Y1UMs@oP_%@l_>ukE~eq$V}`?xl9`?5-YSd3Q))JOOfnlmZ06E(r`=oS5EAG> z@*bqViPgujYQ)b7Hjc+frd1!ry%S{M29Qwn9_F?WWo*cUg@3O{?bZ+3#Bscx@*G-^FUthLQxhkuZoA`!kuAG&-FT+LKcD{^vwChmU%uc4Q?>>6vvnx!BZ(7idbv1b3(2!3i!U(YD}(8yyo$J+NJDRRxMf&+ zg$kAylYr-Uw}@=ay$c9Y!T@4}rqtIjn-`^cs^RUjQY*p ztZB2i=Iz;B*;{#J>7izO#=dVI9orqJuHRkV(^{RS3Eyyfm#fd7)N)~uC+9%P2mSkN 
z&fZw%8#q*Fh}m}jYWu3s9bK2z=O(B)>~G7gvF}`3WpmILwYXWk&Fu-Y*e$dN$b%n1 z@=H?rk5YBBO#PdPHtpzK9{H-r%#nWD!2}*|)&NH#31GB!0{94(D35N|{?PH7b;5A# z+AWR>kAL3y+x7HBb@N2ht`A!u$LFIGQ#QtfLhLTLTB=MOOU6MPxX!A3^O4jvq`G|F9ITY%GO%UeEJ<0r^wN<@s7( zU%_Yad~i#^ORNU5ULIMFUS_^O^Qt*Pq3H!p5;@6^@Qu8_)_+ezuvCl4@Iod1NO7+s83X;1+S|Dc3yDK>YaWw^T+S=c|Y%aInTXb_w~KM*ZrK2`+Ho(t_A`^;3Eid z5=aYR6o7t1MnIyG^~8rC6&vGskz)kU);Sbfn--)Ve;z&LH#i0zHP_R@#J^rV`m*)S z^ZY9d*9btMAW+m5@TfnAfT3&*+N>`zgsd-K03fA}Q6&%Ic-mBB&38yyK!}Wh!4nDD ze!dX_OM`u5f(4iYPoW1XJV%;oNR#zX!9=>$u0enSRq&_^fa<3pfrk^-rD<3Y2-pa4 zHj@C7%K`@VNK+4&%QE9p?zG8`RCJ7Q2nbnUW`r#ChmQh9>aU~J_+AvXq5Lr(rx0&-8#5=?E4n1awx62ey_P$n!0rV>u7Dc^O0KW-FM} zwHy{DssekbJlt|FheI+6prTieu9Pby^~Mt5Z?*^Z#w!q2I~5{cDnNtGBG|D^8MX}l z2sYV);8QjZ+7~n6Kvxxd%t26dSqg5;$wPC~SRnRUgMQ6hNOe#L=ZZWKl^DQ5OdIwo z)xhe{70?Sa;ijDcd%WHhGUoC@bkq(~2cDqiadJ>ejEC@4He~6N&~oe;=&rT}myZmn zP1gaL#_sfC@!x4}e>kiqehv0L3 zE9|x_f&;V7;qe{}4JJQ@iD_eCT)83q@_H>CzNH1@BSxWj2Y*G10lg?z8$ncrI@oS= zhUkj_`R+GXKS02 zB{>~Xapof$2TqvDIek>@z(w<#8=z4$8l6q)#Mac|kdf$(b-LNW3*)z#u*?oW&?F7U z$J|j(K?^)c+XK~xZu!qyvKaJ!A7pAW|4IfGzfa_@kVJefbS4-+()5X*H z4L5U0Yg&fAttZj7Ip@L5OoOOZeG4`Gc&t3t8dR(uq2cNnoS!j>G}avhS(`z4w6p?n zGkrW#U;(VSG~%Ss4#vmK82GtmE$nHnB6AOUWBY3iP`y?iKIO!(4RFESNu*I z+=e!xr;cY)>?k%0IBiCrj9!3|+KVwQ^J>!nycd~Oy%6bKS0szuKBLmnZMbgfcD&xv z4z296gIzIc_~^ymc%tkB6drO9Tk&!O_H0=)Q9H*4v7VY?QL-n|=ZD43(mxm<0`T)2G=k6rJE)$~q8A5Mi6qN{hoWo9bO@86AIdQyive1C(WU_fUk za8XmI2ljA_kBrr*3=I1r9OJyd0)fA`qYsZm39N9X=)U+J>7>v18lsO@Rrir3|Oo zMXac|No<#a8B#rU`5bbcG|1ot-d1{(`Q9B!A4?N>UK7de*nR`*r?p{?nH_kgxtgrT z*4>2RyxX{^d=6)+XbGpy)LuYZFH)VIBrDz9#3Q};O0hG~o#Vxm@sLpU9g2xj;HbKc zWra(f!1H2qj~2Z^r*yZMY+{ru^Y>o%XX=fHp}cV-*{hQa(y=c{^#=y_NQCy8|*@6 z)bHKs=E)RHdG{H<@%62+qO(#htVaP8=CtAuVzWrk&3$P87=mT~$&=jPFo1@#qlj>M z6*y?Rkqn(%g$JDjd~o$yIQ`TP?a$4Y{c&_OtF(48?it30PT69#T0R}Os6ZI!xhAP} zp&FapQOXDRvmB#$e1?Cu6ROwwgtjzqVr`yZgvFl?Wjr7iF+YwvekC;)Ztg2V9U*>D z_{l~3_RkZr2gc{nHPKq^;#!f^W<@iSb1;pmmbF->xn~)Y;@6?I+z_~y_ZZ*BQ01ob z717SvAMjNL_t3?d-HeH~u23Z4vGb&BqzZa-fK~QcuGY1PHHEVf^}l|DIZbSa1aq#! 
zsvQE<{o6Ba;QRp0xi5`hYOW$=_P+*IH%qL0u_p07e-7gi-v$M_CXv)NPu35+A zjkC|T$XlB#B{U4~I+M3Tr6_l;F-H z&$EZO=1z-xd?Cl`NX9z3DD#lr_Hhn-tc$Y> z9kUHXZl5du;mF*BmYWo9N_G^x6}c9BwUq2Eb+7bm+hbF@tK9oUbpP#^(%nawo>|LQ zvMtN5^1qO!s-amIv0wV6HaT35I3KP0e(k~lm z0>8g|@#NOk2eVQTDKp;Vr-$P8Y{BG!uAd%RrO9bH7kG3$wn-NmMf7xaJh9t8)+S@T zXJ?PY4xI%R0o|Rwj@kYOf$asJ*M6R}cShXHp6+W;=N=TN%4m39@0(ZPuyb?H6ia`JRy@|IXju(R2wGvF_4=qUANl1 z110kxQJ@AdQ8IB+gPY81FT8`2$t1XCND+A@YRx)J0^OcN}qA;=PP?nR}M>?)bw=}IOdvfHK`rA zQRZEr#5Mi#d3R0dol{#)kH5UvuySyiYj)z7N9VRE&B`!4`KI@Bu4%H&%({2|*Gs)m zXUwes^~K%#m7itI8$P`5xs*F=n|b5M_ff;|u{ZBA&6+970&GNI*nfK!XNQMG&{`uf zh6oFZiDd?c1=81a>Q0^_qp(X~p-5LVTT?TgNsEs^fBpXU;_V}F{wkeV_0m8hA`l}f z)zQUe5&nqHGxPgHL-f&KV9Nd~H>hP26SBVe&~oFiqm}_W%3r3 z3h8btCjZY~C50Me1)e5LVXq+{K%^{`7Q4F6bvGG#vxEl;J=%=KmSVAZ z9`(@XC>AG(ErZ0iVsVhzDo8SBNjc1+TUKJR16@=7FE!PZG-K!pbRhBE?)l8_u3mr(Z%w~eE4Bs}T$RD5bFeKaGUAXJO;jb0HK7(2;7 pDr%B%baZ5bT6|!1Oh{yent}0TgUR|LCuM-IA7~^1T&Ul@{sjY`8d(4U literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet deleted file mode 100644 index d10d236054191cfdbad484e674d3f07a75c5e0ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3413 zcmc&%cUV-%7C&>#?xG+ddzW2Y5Ge~N$U=lz5V=&56%h-?N|Pe&u5@DoH6*A6BhP{j zG}u5PikK)x1cSXqY{Zw?D?VG~8Ka42?p;xN>L2g>-uH4on3*$k%I}=H+}|aH4Rv8C zM#&6NF#-m90!Swo#%8 zr>#`fX$1h?YzRDx5J7GPMKh8!l*|nTMlz{hbd)-FNNWMT&5R=wqQ| zju;y-1mM94PXJi=Vg{-(1`ewwg+LDhsVDRT8lQ3^&`q367qn7Q=ZRE=UfW1;1@G_T z!`BrM1bG7#2dOs|*c-%ZVlqxPTW$guf-9k{M;2H(SECb^a%A050pdl&!MfoDXm<|)^#cji z_m~LFrpaOPAK!sraU4wDZUrYN381uXC%V=RLCx{*aM9QVnw$CnwayomHBTTf&>DiP zHh`?c1=bVxu&!4P%z0G>H(@B84`7IEpAUh;QDTsN8vuFj*HKc2G3=mhASF)^iX3TZ z*|iHC=lDa&a{=rwZ~#St0JaxL!I8^`aOeyTXG)Z4MEM4=NjnBFX8D8l9cMTjT?Ch3 zsv$c)42tv0;8o^9SQ)StN=JCYwbcZAk#igR<(q?5r7JvoJRde+umc-)Pt>vgDKd+_ ziPG&6Xw}x>za$t^tHu&F2|KBR@|W;i)c~~5skpnyDhM|B*-T9}D+QZLwp2)22dWJ` z25keEQWaBLyZ3cINq!aR050cmLJD<^Xjr}n{cd-haMiR6vrnIdmU%y;qZ{raS5qxf zqKSpyYd4a5_MUXbs?$)l|2eV(6*2e+B`OazK%<-Mp+S*~4&=5H^J+;b%$`8Bj`4&0 z?oSBm_5iZ9Ngv#Ig`t{_EpR!1IlL-JCHpw%5FP^u(TPR91nmc_kxODDRgxzVjF~&E zTd}rRw`7Y;^xEc!Aa+7@>GKhU{s4l^VCo>i`WYN?z0vK?-A`#T7{I{Ujqt1@5DX&x zAlbr+csRKh79MRR&yDef{HAHdlR6rm8F>giJZ!1my`MmhIFqQ%^92jvAgDiOPKpa( zAln7Iz|ijnT%A${q=%B!FkT?a$fx#BT_$+GG!1UG%!kznchYOhCJ-exE~w6~mi%PP zpX9{jw@J^&K0@ofOk$_=1>xLD?(}})13Fn5K^}%Q7`!S5a_$bNd*;pr$367|;|mBq zc}fCAZ35-6p`_VNeHin{LUcRm07~zvhhiH&=)I}q2-k*XhqGpM z`H@#>o9PkKaog8qT~Gj;)fNCN((*~uNk5Um@h0=xy1@%G$n8);+&>*S zXz&n^p`OFMSZ^O+KmUN?fg^%cBZEhUj1C<$cHH=|@QBC>6DLicGBqkXCN^$bd_rQ< z^yCz^Mmr-lEj=SMD|_Z=vu4kko0B_l{({f*zW6eK;iAO_UllI-dTG%=mMvegvUt_% zHEY+El&;_KP1(jxo40H&uh_P|vg+F%JF9oq?EdciJ$q~G>Kpbo?r&;7@I%YNLx+zX zJ$C%W$x}a`Zas7M-1!S_KVAIi&%a!{ti8hJqqX|oXJ4I*wBl>mxn!MoAfj-%Z>^Di9jJ{Ro)(#jv|z&}D-cuZ^IR~+~W z#%+8B6vkTp4vZGZ`}b2GtN9h9J9WP`>@(NGGn`P~2}zM}E@ucG#SQOD5PB(zYL4#W z(x~Lbv~*!yavVnAM&FK{`+}$~Pn}s?!G=dBky?h^6iI z{66VnJ6@!sPFs%j-rvPWZ%Gf+_72OBk)Gij3QUeu$EWM4{34QELW+{!idG9ud+&$; zNYW40DJyUTsUR&TAuc6~p)ljy67PIC*veJgO}sGogBv03+H6LkGw#aDoZZ`NUcKC|M;usZ~DaezdX2q+mj?|mrFzn zk)*X3epAZiQhhcwbX3@2-Ge1vPdRhq#(J|XI~xCT62!7utam)?&$98XPdvZpjSWU} zD<75(py6EM0jWdcb*c0@y$%a10T2Wz&B0BZZapO3B1UKkP=b`KHOviiP=}}>BnuzdVO=LJV}Jc9Mk(%lmemh-|kEZ<3;6JMlmpD#Tk#Xke*;p^o5aeVB` zU5(BXq?X#K)EOCZ=}s|Pty5HLswT@aGcGkPQKPnWaqsWaUnx_`0p9#TJp~Yg|DpN| DjS{cLo_o*sJNJA! 
zC*gdA8%J|~+!!s#a?l6BwjgJ6)wJs~Pv+Az>f*J1r#!^yP;%e0z;n$_KNtRC5M zz*%w%7f!~Rb0SV;;|We|uZ)82l>z|iU`%Ei!qgf{)sAd9TIm5IbEQHOr%O+mk*v#3 z=CE8QZ$|PnNR_N&-=s?ELVN@c1DVXp0zl|kE=Ws!5PVkC6i5)z5-|;oDGMg1tdERv z3B$s;2%>VX-|t>4*xnljOZ~^eU(K&uBMs zc5S7W2fqz&9oHd^K8qVFj-khn=dgRmZSkD8RyeV+3!T}12f15i;w>2o@VIs-C2{g) z_OEY)Lyb?6J!rA_=W4Vs*c44@u7?Iy7HV10ju+NaP&Q{OZl5?7?s`7JijM-RtxYoU zJQ|5=cAkLmN-E%aX%=PYT8MqTMlmU?tVOp^RwK8R*JyDRuI!?!CY`TRxh&n2B8N(lVsEF0`9398#j-;Ab9XMzi zIy(yO$?L)cRQBR6DNd-TRY~!~3+VkB({cWzK=k#~D;D<_osukRiD=FxxWA{(AL-l9XcVhCP12S= zr{H+_CUJ=_ZouAm9AWsiyG-BVTsE zK~!tcF>~*a%gndR{g{@31d+8z3F;U-naS7~q?~lCl1fSjbah$mTmF5@FkYR8hjw8I z$O)3YGdxgot+q-^e(oA&+%~Zb*C_q{cbXR5SI`188QVm*1IR)>`#VB28MwcU5RFn= zQV5t;_-$LTkYf8dMKx|)SR^*L4_fE9eNnNA!acdOaeL8+X11CV1z)vzX6DuzT= zCtqydwXD)Lwz)*_>Wb#{ zHVYkBY_Xn~byK4UxKG&T6P>1dbvMy>b}P{b{rc{;(3=OWB&wXW&Jg4g4*l@mfMV8WEtE(IHba5dh?p#uT^22edc2>%DEaE~tTw*>>z1&<8`pvNS8VY{Fa&4zN?nG&S|FznS?lZsKG`dJW?$U@^7e1Qa zvUc3Jp6QqC=TlZ;U0zw&&u#Xv47==|ceksd$ZGr*pAWnVq=?bPXy`t8NSS#3t z^fu1l5n~VU7Fp=&?<8Ry0Bq| z@GKwyoR3u-2HJ4gSYv};E=-cNlysrqCT7#ADcL#V#MDHgXpsulD3xF8FaKPjsfBHljOB!$dB`S(Jiov7#IYdq~hw3kW>KizpW$%U+NPSY^NdZ02FL5uTXJcJtazh z>m9J6cQ|AJrh;qsqWq>NrYGeX`-$5~{sU4Svr}EOB_;P&^=~lxI;_f44^W7*+z|+Tzdr?sIg!>!kfs%eXAwFaOf2+1 zk3&DA-&W9XF#MljK`v9;D7F9N*6L4cZ~iaItNODtNxo{3T&7~>KlZl-P}(SDd_=_f zNN;0eDYntB!tmfVJkL)cUtxsud_J#9;sba-iT6)3oM{NdIAQ0{^TEPO=-!o3hHbW> zK~Thx|6qu#s~6df4GR)z|9oK{D@;T+&ufKA6D^R2dtNQjv*X;u+~Ojq1P&R|y>!In$pRfB zOh)Ji*Ny(nCTE%)U8F}wbW~VIOq2mSu{XhpFJ#io{*Yji@dtl|P--$00;l8%7ksc* z;5RVk_u8KjY=}1rHW|I4=lLB^I|iJBJkd`?g{Lucg2Faam-T*b sV$RU`%*>&>tgMWD+q}fA?39djTQ|>PZo||{tqs7757g5D;pE@luYn)u8UO$Q literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet deleted file mode 100644 index d9326808f106cc0c8551356f9e11397011237c29..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2753 zcmc&$d2mxj82@(P(X_Ns+Pv^m0u*RMfgVY1OTkchDZNAKK`AMek~X0&q{$;mDFx(| znc^S_10$dVRlKOk$l(l>VmT~_Iw+{XIF61pDuM{kz$k~;?h64b@P{*we3R_%x8MEk z?k_2pN=*VQ#0qg%GC>qTq}gBo>h$ktooYUGY0Ds!{ak+mgeXCiAn-yY`SQASh*t%5 zL{(4=077Fi8F_>l-taEc39L2^_(-i7T4Q%Rs_X5(dI4*AO&ij@g$#KH)#E&A6KF01 z1~NpF34riMA<0U@(-cLPfly>-iGUIpn5ec|5bx(Hs)7cC3lRO zI54!c_mYo~aCmB)8l7%`wZrbWM>LQ{gQL3E<7#SjE9HENG+!aZCdP0Y>;6T^M;7^( zO4y^a3eRDs&JFC1Xq>aQ2)w%{<9+qJ+0{G#gx_|hqW2Rw^->Kfgf|Ai!Oo1?260t- zws`vu)RF%YT*z3*wpX9+H6r;m^L+krFrE7z8rhHVp^f{|tzjSI6wl?5`Dac;_p-0h zhp+sEQu@hwtH%MiI$mQ!<1@MT&1YcO;oE2kSn-$-jA%!G1S&b&35N_`bZqHaysU$P zHS?$8vt`-vbNY2G?#yMj9SMi@y%lKRYu)fo>qhu%wU-&3yabywMsp48`}3EN??I-9 zE_Ulmo-bQGE^L$BKg>1oYwqQv*Fc?!xNWx!aCjIhWDKoCF_?*<1@tF6D7i zOIsjee<$DfJVMuhC3nsKIvB@|Wnx;wq3rinNdICtTr50>{86FE(PieoOg5smmFW<= zF&en|Dm*N64_AAlk}K#GVf~&dXng_)!#|JVcAWYPz1{B=lkoNn%)$$~aKKxR4;JPk za?>wxLI;x|v~l+zrE=^dOAm<96&%}#Ow6C3A~b;erG>29I962-2s13lhp0lsdWA>y z?i1NJDq1aQ`ia_@{<;AJ2gME^GBi$a7#2T##K?riq-3Kh<*C%P^idh3$2@Hwn>lX0 zWXYP4os*lFUr=bBSTw1)q_k{u`IL&Ow#sSKt7cTswAa)+oU`ic8)naOHM%_@3$ki% zxR;YzpFdLub<;fJyIGJU=eH~vwa_91EaGH%MvxO069-F(gQd00oDniC-$G>iBr>cZ z3K<u@Y0PY}GstZu;B< zAD7cz=MUiLNu>D{8E&c#`_%++9`^p2n2#z@yE+X--d9`iY_tn3fmuC%0Il^p>(1!$ z1aKh@i#@w6@B!tXcu4i`_g}yQZ#7eER#eV{B044!f@h+SA^u~jeug(ysRI>}D$mGGn-Lbf1ZT3$~10y+drS5gn;Gl%S1{Pv18k8Ns^r*f)NI*C^j zsIOph&wNO;2tAMwZKXwaFn&qC(r?miLc!Kc(geE9b+|0Ko0p^!a`Vo5S5)e*T(vNQ+INlBJk5gkeCWWY!%GO(|dZfneG zB5{;DsXg&eOzF`CG(n7&?cTXfPJd#pEGOE%UQcuEJg3*!;Bm*A(vwU{My*u`aPI@1 Libh}`HL9r20P(g|)7Kl=! 
zQlyCm5Cd4iC>SiL#2Cdq6;R*3Gr2Ozgg&OlOp5Jrz)9m1N)uYmgYUFOYN`5U7^|RV|ul$X>sWNr$?SF zyRp0UY-uka_=pIt zvHg6*{Z|J0MhEdRMV{g#r1%RGijk1@PtF9oRMf!7fNJEZ4nUHH-RCR_1Z)HhHj@Au z85WF_VVTQPTC~VXJKFS%xRqFAJ4!6{hmInZ%U@Ti%|CL%q`9P%pL5Zc@sC&)8x{~A zv(n!;##bho8U*{VTon--8x}6fE*(>RhlKBN;XYjTe}Vb8Dc7s>ThyKSzcD3e_dxbd zAJZmvEvzxy1|{7h=&oH2y7@Zr@D>l=?9zf~&C_Ak3~ zcr^}#^cTV4UH$=Xemnt;z&UW-Qve0O$-#z$BD5unhpiFJfrc}NaP_b)ObOG2SFRt? zjZ%HcP*pm{m4v-Tt?Y_|i=+UpQmX$s-LDniY;Ww39xD(o2j0TyHhfY0eE z(7v1jN4n3Vehz|)HZI&9FAt3klYz*w2E&RWNVeAmm(qL?oHT+wOdE1kDqz#6Qg{sW z;I=Iv>;HBx>~d8A!S}Y1{QMzW6FVNx5K|#6nGG4bBs7iElZ7i?By{B(B8L>6Zb0AHyLb#sjjF3*l4TWjJ743`ZR1 zL;pbxjcgeJ9q}ZXQeq6hy-I>(cc#JA@CoQm-oKEt|6>%RjUX~y6Kr<7z@pJKv_D#m zr=R!;Z%a+lMZGL8YlO*zS9^&8$0n2@+lO}APY0vhk0Fd` z#cDIpqnHV76c&6ROXz5a=EQrbC4UeZPmIKJBm80QuPGQ?dp>zGy8}w=htU*p!sc8u zL?`TJ(89(Vs1-(`rmdY=VigX%5|&~wW)5LWr)}{g4Lqng+6;Z-O!$-@g-!JqN@q938ZgIrrWhW}uLfJqcW}-4 z5$EZ%S)>BUz*igV;N3}kknvgop=x^A%N13S)>4PxS~MTT4Xd%CY7%uhUxAFWC8$z; z2qI-Kpps;3P_uS~nj4dFt!?EXyI=%`>uSEVsf4RZpwJPh9(imbDGw^Xn!vHIw78QxmV@o0rcdbx-ZVLk+#~s}K!yvRA>D zXLjU-t?NMdd<|p#9fXGdg#-8&f#KKlaOHJ8@RHBwjPl#bbLGyY@7Ykff>R|5jv=1t zc-%cqO*n5K~566~j5E$oP*1sTab zgYCU^NzP-BB^jzz09v{U%s4U*&VJgDE#1(-OpzrSDrRC7e|8;e7n{n+4a|V}+3t+B z{Wi!mHxsS=_7LpqQHSVgMUJ}LWR}6+-5hRHHMYk46?)(>i#7HB01D#ha@xvfGMDU0 zlndjY$15J|pm!I;2*Hhe;5IKA7C$?Hw?3>w9lmcNFbGin3>nnW>4`nq;WN%^!Y(ZT zu`$N^{W=7^Z%6O?LkO&BgP?c*bz+a3CdoaZiu;ZFV<^R)75H=p3=TAK^zLC$=XRJW z_VrUMo;wXp9=#ytP4=RErZJ|XvKwcnyac<$TVY1>DTY(bGFIf!X13e&+1y7}1srl4 z560mHe(lt4=IWOP!ifl&&zSY|D6s^RZ)Q!aAvULSmt0F?G3;OQjK%x< zGSOt?&+xaNi1pfPquUF3SB`OZu$g-;!VR{P_3D(eU9Q!T|WA!Mrnf>LsO^hAy zPDwkacs7DHskjJ!sfb{zzLo*4od*c-f;POR$AA%yw~-bOTEyua1a3IAPq`<59y={E z1vTSU%o(n~qD_N$A!nZrK1ux}Fv4Fl6Q^8;l8`9u% z_n_MswqmLW>J?_*+zD$t&uBPknINU?%lO5`MYz}Yr)cpcf;Inx7rCeAIU3E1B*Ns? zAjs-C3UNJ)2VMgFz(akwzCH)@H0tL4FfoetwYM_z3S~p5>&Y>2JLe?o2=)WB z8&1UCCo{0VnU_$PAPH+p5^%RGo0FV8F;gRB`MAbMs}WbB3MI(|Lsx!3zMr8kldPbO zGGo5OHy*u@TA~jybSmA!!sj6)pI5;>`ZybOPt4-ho2!BeX9;@t>NOTN%!T;*GKw4b z@lns)LG06t6t;6O4}Wzf5>0w=2&pf!#CnzsiS-K3j6#J4DA0W~IWn6Z?`z3q`&_!q z(yz`^=>2qs!TxZE9r{QS>x;=C^X8_KBb%C_Zon4h9?FtERk<8@?Kq_X9!(sxHwuiR z4{p3-z8sr%`qTJI_hOclP9r?O@*G+l&l2NuU(wf=dLBQQr^~PB{O86+&_}C8&_9Tj zAl5tRX zeg3voSq)3ElBnR@GSyyP+_3x4K0VOJI3ayjRmM zL))wIXxewv+zRZnEsM5i>Ufp~^)wdkIAGw@C>B^9OV2h5>MZtZI=1uBtjPYW*;dDQ z<;+cZ{nYDBs;m0fTY#KkDt4P}l`5y@ULjtnX}eovL568sMw-KZ{S)>3?7M_6Ip&)% zzQavz_adhp%S?w`GnSMrt4-oN-q!W5487l&>3Bzfh1>e~!=!H38Q)9W1#?q$v&#b7 z4$j`d&^uTWeDkDNb&B4hAHwccts7zJA3neKQA_sRRQ;T)*kA5m*uXT%ty%Zxc~5n! z!I6ubMu*>zFb(tS6A7-ex!5qjF`1__XCup~;8L1`j(3gNsPOVm)w%0OS;j}N?w#s* z&^*n!s3lW(Y1Kxy$+5O<(~!G0X(q?pbIdmkkFrgRf6TYup*(N9>50xFr$g(bIW)R! 
zs|0;c@xmgPO}1vaH7A!0E^ppWqkB~jkDSX4xs$kpyLsLp=t(l%F?8*?(=>X<2Pt~1 z1l^&0485E}x87k=(w?(s!|*b$1ignsuPFKHT;#)+9LIB|_p1|zp5Jdh_kCa8R-CKm zbpFiGO=5YS6>aCwKD@F=(=pfSLfPZiER&GlwhQG0H}b4@s5w_vJnPaaUUX{3^{UFj zJ7-q4<~mpZ@ZzWQ(SyC$tIxgctKY^|cd0r5>fzPzbXK<4To@YovC#2|OKsKfgWcsJ zeeJc?Z(q{z+>536fBOPn_u?3Qg8BQ`f55+Q9gxBYzR9yX?ooHqbfim)UOV#kPv}j3 ze??cTyAb_;sL=7aD}_GX{i08bE^ISVJUW4F{pjB%;I`)1f)YUsoxmFFDA*S^vaAWIr>K(17s37%9i1Sja~t+&WXZ-;81* zfA#+lsDGEYAl(d<7}2YO0>XUx1jWR(us<<6Emfz}@)i`c1T#$O%i{AHQspm660P^2 zW8q_bhLbsMO(;o@h$ej)g)nwhTFR7v4cfm1_*ZVLZ9l<)uLNf!RM z9wo&m#)>>4OR>UO0YIRt#1px@yL!x#K3Ph6gaIvPB1@4-w2=DJ;wTcui!1|0HX>1= z$SP2>W=SbH(<3X9$e!-0`B+cQB*SRh1MNuUx?ab?;45mdz{QT9TgB7u0=l7Gi$qRz zW9datCHo>ndLH6u>|*5Su_V^tW_E~|r+W~+ALd5Iw{8v9Nc>qzoKQhfBK0R3(tBUn zM|*pih5OqEihOB%2|Se!C0=85_iw(T=MHouO&Q<6G` zw@{j&!{RUFp^8PxpW>lQX&LEnyCjC*5ZOD?`Vyu1G5m%0l6q6crev)2B9S#cv<;Lt5&&-0e_;OxMuIRh literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet deleted file mode 100644 index c7bee26c70d25045c9d71709e8ed0d492f391026..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3122 zcmc&%c~nzZ8vot*USd!X1YSrmDzXGrkflPED)oVEvWQqka1DzHk_1T*thKn57L{sc zw4zneS{GEPRqVJxU2)%6>dYLcb{w};tD~p(Xb0!sq_I-!@+{FVLTDDLDfmvCSH$jHZhG!^-w zb5Y+ap|yDu{MYos_m}MPXrT{oxAa0*Z&$K@gAohHyTWH#3yk0S0D&%e(3wN!4|&Mg z{%>4ttAji@92-Wf@%eox%vd3jvLbsfUmcG9TZ3?PWh^}P0eCR1jht!dizmv9@ywx(99} zr|LDtt9cV#=f}aT`4kFw^n>A^5>1`QV#y>87X0!xqN{T-af>HTkCUUe^(%6{BSGWI zPPp2|1qb%`gs^G|f*K#9Jl+e*4Qrs@6pYoJA69j3#LQ<6xPd{q94m6yKX?lZhq*%i zWh~0?d{6R=yI`x}4Sl&Cs{CbW*}fhAGh>kQM2;Pmy`Zg>V@tIT$F4cz$ORcL)C7^C zb!*^lI)SG%V&HWv5EnD6aIMXNQga%r%h%%Bw8K~yyAicRBXE5MN1m45M4yUo@T?EP z{ReZf{)#WW4bJ4z>Ys>v_6=h8BPcX@A!boBM*K35d}^xTE9=_uxS=2UCZM{L!ZQKu zd#)EIy4S*O+_!wn+DD|d|6<{0!a-8%c#V7<-wVN)Z$K}c;F_22BxYwjqR;yqS9A`$|AmJU>SFJQkjrf)m47y@HP^hw^yhs zm&;{-<2c`lZL+K*8M2iJe#DNX6r#8}khAZ{@x|gE#CrXVVq(`HmdjR+2_}!Ryd@p}#L=sqL? 
zO=r6CK6AIjG5RUKo6rD$U=VK<2SZU@A?%vCME+#43IA-F12?^q`)utPuBI`V?D5^r z*ByIC#-6;%N9^s{!K-{4x7qC&|Ngizng8Zb_=2Ex{wPccU!H}s+i^1IS<~Uavq|3N z3Ly`F;sM<{1id|ocb{(0K;#}T-skfOgbpl@JM zaL60|Lc`wdKOp?AfrBF69xO#h4T+A4jf)?ekeHM_EM<7=h<8SgN*kS?F=p(z@e?NM zGPAOCCgo1f%b!x9Hy8`27MaY&(@ILGzuRfXdo$lJn>BmR+z-k>{HS8y`~{UCFI@D= z;;KI_(JuXTS?}uQD?VGfs-|}Jn$OpMv2Oi_jdhzgZ>ewia_d*ywm0th`p-Le?cURr z()`Wdeftj_Jk)ad$kAiRPn{-cyR@MS2y^QiU z36;Vo!%+U^Q>y$T%HQewI+S;Oj`;;ShFr6?VOeqhs=IcDJH`2zt-r?Ss}fXKhC(Gb zWlhe}>qLR}H6QMU_UNNY+~P8rHerNvs`gbDR`mj{1g+NB3QlunF!|9h%f+Uqlk-1WgB;`s`xt6m?Dv-ohQIZtTX1ce{bZ1$b zm<47?8uo6Vz`*`=5uF^z_E9BljAjEhElG)Nh)ic&3ttLi`ze_r$-$Xvql&X*224pG zojRHE^(i#I>G^c9*o(3lp+Qix+OsSfubsXrBQ4aB9h)oZn7+k5O@~stZJ#=FB-;;V zgEe(a=+=0q(6dOhE-loUK04W$G1`(k^*7v_-_Up3$3u%n>4kV$DI*KBV@H`8LyAvi z@)n^IoB!eQmU`1-Q?iv_l7_HlYyqo5VMfe={LGM2T9bZEh!(ko)t`oFy%c&DY0;#S z)9ffeU5YHR-jgL*rbT5WOC_yJmL#p`mekYrF~!u6B~PYL?Ne2DHC9QeJPLJ1Q;T!V q0a=BG0lK0hV~NMKoFY@6(clpr);G9skUCKVJpZ6c08;3`h5rB{LO@#p diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..151008dc482639c3df72873480ca796f9ef31d42 GIT binary patch literal 3553 zcmc&%c~leE8o&3>OlA^-1kJ<*QK*)P>`GBVOPQd68WnJX@Ser62*@e~sE9eBfN>W$ zK-7l_O5LgyZ3Ul2#HB7M+M-xXU1(LbuUhM~ty+6$Qmj1nk9W>Huiqhe`+nbk&wQ6) zE_5hMvVQCsEz7W=Hvsmzt*PGE9rjKoHH$vr_uF{cbxzIf$k#s3By=9 zt8~G587pE%w!?uV(~VU!-6#Nn ziYBNy1=aCt=1r%(ID%JCG4qcIkPtx|`62>G%b3*Y%;eY^pL{%#((g->Rqos^pO6?1 zw`W~N1JncQ%?Xb1X^V>BJafq%sWVXa<3M=vNr!di;2@%?b*A_u_`T)Ye#hvRr=Q9; zJii959n0WQd9g?mc9%)+V8pp?4a|-yZ^1*SJ&@1-6|!`~qz(yxac;rGjy0&y?FaO>v z5L%OrB#+L5*k4-U&o>fDR6S4C;d_=`6+DQtD6u7?p2i?p6w1Wib^>>A9g(Bu04#ii0-h0jI*9BCX-?Oxi=EIJnE(qO-nI zLM@bnUIa;ezO#*f6b7i78Xs`4E|qRw^(AN+@j3cuaVxRc*WSuuc?mfzeo)<+p zOh6BlC}8o?Bg%9!o-k=6=u`PMK>FTsGGox>fxrXGE%)>THy}#+fzBUz9PCa^M|)R}g7xcVmZMwuQ`Xbmz_AP2C^aM!P8^ph zQGfL{=p2wWIBiNDGA!Cn&suLsjhz*U^jUgxPXQ}EH|Sk@Ozb%FOK~-w_Def#JD7)T zOB&_FzgQ0DT-a+r-oXIc)CHfGrjf~g27q|~?J#lj zJ|eCe5G6O=z}cKlD17K;>4CoK%zF_wP?sbI7py*n^ZP6#{PsX3xu>T3ovcOkRd;%u zT)z{y*2bdfS}klC@(eC*TFfk&T7$BVCW^kHY*3VB5OKE90NOU~fUOBp!0Viryz-qB z`et}Dyr^1$PAyPbEU_6)Nh(aV{puOLnmT7e3wb?UV3`0eZn;6M7ui`B%5C5}!x|#D z>T7r^qeSGqKNR@QyeZlut+S~5whXv@IKrZFv@P(IOow+K{D_i&wE)?^mWtf9ENp*# z7d?N!NIbDaN<27`2K!$thjvpmsC|Z-%#lwNZIzFOaiK=)i8p1f*U;jb%~u$=hEjRQ z^AjTRvrFQnP6hhLu$rnEwTOB$|0wwERv_G5UTU>-{|sVs>rOccJ1X(|NiM2>7A&n{ zWnyO6bL;(~HH_A|33Q$40;ikyk-auQ;J<>k44X&0;NPF0ZIUmolH@NwrZUoAjwj-` z^AP${;Qms)$)ic}9Do?5UqxO%!Sr?vZ>-2)NXrHWmHKU2P-vkX7Js>M%fdxg_8OCt zt1MXD$1$`f{77Zt(!L&1Cri1lMa%n*Ouv2k$ks(G?fe!nDovHCcu>%4M_p6Z;x&VV zD}%~3)l1enhu6lpH&rh!ai7^_Qu%LNR^}Ohp+ zYbbIpKahO2e(n>I+s04lbe<|3wa9H#edfI@2j|i5n-9%}M)?@`;>w(5Zbm@HxLX@# zT>}p1R*8+$!_%HbP(~LQW8LgW42h(GXA&L|QVVf8g&BrrYM}HH{Zz1yVBI7{2kodWu0k8t=)0 zZ)&_&FWPWYY)YKL{G2ei-UDx`Z`kA3`>Ogk7=2x=s%2h4DaweBk4@IIBz|=`pqCym z4z}7|ef)4s23|%IJDQg!l|otg2M)p(y36BWgZ10<`US)P z87xpRDqEHIfBdpi|25B?|I6i7|5Z0hvD!|-s2N39fBc12*(#-6Xz1iHAM?dhR*>#I zd2t$!<3jL{rwJUF&1vGe0FH~}{Nsc(4F*i)SN*buoZa0= z;>FlWLHyo7n;*yW16Iv(T7J;z_^rU_JotTL)UZiIqr#?V#sqjL>cT_gd46&*_HXWN zJP7*y1tr)COlEz;lIQi1&xi=~N{I=K=8qN!Ps(oi>f%kLL$-=z{BJe-iPT z#-I=LO4Wr=N{t8?LdW((%<+w%+S4DLEKD!?!>3Y{785w#z%#gUS{^Tevb*hvj1%&W zlZ~l6^&B^bUj`=eIY^5L@SYtt>;s&WcX$v^ayFlT?4r3S_)H^6V$X4On6KkB+2%S4 z&hkfF>8aWF hS+VIE38^XeLx;N!b@NbZZ2@@k2M&<{1ml11eg_+(_4xn* literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py
--- a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py
+++ b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py
@@ -35,7 +35,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
         config = {
             "cluster_num_bands": 14,
             "cluster_num_segments": 2,
-            "cluster_jaccard_similarity_threshold": 0.0,
+            "cluster_jaccard_similarity_threshold": 0.7,
+            sort_output_cli_param: True,
         }
         launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration())
         fixtures = [
diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py
index fca5485b4..8c4debed9 100644
--- a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py
+++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py
@@ -35,8 +35,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
             os.path.join(
                 os.path.dirname(__file__),
                 "..",
-                "output",
-                "docs_to_remove_consolidated",
+                "test-data",
+                "expected/get_list_transform/docs_to_remove_consolidated",
                 "docs_to_remove_consolidated.parquet",
             )
         )
@@ -45,5 +45,5 @@
             duplicate_list_location_cli_param: duplicate_location,
         }
         launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration())
"/input/data_1", basedir + "/expected/data_cleaning/cleaned")] + fixtures = [(launcher, config, basedir + "/input", basedir + "/expected/data_cleaning/cleaned")] return fixtures diff --git a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py index 07710b74d..9ad8a32d7 100644 --- a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -28,56 +28,13 @@ class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest): The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. """ - # # create parameters - # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) - # output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) - # local_conf = {"input_folder": input_folder, "output_folder": output_folder} - # code_location = {"github": "github", "commit_hash": "12345", "path": "path"} - # params = { - # # Data access. 
Only required parameters are specified - # "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # # execution info - # "runtime_pipeline_id": "pipeline_id", - # "runtime_job_id": "job_id", - # "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # "minhash_num_permutations": 112, - # "minhash_num_bands": 14, - # "minhash_num_segments": 2, - # } - print("====") - def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) config = { "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, - # # When running in ray, our Runtime's get_transform_config() method will load the domains using - # # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration. - # # columns used - # "fdedup_doc_column": "contents", - # "fdedup_id_column": "int_id_column", - # "fdedup_cluster_column": "cluster", - # # infrastructure - # "fdedup_bucket_cpu": 0.5, - # "fdedup_doc_cpu": 0.5, - # "fdedup_mhash_cpu": 0.5, - # "fdedup_num_doc_actors": 1, - # "fdedup_num_bucket_actors": 1, - # "fdedup_num_minhash_actors": 1, - # "fdedup_num_preprocessors": 1, - # # fuzzy parameters - # "fdedup_num_permutations": 64, - # "fdedup_threshold": 0.8, - # "fdedup_shingles_size": 5, - # "fdedup_delimiters": " ", - # # Random delay between reads - # "fdedup_random_delay_limit": 5, - # # snapshotting - # "fdedup_snapshot_delay": 1, - # "fdedup_use_doc_snapshot": False, - # "fdedup_use_bucket_snapshot": False, } launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration()) - fixtures = [(launcher, config, basedir + "/input/data_1/", basedir + "/expected/signature_calc/")] + fixtures = [(launcher, config, basedir + "/input/", basedir + "/expected/signature_calc/")] return fixtures From d07a23a47d3faf0e5bce744cd375b1c5ec1d5966 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:24:41 -0400 Subject: [PATCH 040/105] Update versions in pyproject.toml Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/pyproject.toml | 4 ++-- transforms/universal/fdedup/spark/pyproject.toml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index f2b9d8268..fa815441c 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.3.0.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev0", + "data-prep-toolkit==0.2.2.dev1", "pyarrow==16.1.0", "pyyaml>=6.0.2", "boto3>=1.34.69", diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index dcf1f48e2..548f350c0 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.3.0.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - 
"dpk_fdedup_transform_python==0.3.0.dev0", - "data-prep-toolkit-spark==0.2.2.dev0", + "dpk_fdedup_transform_python==0.2.2.dev1", + "data-prep-toolkit-spark==0.2.2.dev1", ] [project.optional-dependencies] From ec2168c2d8f9b1bf9575689b05b08650bf91510d Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:27:39 -0400 Subject: [PATCH 041/105] Updated ray test data Signed-off-by: Constantin M Adam --- .../docs_to_remove/band_0_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_0_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_10_segment_0.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_10_segment_1.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_1.parquet | Bin 0 -> 1532 bytes .../docs_to_remove/band_13_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_13_segment_1.parquet | Bin 0 -> 1526 bytes .../docs_to_remove/band_1_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_1_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_3_segment_0.parquet | Bin 0 -> 1510 bytes .../docs_to_remove/band_3_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_7_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_7_segment_1.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_8_segment_0.parquet | Bin 0 -> 1530 bytes .../docs_to_remove/band_8_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/metadata.json | 58 ++++++++++++ .../data_cleaning/annotated/df1.parquet | Bin 0 -> 6923 bytes .../data_cleaning/annotated/metadata.json | 56 ++++++++++++ .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes .../data_cleaning/cleaned/metadata.json | 59 ++++++++++++ .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../expected/get_list_transform/metadata.json | 48 ++++++++++ .../ray/test-data/expected/metadata.json | 84 +++++------------- .../ray/test-data/expected/sample1.parquet | Bin 36941 -> 0 bytes .../bands/band=0/segment=0/df1.parquet | Bin 0 -> 3984 bytes .../bands/band=0/segment=1/df1.parquet | Bin 0 -> 4763 bytes .../bands/band=1/segment=0/df1.parquet | Bin 0 -> 3695 bytes .../bands/band=1/segment=1/df1.parquet | Bin 0 -> 3684 bytes .../bands/band=10/segment=0/df1.parquet | Bin 0 -> 3305 bytes .../bands/band=10/segment=1/df1.parquet | Bin 0 -> 4466 bytes .../bands/band=11/segment=0/df1.parquet | Bin 0 -> 4906 bytes .../bands/band=11/segment=1/df1.parquet | Bin 0 -> 3317 bytes .../bands/band=12/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=12/segment=1/df1.parquet | Bin 0 
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json
 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/sample1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json
 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0
 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0
 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0

diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ
zv3kCEctK4L8t%(ud$ei|2CEI%btcKN?G6u}e$u$JW~`;PtptGgzz*JJCA{_j0OaiY Ae*gdg literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3d1f158e9e79bac193f88f94d2b548b79827778b GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJS(ac$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1h@mC2oUSy{w9@^A|VnfO^GT~Plb&+S=t|TTP*^}6v&b!gYiR3iMYNCiKMQrw$A*) zu-5b(v8KxE7fJml8ApclKST6QA^<`LQb{rZ3kW4cKD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t uf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEVGN)|KlR_NHuSUr literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJRuAc$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A2STKw42<95IbXmGq9LDE<4uke-BpVv@1u5v%nlfGtK{}wpq0xsL^P|%? 
diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json
new file mode 100644
index 000000000..047921334
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json
@@ -0,0 +1,56 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdclean",
+        "job type": "spark",
+        "job id": "job_id",
+        "start_time": "2024-10-14 10:43:38",
+        "end_time": "2024-10-14 10:43:55",
+        "status": "success"
+    },
+    "code": null,
+    "job_input_params": {
+        "document_id_column": "int_id_column",
+        "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "operation_mode": "annotate",
+        "RDD parallelization": -1,
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"]
+    },
+    "execution_stats": {
+        "num partitions": 20,
+        "execution time, min": 0.284,
+        "cpus": 20,
+        "gpus": 0,
+        "memory": 0.36,
+        "object_store": 0
+    },
+    "job_output_stats": {
+        "source_size": 4111,
+        "output_bytes": 8856,
+        "processing_time": 0.46729254722595215,
+        "input_bytes": 8753,
+        "result_size": 6923,
+        "input_files": 1,
+        "source_files": 1,
+        "input_docs": 12,
+        "output_docs": 12,
+        "filtered_docs": 0,
+        "output_files": 1,
+        "result_files": 1,
+        "source_doc_count": 12,
+        "filtered_bytes": -103,
+        "result_doc_count": 12
+    },
+    "source": {
+        "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input",
+        "type": "path"
+    },
+    "target": {
+        "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated",
+        "type": "path"
+    }
+}
diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet
new file mode 100644
GIT binary patch
literal 14933
(binary data omitted)
diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet
new file mode 100644
GIT binary patch
literal 3068
(binary data omitted)
diff --git a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json
new file mode 100644
index 000000000..d4cd3e362
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json
@@ -0,0 +1,48 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdlist",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-18 10:49:10",
+        "end_time": "2024-10-18 10:49:10",
+        "status": "success"
+    },
+    "code": null,
+    "job_input_params": {
+        "docs_to_remove": "docs_to_remove",
+        "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"],
+        "num_processors": 0
+    },
+    "execution_stats": {
+        "cpus": 101.1,
+        "gpus": 0,
+        "memory": 24.02,
+        "object_store": 0,
+        "execution time, min": 0.0
+    },
+    "job_output_stats": {
+        "result_files": 1,
+        "result_size": 663,
+        "processing_time": 0.007,
+        "input_files": 28,
+        "input_bytes": 38040,
+        "input_rows": 44,
+        "consolidated_files": 1,
+        "consolidated_bytes": 64,
+        "consolidated_rows": 8
+    },
+    "source": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis",
+        "type": "path"
+    },
+    "target": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2",
+        "type": "path"
+    }
+}
diff --git a/transforms/universal/fdedup/ray/test-data/expected/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/metadata.json
index 4a1b54395..a0b26f931 100644
--- a/transforms/universal/fdedup/ray/test-data/expected/metadata.json
+++ b/transforms/universal/fdedup/ray/test-data/expected/metadata.json
@@ -2,86 +2,48 @@
     "pipeline": "pipeline_id",
     "job details": {
         "job category": "preprocessing",
-        "job name": "fdedup",
-        "job type": "ray",
+        "job name": "fdlist",
+        "job type": "pure python",
         "job id": "job_id",
-        "start_time": "2024-06-24 19:39:44",
-        "end_time": "2024-06-24 19:39:57",
+        "start_time": "2024-10-18 11:36:37",
+        "end_time": "2024-10-18 11:36:37",
         "status": "success"
     },
-    "code": {
-        "github": "github",
-        "commit_hash": "12345",
-        "path": "path"
-    },
+    "code": null,
     "job_input_params": {
-        "doc_column": "contents",
-        "id_column": "int_id_column",
-        "cluster_column": "cluster",
-        "bucket_cpu": 0.5,
-        "mhash_cpu": 0.5,
-        "doc_cpu": 0.5,
-        "num_doc_actors": 1,
-        "num_minhash_actors": 1,
-        "num_bucket_actors": 1,
-        "num_preprocessors": 2,
-        "num_permutations": 64,
-        "threshold": 0.8,
-        "shingles_size": 5,
-        "delimiters": " ",
-        "snapshot_delay": 1,
-        "use_bucket_snapshot": false,
-        "use_doc_snapshot": false,
-        "random_delay_limit": 5,
-        "worker_options": {
-            "num_cpus": 0.8,
-            "max_restarts": -1
-        },
+        "docs_to_remove": "docs_to_remove",
+        "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "sort_output": false,
         "checkpointing": false,
         "max_files": -1,
         "random_samples": -1,
         "files_to_use": [".parquet"],
-        "number of workers": 1,
-        "worker options": {
-            "num_cpus": 0.8,
-            "max_restarts": -1
-        },
-        "actor creation delay": 0
+        "num_processors": 0
     },
     "execution_stats": {
-        "cpus": 16,
+        "cpus": 4.5,
         "gpus": 0,
-        "memory": 14.396823502145708,
-        "object_store": 2.0,
-        "execution time, min": 0.22008283535639445
+        "memory": 15.91,
+        "object_store": 0,
+        "execution time, min": 0.0
     },
     "job_output_stats": {
-        "number of buckets": 15,
-        "number of docs": 3,
-        "number of removed docs": 2,
-        "number of min hashes": 5,
-        "overall hash memory GB": 7.152557373046875e-6,
-        "de duplication %": 40.0,
-        "source_files": 2,
-        "source_size": 73126,
-        "generated buckets": 15,
-        "generated minhashes": 5,
-        "source_doc_count": 10,
-        "generated doc_ids": 3,
-        "bucket processing time": 0.04204988479614258,
         "result_files": 1,
-        "result_size": 36941,
-        "processing_time": 2.286285161972046,
-        "source_documents": 5,
-        "result_documents": 3,
-        "result_doc_count": 3
+        "result_size": 663,
+        "processing_time": 0.024,
+        "input_files": 28,
+        "input_bytes": 38040,
+        "input_rows": 44,
+        "consolidated_files": 1,
+        "consolidated_bytes": 64,
+        "consolidated_rows": 8
     },
     "source": {
-        "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/test-data/input",
+        "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis",
         "type": "path"
     },
     "target": {
-        "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/output",
+        "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected",
         "type": "path"
     }
 }
diff --git a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet b/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001
literal 36941
(binary data omitted)
z=&DHXj%1!MXN6qpL^>(X^?$VY-r7N)?j*n8(f=MnUsW|8!wnDNK0}+R?^k=`KxEN8 zGCZ8fl!d`UIwSOT1_u|ptWQOse%$AHvvQFs6aEG$FYGqIzqBvcn3zl zw5KKjT3GVRW2SsZn*uBEEg{wIP-I$azdP8f+NE6920JFj)`!&P{^DhT!IGB$n6WbnFveNBo2>YMV!Vx2UB3!ju2q3g+<*1J??SFKSmE@?JHf& zb3%@O5YvKa>Nt#V|J$QOH}_ze|2R8C3=)37J51nRD`4L&F(T~syIEp_nGS~{gf~pY z*-+pJ0n*w2@|E!D9rFi5tcP&T5Rq|~2+xEg>_j)?p(vK{`llG5*F%g3LK93x--pqC zeMFE6F%xWr4e9$ab6>xP@!`fjA$Bs#h7FftmOi2fIz`!07pAeY-52fm2O>nw8yWZd zaeWsI7CJ!a{^d%_#u8CKz=Lz3SWXsdkFo9u>p4P%1McJD zSdefte*Au)3z!k`ct^cJ^k^tBLHGmyfqDE>?OGx>&iaY4FA&468gE!nU_;%fHK5#A zVciF>z6xs(X(9nV+kI8{sllHZ;`_~B!X1nPNh{XZ0gKbb5(mqX{v(*hzaPUs-o0P@ z%54wp4`NpTK}_%Ki7{a!N5?P|kRJAXVt!&G8jcL?&-mDI7?{ua0q1apfobvqFMNQw zKNKZ2v>mtOJ$ifBScqkq@%EAa@OU)h3B>$;{~1plxW9{{DR$ytJurPCmSEt&KR~$M zt}wxLYy5!&j}mMoOnBH)uYcg1-1{^h&*d5R5e@6nhcQ|oHi2kBYa-~2h6!4?9luNq z{L($Rhn;q_tOwuIHSUfa>DKO@?uiCHOr&p`BliR+BL3(E^S?AvcHmJb(Z%?Dfs-Q# zJw!MX2r*;q5kf!26h^@2TGFrJx1~eC&m|wGfAkm>Sx;K9xb9;;OlXV^9a`4L#+mkK z2mZ0k#kh~(Lhqr72^TSX68oMPXcft>%O|bq5%NpA6Ir-M!$M)v+)@iRl7>yAAiGUvvpuL2P z9rOA@R`vaa#~Tj%m>K=W8jF{?yZaLa;|qhnMExH9#h5Lp#k(kzm8P^y3z*jLb{#4n z^!mLKFQ`?Vi41h@0YZX4c3@n$53@b`nSPK=0vPMM??<}5n7warmp|ZP5A=1v4|9AT z?n?d}DtyomyaXwv@<_ZflQWkaDrhzPjd(GYW-4r6;VWwa^U8-HT=s0DlWr}&?~{MS8z zf;%*rJ^ALdlp&6>_`a?%DA*G}?Fq4g&=_~y#DeZKfBKR($hx0-3=8SM{nY2M2!4ac z7u6re%>Jjb(Nh@aF`Uv^*pHmTwC;c(U$8-u?eK1`CQS7GMGsgdn1e6>^$S?!SQT zfdE)l|IE%eb;b~g5gQ8YufV7Sk2}Ac{t}40?Vm7>nbuu$;3W;?zve~PC9rAHUt)y* zZZFKf9|OuJ9^8FKd&D!&gvZ@5m^&2Ee-g9mywQ}-d*`vPBZMpZ)Zu|^fn{Mf(*CPG zp@5HYc?Yf;a7Dj7ZZ~;hP2o$c1%Uv)WL!-6q(I-V*?m}xvDB*?J9QI{>2Eh!j5JG( zGVXBqi&)@^+jsv$*A6LY8k`@lEp(QTN6c>xQ&Y|w;e(p?}6Ucb<#@w-prH(UgD_oVQX zkFd9nq;Rdn;&p{_d}&}U>J|P3BMyBMb6Dr`u;D-L3By9-9(UahAgo3I5Ozfr*EsA? zY{w7(jpHI-`}wyp$5RKoFJ`yj_jq^zfjQ6^cOW_7E z*ckq~BVaK-URZ&^-p#OI*MvOy|L7Wx`h0p5W}}z3{ms?heS5$^$d$AD!_mh^{|xK< z((W5|&$bpWONPJFuAkT8!G+q0=5dd>DV@?>f2P z051DFJKNunKd7b5rbBmXy#C(8uATSmm>=%R38i#$sCT~>j8S;!(H&&8bLVe&?TdDT z38kKTh_z_Fks0nUj3Dx{K(NsQ@$uEPSK0>g$VtG+$?=`LsM}nh7QO9c1McXW<9U|DB59-?w**=c# z=VrNdZiom8nVs8q8;Nu-m0!&2zM~BbLibcxpdc)^@7Z4@pp?kw7X|t-)&uC2OQTm| z*w9E8KZuA!zE&g@RPo_|+9nZCS?U}S(YZoSn6yrzr9oP!hZ$xR(K|{JY59FD8K*%kuR!(QjML+!>zDWnrz@GA&rW- zDwfswTGvu!aP#0pgeYmq_ZB5h!Fl7)@pJUuCAa>3dhcjQP(=Pda_$u#Jjf>OFhnO_ z>Fi6hi5&X6O$K=O()K{wYz{T<#cqw%_F`H-57Uw>6~*n?H=@0#%;f#Y$TtYI9lIcj zpx75AaT6s`bz9wK=#j&79eYshNl*)@wFx;R{D*2~fa(HaPP=T#hvE;gHrLnK2t${J>Vy;go zY82@pz;vP(ExEg`_ue-0D;)X2hY(S3Z+KOb3B!JweAZRkR`N;REbY;s#|ub{cjo zVy+!fxZNnmE-{S!{%pUYvl6=2B2$WX86pyq-Z**pEcyFXzpVpP1#?fE;Q;_7MT|d& zzjnuMFti*CgGlkvY`Qd8PE^ByTzW8@PYRjhpco@YKL$DensU0BMl0zsq90;~%ehRR zOM)#Qt^{`F-@qcNZm6&d(X<62cR9njW?0(icTM>GUNdamW}=fbZu6+e8W^WlCg?W& zJ%pd}J%vq!fZkN738+D4JcQYk!igneu?X`eO3B}D@BP|#@{R@a%p4Nc_Y_6%G5)IR zP|7ebYmoWOFw8-bUlhPm8BAu01Xr9R26^I=!fI&_eENzyDAnK+6&W8s$03a8+Mz*G zUREd8vF`pcyY>>iqLN=7P(3Bcq_paN)xPEm5&%SY8jc1+dTvAv$Q8m1Gwe{Z9M&5KS zdH;tIyVf7~Y+zMcF=DtvX2sqDTFD^*YaTQ?25ARjvW}Qc@b6Obr;2#>wQOu`UDZJT z<0m2CFK0|E^rJ|=C@%kj_ukY-p1vL@zxrPg16q%|9za#X8*}CEqkvcv64;3ZpR&W$ z^K`;YLnhjk608&rx$cCGvYD-%)nYcAD9Byg1kO&|tQ1O# zq$oRYNkIZVMW>ue7$TK|Bs?upDVx&-xo_TXNu_wR!(uYiyw$-|f`z9X4jYB=u$u&d zvzYB%GR0Y)iKJknIl#lhrR-+TVWzE?gvny#k~SyG-8-Bnn21!0GC|5-NH{nr%vIfK z3}D^SjnZa3x>3r2usXl(%eN?|zO61t{;QjC@}ZC7XwKgYh^tD_g`B6jz2D57=ZkkL zBw|~6F(X<5DdlD~?@rvjH-6^TYZ_w}E^$yXp+ z*~;ra2S9gxQUPd3C4_;3*;QcQy9*~DxvMp>rakA{n#z*Uwzg)3|Dfv1EzYfwQuVgf|OCb zyWa>)7&S;aLjBMN@4t+;p?dM3*h0lV8l|-i^{Q`cL^TR1wNV=F+4G-KP}z~>VOJxu z_g8JbPqvW{{W(sa`m{*qI{~K3k{5OvzR)G}lUK9*Sh|uQnDAkF<`XDC@PYG?AJ9Ueo0bv; z%-@OmfuL$Yd)@p1N>`L2NIfS&*3noAT62GOS!2vvMbj_DXfMQQ&ug~eXOAe>fgMQ! 
zHsR?43TugL?TdZA5BHMS{}oPtbj_$p(_cR5e5gOlnl(yCCMej$&s8)97$bD*PEbh#k;E3ZiI#4e>p-`M|A(fJj zfL8se*V}r(*+%~5 zqe|L2Y@tqn8!(giYFaqbiB6bj!o=Rgm!lqKyFoi#RPBDeu5K-kpP=sVdRPU?b z#mSexQx#R%v8jx_9#GZ!yvT5|EUMimlx-|SL^TCxWs)iqs;gESYhN3Px&HeKVmcOu zS$Q452k(6VCr|$qPQLg(k&4<7%F$G@>@{2@W4Yttc7OJk*n#^^FM>_$j@5q zk($9m_jy!^M|K8PiFO8kX<)!-f1zXm%KR4vX=Rs8*4JLe$(vsl8PGhGDie%`etBNH zj-cD7@`W@N?6h*7vHGn6zgPaZ0>2K$QH;Cs-g|NK_rJu+Z~Rh3uR4NaEL8;8U6J2r zl1*A1>l)WDj&)THG~N0e1x;PZXFS2fsf-M)is(|UlxF+!ufe7IwMbdTuRceZ=R&)4 zkhZ?<1fxo0nRb<`2HJ1@%X`N@9+A^sf>myMf8YyqbQWr{1|Z^4ytWD(IvBmGh#Om1P+zE$}kfX$^aKpkN> z|6W1Zc0N;vBg%_XFh#r${`wm@`S2Tvq~@_&Ku{y8@6AJ2%-B)n=Y%Y02*M@t18wzJ z(4`{96>iMX1#OfUeGgzhNIFR6)%%~yLg_A_vZQ*Vw60J-nx`zRoo-W>Di4jx<(+_5 z^$?eYjr!3YxdWo3@yLd=jrN8 zvu8wZ_{soz{*UOB=kFN(J3kpuUN}0v$p`sWt{;$p2&az8ck!4n`L3L?CI66*canbu zClSdF94;g`cwUg)gcE<{R-dOMFC5%}$l!%*$8Y1ML0jj0y=!M8$am+s0lAe20LYEx z^vMmR>B)C3M^0{jLYv&w95ne()5_#K&lHn?BuPqg6SIrtJ5Lyro1Eh#H#;XszFP?~ z@?FeEkz1G_BDW-E(|hI&-rGTHdw+W$z6XP?p|Jz*&m_5`@w*2;yidAIDdoS4`$mWl zes~|eS~3aW_AR1UO71&C>_ay#p&$ph?%@gE?y^{dN0azHUx2Sx@vDHoQrBGt57Kz> z&QrLkDO857A0?F*GD-0cXKiE&trO_N4HWJx=hEohNi*ZNLn&hBiwxUe#g+T8L@a z9RchrVCWXP45d#Hd_)HJ|M!K*k_ zZG)m>J?kJJx~O|`L^B30n>C=bK}Ax4K`$p*7Z16?=3-F0>fa2!zV3X$MG5;>aOJv9 zhO1(qC7ZpuaNwvxGhvm&&4pCCuGhRRI` zE#BHyqqbVn0CJ-`D&C-N1}NREMdh0euXa;Q1!#Tc=0dK!>e39msd|&4S8tN2fUc|C zT)6d@JDOoP*KaO-`D9NsWMxeQOogJpBMq+H=4ghgsMutXbvH1Y0XJ1QfZw1NO9!Ev z!Pd$*7oKIjz?WQBlzVa)V5j4z#NP6we5v@=b=PIBc#0 zwW0Prz?MCILB;xmY)@EtFv<1y=ef({iDSza7(YAQ75 zi_E4|4Y>2G(0;T8{FCuw(o-?3h=1Gzw7M-T62BVpZ_zI20v-b&DJ8C%a?&$ns_>mk zOuMW~T48=n{`tu?;%Oo_WtxeF=8|J~IDKxUAGP_d!It2v@NOYN`ze@tSaWr#7&2a6`9X+v8iHw)LEPNT&R#7 zTanZMKSUab%N6 zodBPj<6@Rp_9Khdg=sppl!%QM5^2%SZWaGP_E$0$y+HQASr=eENr23W>kL}Y)bt@6 zFDB^Gnr&^?hiqcZIajmIZ%aoP66RzX^#k1$^?%wN&+s{aAwjp~ z#37$(Yw>Hj{%-Y}o18vYY$a#ax-F9{SktD#Ps=T|nlsRA3fVeq%lSG#wguo?O~`ku z_!%v1Uv*8E{NYwIb-r*kqqY8+#G59+UfdT@;4iESlJCA%YsSJ1jr``8dRSM#b@p~@ zZN_}8z>kBCSZ95RPy6TjY-SmJhNUfDYcug|9DG9XEwfX|Kdtz)&HJ5r3|fLe%T32K zXs@-!K2=5-cNO@v;W$0F%uOGAH}EUWWX1iA=%Xw44l2G21R&7fLi7)%y;4g%b6(ah zcze@ya*5Bj_USo#%9^B|tMPSw`Z)33r#(Y_kV(N?V&5NAoY+|$BdJqNL&R`@jR8z9cm5YuVUjr#ahi?~E`%()Ev z_jPbm)T{YQ=WD&M_#_b1TI0sp)mDC7Ee_yM@$}r>#MbT^>G<@qH7@2Xx55wBw^g3r z`t?;953SBXY_x?vgLSP|ucnU;vZN9VeJ6Gn`AVZ}@#)qvFchl-8z#lC)a32FtW9q5u70a*h3FSSY#rt?S%kQ2tGQHdr#7b>if^~DAEHk-1#ulYJ~1}ca{t$|?fIJL zDTsBtkAaUn1#;(HMKMr_X&1TJG2kieCB*!M%L?CD6J}oIuR>oT-s)YM2wRqY^Gq7W z!xizKn%<)^=DlTTx8}{6P~HXpUUVfM4`~!g#y|#Tb{u6+T=48+c!u#Wk1>pU8MU2d zW*ErSz%w(8o*hh-VSKZYyKy;Okc$bUXBT?*l^h1Qne}ffV!u5h8z3TUSL-LQ6^7iehvN zpi|7H!5j>Aw*J7#YX7P}ih~~yERbvB?PxI$z;BRe%5IIo)m1_TW@Es3<@12=P_)agaRq*vJ z%lI-Y2qsb#;3E1VeZ=_{eI`U7{2lY*pvX@*<65&z@==k8rujeu=#u0Ei{i7(Oc~K<@tmbS5+zq54nNJPy)DwvzLEcVv3(^*F78{Qz>DxRq5cZ~i$o~w ziFjUNtS$x*suDq>>X7--LSBOagHg*LFk`5{qgdgOEUySjr9GKa>Aw)4a#q)RRG$mN z{Ltcf@Wgu5*Bhitu3A0`dB-2@FerE5ns{X)Z`h-7?24zdDt|M z`oq(@_E^e81Fx9lsJ(L;ew-DaBCn3XF%*Jl1szwybBk39`IqcNHQzq!M6wO_laf44 zRQtQdPEGt3<-6{z1E`&)f^WsYL4!>qR|HH94}KqfVDYLVG3iB=O8%ij{>65qb!RQg z(#nA3XBmwLDkGX|aYS(t^o?OI$BaaQTfC+$lm_`iVNg6|L{ttD5tI#%5SifJZ17&f QcjX`U!AT4YG9l{!0VnkT$p8QV diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50 GIT binary patch literal 3984 zcmc&%XH-r&>{ns3qCWSl>YEfiY;u0EYF)g*Cg|rUM zp*e<5V8I)dNqK|X03aBJnN^4|Q9#Jsks(dW9D$=Flk%gZl4It?N2SNpSR#}(B1tQf 
z$IE$tK9>Ya!~>Ct=d8nMv7|e&v8EDQ z|F^8Ayg`YSNBpWIVJi9KEhE&j&{Tp6BJV7g_mJ;Ou@J?i%*#xQP0pAT6O|DqN?;BN zF>~gnBxWWhYl_=|B`=Zu8X-gszsT=vGQob%EB8+bGY?0g!fVI+c`f1aNk1{`(j&s<%y90=+$>yWbCau% zbfW6H-%*LS;lz1Jhe;*#;N!>M)X?RN!RoV<9Nn7;J^z^*wy6DJ`@s}Kf3XmP`_>|} zq851M*Mu^L@=?qgXX>_%E!rI61pGohpe!P>xl1)Q?_31sdr}Jd)gdU~ih|MC^r?dv z-lD_9E)Z6Sw-8HidBNvtGjN@s7Yf%a5%kelsDB>!r=qG7bt(=Y75gm=L!V@IVq^IT z?i|@{LEjY_p>SAA9!iPC*)P1%)z{s+73SW=g~{GDzqmFkdcsP2{phL~vrNvJhJDO9YeH4G$ZHD(qT&Yy8Px02~W}VP2 zN-9xW0cKWhOYy@6lb~@w`!8k5Ih8N^q&#i5s$Ig-(*zXGG0| zG3e##BvST8Cj?H(h3Surh>Ks>qPD0P5El=q-cp1b+QabO^>c@~4c&yxAJ}8TPnRL~ z)fM#e-a-=ZUn1*vxlC>e97$;v84}TbF&M25;>Gn?!jnf00_#o;X99O|^P-}SD<_+S z!~Jh5J%_ERoNJE_3^o(o)lb2D*K)ATJ$B+qPVXy73qAsjY2<<>^Kp9S3)w{j}ReYljUdow!7~`kIkPz95O$ z6Di$}@+thasjE?xlFPT8`W;&KZ=ZPw3_1Ba-LRA5LvK z*^By$Q^_PfVQL1-r*CGM0g?}+O-d&WqKP3 z*JB6^zL8S~=d1D5w!>lwZ4x;CDCW$oR-wAl{b+sTTHd=k>UxF9w2>rB@R z=~X-{x?b?x-$;^ey_94JiJ>x%sWGz=Z~leQddj#kvhX^wc!f$&7}rQ(t4C?fM<-_ zGRCJ!ZTph(cX1En5A^jab=5kaTNqarjLs^1HITv`lscdfRPFEqkIdkA{z^QuF3;;J1@hu?=_2{{i@x{Lf`9_AxCGO%Ax&kSVf#j>}o9Z zyE%6Dsipl`qPk@p-Lk>RZM&+&e*WcMQ&vj+Z;xAW{YYffcK1ryPddEfG-`7t_1Mj-#P$FaZ;=r$zt3^gGtEAKJ zxv!(GXEzs5``UAp(G)c)3%c)}Z{}a>UJ~@ccl#LY$kVH2!4FkC9Fw}6OM)K-lulXp zhSZd)w%erU$g*)2O}JPA1#c}h(ggnScL4R5mX57}7$ zvzZ}{2N03>`!Ir^l#tA#qu6vZF(EyJ8=Dx*LJS5^)C(l-BPC@f@c!%wHu^1=>w-|gBjo2{+=o#2peDofwlk#q91{FZc8`NR((I0Oa2frdctU-DQ zM*kH&(tZb)5@VC&G6v9JFOob&@&XHay*UGr{^t7T2KZ^y4P-ZAo{f~k0-vmbgFT*SUyCzApZnQ)w=Sj|w5(CH#r9wqe(A3aL1A`?k zBW+l6R45e+#Wd#2lAl76tx(1(JQa#Kg?G)5#Y`8ZrSYW!7hwzg+j$H%5av|$Q8CMzcAW9z%Dv8Bs0cy;==H-pm&qC%-nGqH0m=YcqkP;E5DV>QoZJ<8i>F?*mG>f6%<-@j8nHu92lEDfTJ}OpU zqm(@uf0~b`-%PU^8f?8n;m#ht64@T4MtDwK5N*GR>B%@&iD_~++kYm-fk$EPlp0GW zbEY~*9;9gUDc#>x(6Gu}gYtQ&+OxdRy&;+SzYE;^X unXwtx^HNi-qten+vQ4vM)6x@Cl1=TL#@dawm8lE?-hJRC2@uHqFaB>f=8;VR literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4 GIT binary patch literal 4763 zcmc&&d036<`@f&(ec!XEsFQP&igZN#YK!TV_K<89(P>Z1NsDX^MRiKH2q6?j$Rssi zDpYnuri3Cf_GQL0Wl-VwzNcx1S^oH4*YEm$pX;3GeU|&aKlkT;-gEVQ@{)R62uY!r zaJr|ECxih2hC3F2a%Ycj(%o2op`t81$mz2car&$PfNzNDUVsQfVT~mZk+zT&+W<>VEaC>rVuI&| z%i_a@SW}?cj5OPjWRZmPZ?VM2bh04CKzC~N1Av|f2wgqtB$(vQNd#O3EH0Y_z6u8f zDjd6hv=!4-dmoJJS80nleLEr!`rSs8rtTGn5J?LOS;HEs2HjQ%Yzc5Lt+x< z1j`Vtc21QDYTpj1>*uEcyL_+oofx?+1U{gG*SZdig&A<<9$>$Mgr>v8fID0d)HUp(alRU?^RtD>EJv8sG9TtF(gkj=I^3LG z3VZq`fsR=@x>Bl*^iLcBjVxEtKXC}olJ?(+~-T;{{A{_0R)Hw4K1iC|vQ0cmdfF!^u+h!0r6ZfpqogDN2T^I>=dg39a>AjxLIp{+8w zcvlt9U!&mKPIKg5QUJr^e}L|#&Y=I)44MOT;O?gwSd=gewx;cY&kN7N7MK07%Y6de z--c25@+V-V7z{&8E#dW>R4BeV9EQaVK%KjPLt4R)P=XiHuY~^!?dO-1zVlqH$7Yy`oCA{`@qL$Gc$R{4B(IVBmXpP$lu(aQ^fUY6F^ZHiaV+)bZydk$SI$RDs?*` zR`ob4O`8Zh6Fs2j>R`fPV}`@9R|YAoRACcfs?2pj|XgGb!~`;$7fSvV6+@> z-!mKUtgt}U!>fpri=WY)%TI_2r-rcg(-z``11}Qa`rA^*2iFr(=Dx%Qh=;LxL9qOp zD>Y!nVlX~d!|Hz%p^o1O09g^3e>0xYS}cHq|Adk~!UpO;M<-FTBT;IF2TL_PB7M-l zg>6_zGL*eD!VvW|hzY6HGV*}jA1A$YLD#z5`@c#(&rNR(;C_dGQQ0);EZhBCRl&B; z_fUVsJY=D0y!}8X9a)ST6xJ$gPewx07;n~s z`_9N`XD*tzYCEiL)rI1<#-y&-5RTb~bv*v*YCPZX4Z7z(iZkra6BN!f=3OqgWzSrn zqQHjk0&(b8-_=J5mMJHTmEs)gX0MZXqss zWQloiuRzF$2K4@ZB#DcbirXh#A=i89Q~WL3L|{)aM(N(1u%{#8#gjVT=>g~Al-CZn zLKdjAf821ediV>aZnXgwur2YRLF)*1`YUkVu>wY>9b|bX%;Cg#e9QHEK8Am0=<U-s*sUjswc;JpG3i#i>r4-k~35BO0`%=n&5PWlv44ya;x4RgkifzuSCR0;L 
zPR9*`)2hW~-#<#+EN#G=N93Hr`-?#PR~7Ipyi8nbHDkpSmnnO91M<*Sk}&Vsq}5t5 zo|_q)j?NNQ?2%JmqU0C1Ab*oHF{(#SYmwr(7%+N`Ex*H}sQpJ57T zo0716>O3^bEs-a=eFq-uEF2y?tpX=xl(IuN>rzgkE?BluMjl=z6kOFG&z>0KMqcL@ zv1329q8rCoVC^ln8n(@AU_n!vp1Xk+66KvEPEIc(e6yaS>4Qnmgpa<|`kLpcXKO4O zt*!&%j{8vLlp{pg86dViFoi2i^0ALai{Sf#ahz}bw2*HU7n)Q{P_lY9VSgB5-p>+h z(1miGEPggXVe**USRR6d$~{r_h>vJ>eI_Ss`hL9dbR_F8rG*1|`oxvA1h|oZ05wJg zg3VP=^48;#_^$04bWNO!FQtn4Sz2}!Z?}T2m*d~B{$VKMYgD0Bl?b?2aG%)B(p5>* z&_cNh8;NCm@1RTZTUbVw-e4c_fK?!<;O~8u2gW6%__cQ0V8xq>p1t`MM|bfdX@ZL8 zvQ0wN`tAk(JU5*?sa-(4IUkD#-`kFKr#awOe+junV-jnRh7$_&{+6Pz7PSK5dV#U9 zmiOnSh-6AHBAGv0blc6+)1e>4rymgtrJ!>*bBn3vk=*5gIU@VrD^du~07IYJ-6<>C zYC~PO+83mz@kN&5Ewu$JGgS2)6e4Nis!Vl5@BKb?g=wn?ng?FkD&3R5X3*%kr!94R zGS=zZr*gy&dleh>U2_b5>-T1E9PU-E#=qFCuaBHrI_E^H(Bp=&UuD#t`dp8jrgOcPeCVQ# zx0cDyWQoV68|RgWT;4WjDa&MAMMU!f-|BRe?cYbYRW0sjneI5Y;NhjbaT%uhRf#{} zI=+-`wzFn&=kwO;46|J)lY6>8bhFKO*QStsEjxvIL4BHF(AZ@hi^4OR8b*FK3X47G z)@YAg(!;Ubdtt*ck8O6DmPMCxjb~Ra<60G8&a;lZRg-D8uOZ)VX;%-|djF3F6IW}E z&$2FQD*6lh;A=FxO>!T4d=`zqPl>K}fJWcrKE4lq2k&e2k|jMpHhWzU&N}iH`avan zb5{I8CHh0&U(lCy@c!h^pr5o&X4~$pDGPkj+KV34{h`}MbAD?|=ojdAdm>fSIwP7( zW_%ZV=hoZEmLnloi~Vo^7TtC{@zvuyx8KR{)upKppWF6+!NZHWW?}7ZAL5@j7EReV z_fA*h&#jg6C+&B-lU_?YEwqKWR|mL$eR<|aN5n9iAS9lQiil5OheU-i7i;$JXI==FMnMBcvTJ%PX0q3aH<7BA+iNiY!1i}mjgkxK$!RYt^XDC^Q7C~ z1p@7+Y$8sd4I>c#zDgT?Ef8vK{!JjL{vj8lLSn)adi4)25k4Wwerw4|D_SD_J^sI- z{u6I;whf3_@j>As(J~=P%OnH*h1xRE_B^C+PYWin!J@B=FJ$!^f308`yFZUbh=nXq zJI0$7DUXOzWSBpO89a<{&WI6|5W~$tm znltmrK+DM%fwN{N20M?5^!4!$XZEAL==>JXr#odlM`aK?3A**hQx2KEul(cvX4%99 zyM#$)jK30|rb9R1zPa~|8O+?B>3XRvQ}4|ul3sI2kj=7@`}$0l`}ruT(|A*R`MFR3 zIv=`Nbo(M7rj!n`!7eism<_3$Cu6TPO6tR(>ZYtWU2MAbm0l{H$P8Vgm>R_TIggni zXt{{4i8W(H7deTkKb>Olq0nm%N>4g-njO9GD|JZfJtt+dj2CSqRaUasNlCJIT}jG-rOb2Pn{AkvGK#Vtq-NiYeDjWv-RKi7BbssI20 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ad59ee31cdea4b0c0bd77befe4a698181186db33 GIT binary patch literal 3695 zcmc&%d011&7C$rh-YgLI+!zoQMFJ|wq99sC5(EUJBA^f`#jpsJEwYHE8W4-J3gU)! 
z0|mvUR4LdBwjhYM*lIgxYM-s5zPU-U^|k%uec$_D&qrqF%$)6aW|DIV zmWQ~qH0#4oSF!>Y%mHkcR#63GKPaA$mVNuf$Mstt<&7R5b!y9$`%7=R4ZGP}^kW{IcLt z6zyL*8#0^1a7)rbx_r-bc-%A*ed1JOC>S*pc8%OcFRwJo;2pj{h=Wjq7eG-4X{v-6W{pZ*%<1T;6nN!fS{4(mTyNBF{Xz;d-c=)-k zo)S8EGkZ6kfu_zUXcQ>%lw&SvPoNQ++jRs!mSv*uf(G?KFcL%}|5X z;a8X~UH3ukgqZD5g0N^JrgGSJ2(bMLK6ATnaQn_AMhr&axwHd*+8YQ)DnCda?u75Z z(FP^G9n@E$-cWpWA%57-ptG}1fR~pY-8$kSw1^ku_ll;&@aZ$*$mwB}c=c0cx8egZ z@p}s2#54irEwUMq`Ii6PazAYY>I>YJ2M#bg15l&qa!@C3kW^@7kO;z z25^}+m9lzE1fc^Z$Zlf~+?v^qvdx7kzQc>T$qm|R6^$zS6| z->t;=7Ut8fvx1S}mKWBT97aRZGH~xjXI>WdIpY&#NAEvPQ!WqJSzWA~DlE~gL_Jg+ zf9#y^(6W1%VCy=6YS@V9z)Qc+Uq0$MyqA)R4^+N}+BTRPd7Wux%;t@Up1xe1F*gRy z3d}K({q1YGK76sgCbR`-uX>NaaKi}3Z(#sdFILk{@35lNwp01j69Vb4gbjSn&ll18 z!v)xSW2e~TTm>xZJ75!JHvvgE9j87CYoJu6w^7(ITHyUm#Z(=+iw0^mbehF*Nb=o{ zQsx|_5|073@!Rq6Mc!5%;o2`cG$d0nEz%09QiafGvIi}*D5rdy5H|RSj2ZfAGhSwS z$Gq6-rtnO20*-4|qV_S*P+3=rpfs!zFYZp^U1h9rtbr}{MNu}K-?|r_Nsa~g(@Of% z&9V5Z$1!x)vJm$cS{jyGc`*jH#eAFUXydNy3z4C?4HX(C!`ZsOQyX|AjEcloXnpot zYH9t~s5fgPZ(M5#_$>IAS0`#QtiQ1d9QRBz?DVpR2?q1go$r6ZX-^Cx*V{<4bRCN> zKE8)vys=U^>xzi_{)7e%yS5pP2vy*V(K0$uJd3we?1vIVw9M1VjG0=&7cMw> zMttSP37+uT1!3xS3BHD93fd-4l1=WY z8(T6_&%Z<{kAVkegsBzL!hFC2sZVXea!O$C5Ybt?d8P|wWU{YQ7MsvO}{D6mx27q79MQSG4W zsxMhNf)~s$~HPL=qk4K-BrG6LQ-F&s(V+(=1Fjk^Dfkiio1PQ z3$?WGZhtLES8pYrE~{9r^>X{lZA0JAysgjzJRoArh)pA{%TL7K$?e1*SXnfhX-)mB zh^ZY!<{dXJ(4u!4?d$sdRg@{Or)KYnoI?1z;x z)ABj@k19j#3f%I6W~}#)`8PeAJ&D=U$73VUG1})p;(z^!Wi2DES*$PkGhVHb!nEXcuJ|Wp z(W%K<+5Cjm1g?6LQdujK+_sUNw~}>9Wmj~a#cyBQAJ#)3*7IfSB1pi%Am}Hh|33FH z3EkFD+&g}yX^{&0xpGcODj1aEO8@V7iO4UF48~NLvF!TjKdbyEFZ+Phg!II0eRFdc zNp2!ppRMfqC{p`>Z~jl5{t;Hoa(9sOvf`2w($p+XdLTRen^wxsTDjlChx7#Cj-{_A zoaO0-Uu{h|xnHk>#Vk+h#c9)0-L-J-7A77BugA`c*tg;A>m2_lSdbW7T3agr$9>*k z)Mx!)9`9fFk|YOY5@VS_veHHjU}-HC$wNZsgip~wENLz6%#piXA(zYNk`G5Sv) zB2n%ymnX`76LotEA~1_v`O4*i+)VhvnQ*#g7H5GoB%hNv&e_?MEc|BA;CSC$ZuH{@ zVp=X&a)Uy}vAT1)3&*F#y3KZt4WFMA?>{+36%mreou>tp_}+Sx3_5$hIwK?qLVA0; zC3p6UepY0-dwP68qFl}C>)aDKgj9ok$h>(RAH)rM=(^DL@uZMFMYcNJJwp{SJ0mhe z2c7tvtH&1<_G&yNS%iKQ50^@XCO%+(Hg_NoRC4k4@tlJWjlZl)VNpdcie-fhpQplb{r%56w=*YQBuE^EjlP*|Ji^#}znba%k zKdefHT${xSnOVsh={Bw& W<6XzQSSqanUjD%m8X%bbtNCwBcqY97 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fb2a0b13d44e39718519f4b481204d15e4dbf2ba GIT binary patch literal 3684 zcmc&%c~}$I7C(0~NhX0nSSBWrR@N95WfNPa3lks-ii*_A;(}q3ogjd4Z)2? 
diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
new file mode 100644
GIT binary patch
literal 4466
(binary data omitted)
diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet
new file mode 100644
GIT binary patch
literal 4906
(binary data omitted)
zC$ENPpDz7XIFc|MYn|qT@)EP~5%UJ{H03)eCh``x=;t-qlLgC( z=INe@GiZawD<454V&br}Lkoxx)CKa;f$5~AA(~fSTgRUk<&P>;Zek-1bl9blM(D!@ z5iTCTl&FcDho!!8L6_d&R(Y0jio2#ah`SFxRotr4&32b6D-?aci&TsXkvTtyCFl2q z6LXuxU3#^ZQ{z1fwVgG?=9I{D6UG^{nw0CX8@KhbWsZE6c=`QU-gl2Uep~FxXkj_% zo20Q*$P{S!oQKU>(#c+F7e1r6O-|G^9@QVlBGojQmTtq4}$eVbg#X z?Ac^OXnBs}nC5JjlRw^yEew2t?z&Il=-nDb5ppJS=NhfqzFSuExUJq;(}OYS?a>%Q zbnzy5PR#_bgO|y(KeV7;$s6!Lx)tvjt%y4N{INde5H)3PIac<-0+V}v9>Rt$pttv; z2yFKf(QVuF#1>CoQa)c35B?B}p*3Ed@Q0(}$zZ44an)0B%(H}@B?%r`HN_Cd-+xM~ zjn6@4Yzs_7V>8ZP^9)=|R>0`YT9!xBTu%H;DcAGyB>59M%azDg3ZR4&cu(zB_G>>N zgX|~6Pfc;`-Yr+4bwUr;p3{r(x7AkG-~lOmU@1$GKV$ zI(g&{fgPE1;sj5_Dhnkh3glws592MR4nMkbN~F=z>a% z3zQFzNZ`~q&%*;kIPpRwv0wm9GK^cmXaH;Gbp2yNs%;c-1ZAstbODk@nGl}`EG0k33Me(?0 z3Qh7A4+?>^_p?f~R~2WxoG*Ix;up+gOc$iuDhifvrO?2eC)ndtk74?41^jGR1EEy< z0<^sCv4MF8#A5z*)-Jvi3ipzd^u44{m3pUOLfuvV{fb2}r56&+j~y-5TjvdiFz>?^WzbX{|e0=g=)zc=(O zcP+Hvy>8nW|Hg=cuHEbNO@q3!L=Jm46poMRs|q;2XJhe%_0^_PXR&6PB$Wk>9JW-Xi%Vmp>TajaR?(S&?7LnIaRDWMNbEkc_x>L>8YVRtq z>cH-rZMA;;gL{gcYV+#m98P$2y}LGl-~5gYu7>m8f`*XOxyEx&?kzkJ{*Q9EV&}S| z=BWP0$Qvi?iVwxbcl7**9?)S}B12c5Q2!_N?2>F6y}F;0>dn4z^N9CD26wU)%y!?9 zfqBZnNNGeh4K>h0Lp2m%t^WdbemetIBn8dt++jb8QledhltNXX6yUi?s?c5RP&sOG z#QjHf7&qIPBB;9TlZA0*&TtWo7gJK9;SwhG+2{jj`J3l6J_50ApY=z8NA&Rd8YZLn zZ(qNBqa0+n66Xpm(|8j5OnH%Y(pz4zW44;HZ&R8|i1S8`Nhz=1s)V}c=(^=urmKdz zZyOVEFlk72fk)x^$g551YT+}sPf7pv)sR|*XPHB`z-lo+(rfp$LhIyr{DnT%-qpU% zi>WBzx;aNuUcaNF{To7h@~oB!Vgj3^hG=vog=nsnGv`p8NUGP>OhcYLtnna4Q3`{U zl+%|yq#~UdFoMB&`s=Io5fPqk=@wGp_{8}z-L3Z&l0IKEM0-5AZ+`P~#1_~T#7mB^ zRWbB!D_9hAYV#g=5U&+XElP^KR5`2Cx4kGi>e@cZsl2Jh zDKR$=EgHPqUYr_t=h#a22-}jhML(R%G4wlHlAbttvCJjkcKhPwC)XOnuN~dKB=y;y z&b1@HEnS-aa2nIS6R?xA^6e6;Ix531)Cu$fBYi<@?IW9&G?OFaat9CL_`=y((I&PkwZuj zmuD0FZ=Ugx`KD>1KmD($m5?!$iIDR}hoK~YUZs`(O}8+bZdEkUjt%-J{2d`t{*9hQ zhsA~`4O6gOOt2a=IBIQh*@~tle}?!Q;J->;w8095ti+IruowwN&_qoi`vc+A*Ywz{ zW=AtNu)>62*PmhyD}T+H7`@+5hQcV8hYe#*2xX_la6*hm_;OnM>a>4t#=i{s|0W9p zRgtF1<3G(t|401qzw$Ew&t4_L+lc~I1CHRW1s_19DO3=9dCl~jJp5(}cM_(ISc&b$ zV(~2cVPu+EoGP{t7dwl^;bMny*_u7AFr67Wh{bM9PtU*f^h`EPWIQmA#4{Id!4@;jgZ+GyL!Bo@1^9bKF#9o{bbRRx z=|<+yLFR-Gf|g-_vLUngm3?BMpH*zAOSo9V*vsJQbZ80qGWVK2o0+>a&2Z|n(1+uR zqSx$`Bz{(L0sb@M0{vyF(|EIn^K z5Zk8?pOY+D#){Sv%PKi+BujF5U6y)4jB_%LBg>Pqr~T8W%+U;Mf>1kNlCUT_ENN^= peEe8RLPA`sc1l=6Vq{#bwz>5<^KoV(4^4pI{-B)z@TC7s_AlGe7Q()!s=un9a5mDN zAsIKOznY;L&llyRS>Ml zbh12q95TENf|vf)(zVP!m?)^9AleNdNmp8(5Y)c@)@;q2R@l{HHFURFMvG$}2s2xR zqUnt_!t&9dz=P-9(BS<`%yL}%&{d`t=vs>%`by?Y=}h*Z;_>amm?A$aLtX~#oTdnl z1t!yb-xQ-yryUU#m{PQrXCZ71h^A@_`%BI4Is(I>NP2QxAdKCx3MR~53zjt5fUlQ0 zk=bj56|T#cicOByph@vhVQc6BVejjAVXD|ke7@34Fs5j>nW$zos=D15{(2~rRMa(t z@S*cSRC@_=?q)Tt*SrKNsQ?~!l)@tov8Z+7gf9NwmLRku6Nz730NPiz@Yl9<5^b5L zXz{r~7KQhsOiGkQ;_GAt3nGOn_Z-24yGO*%%?SJuzD`i6N$jz0h&}MQ^@uX}SOx?2 z!4fO0rNj`AAW`tTS-^4L4qB}rE7U!oDGI+o$mCXanV9-q0=f_+ab`!O;6)6eCio2o z538~T^+lIK&44rL`(^dS9v@p%yTv7Bzlf{E&+sB0Z#M!x&!B+G?iW17x;>*qx zXPaE7MM76+i)Gu(xgR5Zu%0R-}=6676Jr>OF zc@k8l=b$}HhrsG$GpTp|eyVG<3pmkWK-m!qFf1fbth#m`-0G2UuNz&3^j}m6CKlUJ z0TY9eCSODDn#)M)dJPo}*M^XnMOy^AS52_-yIDwCa@fM_$HicB!yemEI}a#deUdm7 zwS|aVa1TaVlR}^0;;5p7_u=a=b!4V_50L7=6{e5cOQakJM9EDTaAE3t6zhIP@=f;~ z;jnm17?&Xe4W`@S4D&^V?=FbM4^@=a=}I(1(cZ1l`HrZ*Qj3x*)v%_|Z*bw!`N9QJ z+fe?obovHmi4w)Vhzs-dpmBXUtWQe>o^@*S>K#XP!|OP_q?m)w&QX{wu=J+HrG*09 z<>R^>y)_Yo+CcRNoWwQD9 zYT&eefXQKRCGZfBf$dM9q0E1p0E3TIHhm=nn_fOZZ^jph!dfK6(^EQV-MSXqjP^rK z<5c8Si!geVMF32RoJrv-b%kOUP)5 z9{oMN1`HfDc!>8kBNzmi~n@&xbX=SG>J*c z+KDNtX_L}3GPAOQ4oxO0Yn?Kun|UrH)8|Qbw)xnG!ETEEX9}HUDpoGiDW`For~40A 
z=*Su2BHh55yuQDUPne}ioSm$d>Yn7x$@+Zmyn-(Z=Py{eXz`MzMa!12Sh=dW;j7Y(Wt%o{`FiWN?d3amR_xlnXK&@cs{P*_`1ZT%nuFgT`r+`AqsNY)IC<*y znX~84U#P9SSl@8z$IFdBU1_>{t@(P(jn<#rZr-|m=kC4s`wt#I`sMMHr_Y|NPFpD% zWNbmfovXMgGcAj2N7`I6BP~}i&}L}43XjWqDJ5%TWE&XOKNwXx-|ODZ%m-_>8{_u2 zF$t3(prCgG$BcLUGMCf9DPl)W?d$3~5HA8o26KCV12+b616Iwl zYHsk0sHwiTy^pC&rA)OAZ{A9v2&#%JDP9F}}|x;eprZ&nv+oFd6mnOODs+ zer|k>XI64h3ajDn^X9QTnBqG2kC7N&RbaG~_mB?pbsa||{_&B^mZh7S7?A$+`XurYOno@Iw~%b*M{2DYbBxI%iEzPPV}|Uz?MgmYrqm?&ad{;-*k5 Q0eJfZ2T1_J@gJUl1J8q)CjbBd literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..842ce2caacab4c58384e71071de8b40ef03e2e4b GIT binary patch literal 3138 zcmc&%c~nzZ8o&3xmlXntyqI7VsuB@dL~inR#F46h%sF$W-%0NF?Z12TeJ8PO zye~sDuFMcMo`53&4nZ3{?e-qKe>b7w(9xn{j~bI1z&J8ePt12<_zYhW0NjOEo=j+! z1^|d$5gus>DWu#76$~x+2Yd&)Oq8L?*QRA_3bPp`rKMY-v=%BOl)_h8k`rHmK>!a9 zcmm+hA==?HLiU!42=)q5CbV*7LaVG4hyCIyHt_GT5ea)U{VZ&EiPn%=q}Sz}(zF_r zMv{XUIoh;LLtc?SpG)KnlwN?!a!T2T6#s?Cm!?-@$CTNrnP!T+gc89&-=?Y$&7l7d zFYpf?h~}lbO+s6D)j3VdO@`ZxucCp9{FMf8gsDR zp}>0>zrnr^UFmT{A4r3FEemQv<@7tkq*W?jo@^_yop72y5vJv9C-y^c%sD3L33rG6 z{Yzn2=u}=?=_rZAZ4Y1$jOWj~83vO#uZ3xgHh}VO7w}QJC*80<%-*MBrP$^~6PlU& z5Z;IyEgW*~Hp~`#iqF*t2ohHkhkZ`Ap5PmX;d>u-ZQ?H7Ir&}Y^oGg##1eV zOX7uD-+F+1w~vXvt|IVR>_&m$<^ig9>`>r$>w8{bzX}*;im>h9e)&Sl+w!n^}sk6Jf1do#dFI_bTeAi$Qw6D4dnns^RpH{R} ziXSDwZABHGy7e5@((S`9q|Whzqul7-=V@xhqczUm)#F43@*Cji-xn+xbQ;v<7NC7A$HK-+2T4%-L0+G!-r!V+85!czV02`WSb60dxHYhN zsBuaIGQC?Tm{IA%3!M>;G{qWv&mzY5{NQndiMmMoqNqk-{J9%;eNuuHRn1NTU#TP>|2;HfEJGbHHEk8$X)pKeIu>!I}l`tY=^lq`>CvxfU5f1 z8??^egpz$P*?!!wKsX`Q8K&fkK!^PfIIr(=DtHe>;_sBa{%7jZJoz2RQm=1B?e#j8 zS+9mo!+wR!jxQCKCDfwg6S@5BJZF?49!#|^Hi529J7IfH2Jk0k4!Z5wR1{bL>Q+%ejwIY}Gzhj|S5^z!x@;XBfAlz+hJ*8|6l4H`FoLNKcenHU-t9uXNeNgX{oCN?g9O2X8{ zq~w&;H~uzldRn?BBU7uJk(He@GdEA4Z!pd(C^Quno8NqE_MEx%=D%ICVBtH97B6|X zbZObLh$Ml&bEAUuC?v_h4zllFE4g|b*cOEm8;i!t}7qN6b!P| z_lVwwQl!txC$(NzNay7gngqH$9Vw!?Vm4FKOODc8j>^MM%H>=ic^=G{mc?F;DGd_H z0l06OUQ+d~5;CDxhE)9*PqBgLtA3f{RaM{qSr5q5<^Dj%FU-u==`{?E`GvHfHv4AD`FpB6M;D*@u|P7)l0gnq}2wP^FM9$7e^7eQw6c$-Ws$ ziACD5z}%GN_-w-0$Kv?jnu!O_UI=Fd2Z70Ak6RL6Fa5&QB>#MEcowT6`kZ^54yF|A zK7Q&{vX3HzC3P-zOFX&wOl8s}`5RJ_CmT|exzw@0I7@y}3BBXNU}1VL9s;Gxs0~jv z5e6HnCi0w++3G(olEWK=jmZi<%T6TA@H~QnF*Pi3X2!@jF(#vk5C+*y@W&xqE(M>d zI87Wm&JOcaSe4mwPh7A>3(K$^k`^T{Ny|BxdWt@*2>apkB>LDsRwY+skpwbVqo!b1 tkfDMrWsiGnvO^pskK&o^_N7105SRhIj1S}K< z1;Ij5KoPqE6~P*f2Fr^*v4HyS9byu{B!7Hsz4hK++*9}7zrFW4Gu+>}N!(2-f?7lc zx>AZ1r~#NX9ZhSGjO^{q;M=S}{<%h2IQ;TO8_#zy4!sFJ=h;?d`Yh{Pt<-rbHK#tB zKDawUao`{YlqRKZL`+zhlLTmvn zzF5TdmqrFI43Wl$P?%64e1e3pkX{3wT%;+j*!mpudDPX z-%6ezMv#0n&xBlhMejKIdxi!|fl&eR5kZl03j?KbQl(IO5E{5JASygQB2u1n4<_7= z=@k=t`MBo)1Nd(Xaa4rSj$`+hP4uSYojjs(K{Q)4UjrO2)vC*A(H+ zW?gvPZU_OR1!!+s6m0xP0J=Ht5Oq%%z*rq#Dr1oGb13*7`UqD)RRAk^GL-pRv;tvkuIvE&*|+DHLM{P^eJ{D?eAmLzoKPjudwP+bOVlwgAKj93l0|k7!A}DjXz6 zK}0GSvW!S*KXeF;Ryu&&2Nu+47=m5~3-;$q;p|-%=(s|{l_C@5Qc(hzsy-N#HVj5qnZb*fYhdqneHay~ zhTataiiQS0L~#ZPq9b*{VaqItsrJO`LTZSNicj#i+8iApo2$SXIRo~N*h?%Lx)Vk% z&?Vej}CQ-StOJTI}C47^!A((bQgb3n1*1Y{Fdar*0GmCn{PU^e_?J2j= zxss>IY)CX#6cq^X8+PMd18cG}uM?_UKA@4{icLOcf-0Pq(44j=Xx58CCs%i2DGfMm zPMU{xdDz3xmaj30;lsyb&%bGa^m8ZhYaZ5+*18yb-AJMk+Q#~j84EwPuYvs2HDtlAd00`M zDQeVjz{i(;#OGh=!>vz@VC$qNU^T|q*|G(eWDENl8E)c>pMhAIoEHGA9y^h0tCxY% z(I%Gab%b93iUUaZfXTF}_|RpF;4zSn`evL&acW!?c*2UjVQhj(g9VtrbuAgt;YYeO 
zi6E=i3mIM=N>-fxjP?yViyQ6RiYNCt!m*gSSmO*wlpULf4_VNQC#l>;5uvxSrO(!3 zPZzHu>SwwkPM;MPt#S+viHO2Vjw~hys7vILLo-QfO*p@FUzK1+m=7vTxP=YZ8_6yT zH9!NG#JFVq3ZgP<0hajI5nbuOqxyVJ2X|fPBJKh7v(h$=cD9R5MKS;L4@lJ@519%w zSv>EjoQOLd?uzb4PNl~P)O6kio3}%O8#B(3Ri{#g-MpiVt(-1UjaE2_WqTGxu2B*@flPp!&)L|#KK?DSZn*$G71vqm@u=mk-vZ5}>ZaEyYS6nqjQDl7&&xMd1z z$mT0qb=H%rbH_n@S0c9P)D6x|=Xjpp&D-!`c!GYkM;#WozKXp#TZ^<`?1)Jdq{O(! zT)~~s9W3t0>)dPHJ?!ZBy{P-xYD_byMPPY#6D;jIs2%=5h>7w}<9Fk-NZ$>QkmoRh zWBt*W+}iX64dg}>5$eOCc*=S*Y<3MEd&R8e2PSfpXh}#YjQW zfYvC5!j+Qycs5H*DOE5OZIAm7U$Of(Iv1P68dL8M~zZQHXk z$9Pd&n%Hh{M&9_4u5#a#dp8wKh`xU&&#r89;gqCTk9tp*ZP__(HO3t}y?kqlU7DJq z->LGeQ1JJq&IJxt`SoEvwV^jpR}~zIif%dk zJGyU+{th|1+Jx%Apr`Lhr_n2VDA}mFmu?;Nc*NjNmVwEhn{qHWIT$Gmsi2{H8)&GS zg6`@sP!|gssC*e{R%8wOQIZkO8l()$yJP?_#4^S90=u#i%R?SKro*_^#1uiz@qj#x zYx4$+V7Q!;iS(11)aRlB&Z@UBW_<)=`vJ?30FUY6ixeiK{&&B;eyg&`CI#mT&6D_2 z+f)UyW!yWy|MYZqL$A6d)c}W08j}*V3qsxZ%*eBh{~%c6S>aLPRll4H^QxM6Ea8_ARJcz~ zz}akzWMPC~JzWveJwF< zL&9^R#V{EW<%wj97nA2kkuuKherlWyh!yXZ24E#Qpm@*k#47OsfgMHzibpi?M0)ui zHNfE|yjPX+CyD4-#WF~DkpVTJBzv)un`4oD_@K0Yy{f)fQ|{7$jx8e`v#s)Cg3jld z26s2*#RgyADOoR^njaT>rEE@_S95-R*!2U_j_j!g2@yAsEbZ%VE=Y{Jdm@EB#Ck{4 z(jPlA^}UbpSQguNxx_KYx^Q{?)9W?C*N+z_CqBR1x_-#Cq7}-}V*NnwMm$LVWMz>NrvWEfCTED;_W8^;a` z4`TQNO%7UAg#+fohH-jZ_4W3%8A|l$&-ULkfN09gl~R_!{!ipIDgud!AdIBBP>&La zkRmQ00%!u{{~N1$#C${0LVuBbMdn0|sa!;yFFFhX`s*sK^c%s$7=l-$Ow*sHN&iHD zRDLHz;X#qXaf76<6cc7(dZiQf4o;v6&|e|`0r>CI7iU<2h!qh&2$~w|Vt=BT zx|**0)oo~w2o{*=>ocHOgUVm?Ge+-^)1femnE7REeGE@1!#DtlL`J2r$_4oFQ4|JFm=Ii4g!t6)5 z(eW)?LN{`Mc5)|l5VQ>XlMk7_uk2&}ye%RF9fKuO#$FCjr$dYHm%01gxy;;!X$Di5 zhdvlj7`w<$oyMCpn4gR1*YVKBqUBHVFr~DO4s`U2V>Tqtu8h81 zDe(*b9A|mG>0;CJrSuZXbY|!n&eR~<&tcLMf3qaICgzM1UF1Zj{&a|gheEH}$}Q>0 z>FnrzUx{tv;5o^IWvpl&iM*17M)D*F*X61EMmWUNIPyFhd)hy3${fw0CWy47r7=t6 vgW|>pL`RR6#>7M=Y9|E6#D+#iYMWY)GaY9lcGU#<{STT6`nN0kce#H9l9NZ9 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..84c399e67f3a1f307e88c5325c849ac627b14610 GIT binary patch literal 3138 zcmc&%d010d7C-mC_d)_h5qKeqQBWjqAfSRv5qTgW7-Vt5&>{wifNYTjQ0s!USVXK# zaU1Kxh*n$}rPxtK!37uUhFihbIy1Fe>rAy{zp-^%oqJyU*Ws5-wgSf(Jt~`^NJFA;GKDeH{m<+5?-PSgaJ|; zPbsxY1Axo}laWP;!P4d)Z2>w_av3jfWZ;IcA6k(V2!9KFgq3?DnJvd0 zLAJ&P8fseMR{0uuR$0LG@m++2MvmgLR`-;Ad#WD!XEn2%%OnzRKr-$hT+gMQPU1d1 z{s5wzl(6MVG!|M8}=z#kpXmBmUd&|Z_?{O2L>0&QtwKoS|?|cm1!~Y3? zPN@SX$dAeALqJ+s&K{akC3&*S40l>fVc=CYw{hE>cyoh4I^4gJsXhA)y>;#`6WrX# z&a4Qa6G-gwTrWG;9gY?Ac&453H=VBqbJNbnmw zhEdOTgtks!iJ3Xag+zO?doHq!-=ivZXZ0A_QqvOD$~4*yiv12PxPJpSRYfqp+@FFZ z?}6RozNcVsb^+eE;tkZe)=3`Jet_#SWeBuhF2?!sDJUkY&|drbO}O2=sK04a1Gc=k z*KX!ocP@NpBsLTo*t&Ol$BUk0?8X_R*lV)wcBY>?(e*4 zq}*_dIXZbeqp$b|P42}?gMZO;E1SMWKd(2jxh}mSGwdUj9lM{&I0?+UuZF<+d7JPw z{|?7b+zO;)lhjC`BZJG`YtaIiN+zTZVf(*oxt?e0@dDL7=W?Grvi5o-POI0U!vlXo z%Z@LVR!rWBi%w)qZgFayYVXOMFSEe)O}kKgRw@Ks)Uh}248pepPogWTQruRma;Q)T zarRrw?L1Z|cRPN2CUS6XM5Xd9xKjNWX060sUgoMsYb+ly^J{LRHuE~k-~$N|GUF>r zwPS-r&DR^iyLN;_OOOVJ*-u6HzW+PUec}Mc!E(j?Djsz{zK@?xUm}b7(vkW8vf!9-$}8NIDs@jyuiow+efoO#^V0SoFmTXd zZ=WH)e*Qy;4G#z$F>=)CH-g3lj|~w*!^VY2M2?S&o}i1F7#kO#Flq9Xsfp9{NpHTD zJUwNGAvG=CI5Q(NYgTqnZeG4=c7fSaSX4Y`?!33>FIc#!WO3;`@0KljuY766vdZP} zuUNTi^#@gJ)~;K>;ddK9+_ZVi*6QDHtJ%Keqn)+8cJJ9+_lJG^>pyNd@W~$!9%?+? 
zbmVAr%dz7pPPU#pedcW2r{~UJxOl1k@|COCu7B3idE@ghZhm=7`%tOjv9-RB?^-Bj zxmkI%)*H=iPL|nXXUs9uB1$S|3!~_8R@`>h9(2`Kihbn8WWKU2c4`2H;jV3o%Kaz*Jmw|3AgawO#k-&_ z6%!A(JMmHz`ySN3==j&cf`V0PRJ#9hFaM2t?EmG7)&5pblA_#A!D^+7J#MZ5Dvi=n zNJxlH9BqBDlv`O}N&!NsAPDi~OKE~26bqplLWCe>2w@rGStt>Rp{p=Kh@vyu+h(#8 zmu6~#8WLjX4fgdNNfzM~$J2dSF&)F{KuimQjt-%Ex)u3?AKhoC4xQ+qnmDyEJz`Y0 zep*5%<>$r`d~;`!L9`bp8X+JES?!5S%Il(UPD%{OOOMPD3{+oqPr@Ojx9td-kPC&Q@_NpdmGKY?hy6mk|SY7*om zI>Ofrp~cpF62Vd}A|r^Iv?_@~TF=GM^|=v+#E%#!)hG6eD!m%3Bq%*hhJx9JMvG6H p$>d`wD9A7NC^8n9v-0yi`~!#h5Ajp!Gyu;(&_sS$$B};z{{p%vL_q)m literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79a6f24b35cf797574a2db8d32609d38de32a0f9 GIT binary patch literal 5244 zcmc&&c~n!^*5Bt2$xR@TKyFMR%!8mPgMgwq1epazL=hAv3^|k%uTkEa&_Trv#@BQ0*pPNH|2~mVP z(*&JPtAc1f8VmqN+?wBe_9GwOz$1;jeli?I8EGnaxGl>J?%laV^I5}L&hX1)>xTCv zXuRrGH#R6z$X6SzH>Vdh%XJ>pK+9=!2U<)EX%5YiyMrC|eosoh*9HJRSBz=J2q*K7 zmg@o7uEpduA#($cP$uQYC{yERCn_@&X-p!P{E8$iG1~%UOZ}5SnFEsm&=@d{km&%J zc@d%$gP2Izj6w(mJOnt8O8_aLfWtgG%%jUZ0_M>ZP@Y1@j1^IT$NDm-t^deHO1>x4P`(+w@I!g>T+ZrJ4o(+JbAw1W^K>aKUB9Fa+t8e!M zCt)OfIb8-NKMG;d=L)neLl>JX8woW>oZ!NCKd?+00?#MCMVAi`h0Ss)vOjGN#8XG0 z(k8&TViDj@`F9%E=kD^Ou za%6Sp0EpN6gVmW!kY3>msozWBwBAg}pCgA)Umpjbf_R8NWC^WN9N5)Sj=J~=G%Y%C zQ)CFWH5NeZ@CGN%3&;(yf{BNVL3Y3yiZDCaVXT2A?+(L57!B9`Xsm11C|Ey946?8M zAot0AG&f5GM+j?3$>qUj2ND{N9Rr6Yz7X;&2P)L|V5{c9p#mi|-O-1}D4XCJ z7lUdjx~?Ou(|t z1%7WEA%hsq7G>r&@vUhKZ1II1JMymJ{mzZ{g)(S9EqrfevLk z0rpz#C8CXXfpwG(5wg1noeF4%juD%P1GC$8206Ck>jLb-`T9dhA)2u>TTY_Bfi0Lz z+7s@AwpM6ZaSL56eu`WMq+>hN;-K$TDbBMSLmnt>gTvLoB1;IuM%Fu_{Q&|rx%M=i zvCTkr%iFOPr*K%mU%F?m?f^=hkvgvJdd+iS(9)ieSzXoEut{9$0SAn0^FVN)j zfd9CAI4K4Jj97dQemM{Tf(Rc-HXVZfFzXboxp)r0Hf0Rt)y%#TN!PI*KoW5*=i`TzGHXk1YeVJ&KIhj>nec7H zO8ESIIr+uz8Q4yZGpZVR3Lp068+>NVBYey`3$9geHdgL;1?hh zMi$1xvTlFUVEICDIC+{Qx`EJ(pK$=?9&j2v8aG;~3sYXNL60WXA(a6S#hvpcD>J5I zq+Jv?aLiFMwlR|2d~_PJziLG8Z+eFg4rsz14sOJ&CitO69e%JiGY=mS)rl|Azk^be zzQN{yzZ!cwXBklu9D=Avo>;nmB^r>Dh83TfL%gP2$rHzdNo9Gmu;gHwc*4Bt=*#R| zn5nHLcW06vdfh6+6~h)22hyUjoR@y+%B$O=A67Q~joxI)OQ znu2K_4o1J6O(A5LZ$ZfDTnIgWm2AF$3biR;!t}HGc=aFws%f8&wd=hnsl;)gC6 zzxNWv_qC#5y5`s1yyPYOp? ztQYQ*j3euO<2c4{dFZ;&R5I-ge_2@fHasB_(A`g?^~M{l$FR>*upzfF@KyQijvD62 zdvL0RA7NO?pDmlkZyM!KlioAU9@YrSP4)G{%uESCMdu{mt@fe^ovjrLhj~-?(AS(u zn+e#9WD<0~>Lv58B;t8Nk=%HnwOj4dkne5vG@}C3{S<-@E>~!R0;w|L3KpW!FWdeWYL5ezyNAuRCuSQBQE^pAJ z@6nQ{H(^JXS0=oULZ0&m2EYnXM#-RuzbEziZ_1-rV3_g`!cPkn<*$ z;|cYEzgkiU=N|c?onI8_A0LoG9ju7LBa(U0uD>5GF;wH@4kL{Joh@m6;V3q#?VuRK z>-cUxVovPQAXH`l27Ow)hFTl856iBb$GJlqVKICw{8Fw8uJ1U2+LB^m&zlh4o8Jz? 
z?zq>ZE3%c?#g#IhwML#KzbKDuzBx*-_Q4#aBR+*z3XqCdXQ5p3zD z1p7~wkxCtN7pBACevQy^5_d1+j}^1LjuwD`PH zvgY1-1uYYCahYV>cz1w&KInAx45*p-ftQ9wYSwJ*Q;T%m)ttyc3*P8rMiDQ5yjtxZ7G$>ehv20K*OjgPs3BTZo;v8WRt9o+D+76~ z)%80T@>v{G+nv-RzG!8#w~$&_?`~uO4KJH5M{4Vz`te*YTe?V*Oil4d7gHEp0uw&uRvc%#w?nX&_3bT#o?%#jhXkJ<6&qm4RP431i6`k3} zsg>QwjnmHb{cN0G!y7K2-yqA8XEfQLkY`@o;bm7Co9Mg}w_n$Cb^sMrw>Eh>2!_AiTcIKEZ?dv{aw(L#cOXjuVk1$_B8Z9ti zX`=m1B?Zdgf1s3sZ)1q|7{YeDhn&XxiJbq3G>4axl**F2_)H=>DO1IbPmX6P5W{z> zMUv7HlDp2fN9=4*b6J-6=WFI~NzHOZ&kAC$C6?WdqpEa)L_|DBGW@PXKoLsB6GALQ z>|z*VH~TlX_L#k~7-2594@jz%HP?!idauJ0yT7h7O1}|Tj3KVB*Yz1%_uBKHXr2D= zBriEWH9^%+W;K{(D`wj@%J$ol46*ww#6JN4T@7SvH;{5NV-w?3lr+ImUK{LB)Y(QJ zbjWZVQyp-_q#qsx&FNSEkax0re;g5w(VQSp)|!xNkBep37=!Ts*ldg#|K_ZJE%3i5 z3lgzRE(`ik^ZNf0i~g^?)&H|sNpfJUL~Kh*wvP}4$mCL8MQG@x@R9v*mb8d)WW`P4 zrBEm)GY=~h6p9>$SAxPqs%_qrr}j zBbb5DM1OWZK8J06*oJYfPz14!R|GrN?kk+w`MemHiOwd z7S3fFt-tYFCrl8G^!w8e*}V_!Gb6*@Qsev*6iU`!3(us(NW}Yj=+vp~JdkbrQ`d&x zAJ05y%}b>WcT0nTPy__A`dX!& z_xO_owDo3+&B*)GD-_=B&@Y*-L3*UG$J`i~1x!s`StX{(Ic)uz5c>~>S@Y6bGLbXc zG5Zk;ublpK(gw>~F**uuCHsxEN%pU6Q;$gT&0=u0d9wD5f5wzOntn}?nx`u>=4ZvL thQy|)4^d`hq~(}r$7f_FrKOrXyAO39>Ld%21N{C6X9(suIp%lA{{vSbUhDt> literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e67164596565a48f5cd69702114b2db7228ee7eb GIT binary patch literal 4782 zcmc&&2UJwoy8ibmGiP9E!r%-JMVg2pMX(nJgrTX3QHr2cDbl2xD4-Y+u_Fjb2T_V3 zAZQd&0Rch93o4>fG(=;Iv7oW*J7_uT{nyN{fBXJ@&YJy=lgQbGNASFO zevUjY55@o(H#@%FRrYIjmd%K#?Xi>L&tew-bhaqo+Du9a#WgNb6yAD~oX4)=XLV1rqWIfjO|` z1$>T=cbMPGK<}tP9;Pg-dLjA*{SNeHJd&>mTgCM_^zTqJ;pOm0(F`z(^6E$zrhWaS*s4`2d$cRsbttCLDAXK*>{iShr7v5+k`-uwW*%)*8dv zJTYj78p5+hAJN5{$&fDOqlwMhKs+)6D%=4qHW1KXqYa!18lb3T0e!*pu+x1itWL9n z1%tuhu~r>8nTl|AVKtP>#(|nqJ-SdWL|QGCpp<3{S}hkKqR|Y(eo=;ISr5oqC4{Z- zkAii!KX@I|gkDb;6b{y*VJQR+=cVDgf+DoFjt3%tJ{UKQK(d_{EUYO8L8S>4VS121 zt^wjd)xcet0|R0nHvH`@*s(|n1c$|t{P0H<9HRiWgf@gGb0FP-gpMOez#!fRoZhma zaoa@D-Nu4L+1}87OAfj&k#MQN7}-}8gLc$8c(=|5wC)?hWuJ7o^)U?AM!P_Eaw&X@ z?Sw3GITYH@h2eb|dY5<)CZ(!^X0<6ieU=0Vu1)~$urcUG(XU9w?=FhgLl6p&jWP_2S#@ zCW6VpT?i%4VJ(^8q3AIjDUOYAlwh$oY;?Z^yZS4Iek!b z`YqA~M{MR9V^m=$gBG_nLyK-CYTwY0Z9IX)bt7Z+cC(sK&GvSco#wywfPnn-kM zK^LUgFGG#$Bha7}i&ZDj2etVQ(0ox9*V%Lg$l3?Ox6*9MUIabhMf|@3rz$>~xA&>L-@VTePv$c|Av8u7%c!K9t(x7S=9%Af< zpM@xxnd1wI4{XUX8@>jE@0wW(R}mWd6^C_$VlbXJ2Uq!;3p>^>$BsuX#gq&Y+2kNb z#m80=?|HrCu_Fse@468AlB#MYht;m=VC)S{O;?j$5Tu9R_X=>)l=VbqxF;6(N{lYO z8&Y_Z)Wu2Z^Wq#vk7Y8(b+GLdhpdDt|v3_)A}-~wci!%m-AJWvxN$picFGg5 z&D#L_$yF@JXb&o4B!T1faEA05%|uyp6BlH00`IOGV83($(#vut{M-=E?%Q<*nx=JQ zExY^h+PNBXI@_}d)5X{Ek_onW@B8oMXEyGTFI2W5+im<<0v21_n0Kw%l1$#GE@PN~C^3|6;l8IA9|oBuF2ANv2*3#8VyJ z**zP}fjh60@S8st8xrfGhJ7}2hKn~6)W`r%T~pz(hmYV_t9vo!x^POhybOfTWx&1U zJl=E7h!usOCoSxCh(i|%{M`&v;ac$=&X$N2eT}5T(#g=#ABTCJxlS#xi;>d({wCZ}i=7a$qydZGT+Lp! 
zSDmz8CC0pCy@|<9*-As7x>%eKS20`I1o^bH*sSV=h$|$eT;Gt`;9$^o=9>RhlE`F}9j*u;Q2I@;J zv1^{X#5YO{SfxtVD8M;^q^}yCvb?v_2D~EfA2%g}DLtQHe#B@gWou~B9scoO2z^7s z3o~Z5h)Zx10i*a9MH@Ea)EGV2(?uIMvE?;wvn`5~lBM~kfrF=uH*J>FuuSEPO1|Bq zsOMbn+FFvlb*!<^*=$j1%Jy;7BkvEkmTumuZjnR@Ecd7G*0N34b8FkbWzPhs65AZh zvb4-euJwV}+RCNMVJv%HRTL+WBJ@lFlvbh9n#TkQ8vt~i}xcS(0){_J>+XMb7Gxy&)&GSmKw z{<3P1mL#6TfPs5s$j!D)hpUrUIDPZxEoqQl>wP9oFe}9%r{4ekz8ULShWi?VE?2rW zr5NTO4gJ4I&mD~}lVA{oOCk?3k=qi0x>f1MNq8E6yKT!jaZTRwsQdjD%e!jwo8q1h zcjt-^7qqN<{rtiAU55)#B?2xbw5=^_-9#zsdz`H;Zr?1gWuIqTSJJVSKPC9a*}BrM zof=kYLc99?-Fx(wRC;vRmz~Qp_U*~DJ96Mc?)0dKH@c4;yjWpdX`Qc-z~9qDN2>soAly|+2di@TR7fD2?NI`V1&P{qX~LYw{dU#M zkDDffc9q2Mi=L4xGLa%38m1-!m8R$*_<$0%J8kmCM+;i&SB=OdY6sBCr|u2DzA0EW zk??RpiDZIscQu`G-BH!a3BYwYX6%bTP*oR>(Hn|27Yo;HlFAe$Jlz;wv*bj~Hp4-T(-uyXMP!_eXg5rD&%3KjKKEG8#E_0D2!nT{!v6oP*gPAKg6H8 zUDH?TBChhFq4M5Ix&u18NoW-oFOjnU z$dmkEdz6&#Oi`BArIa_AC;Mt*F0vGfM2qP!BMu@_oX9dj zWFryM0LCdH= z$&lIm!amB~#XQVU93b*$>?Pr8IJCHZo;xpH%FOMVW)!+4_0fD*(`%N|-Y(|hZmtW% z-CZTn>3EAr@!5a%Wj=JVX!%n{-L)Fy_GBf-ho)Bb5w=4eJWfv*wa9l0jPKibeYBErx+GBP|)Bi27M dDkwZm!({4YlgY*cMP%$Vdir5fE zrHC{^nu%hgs$d0IBUaX+f^MSG>{_C6-y4F7o8*uEzWp}$o4NO%dr$kFdp_oehuqVW zAsIWy!JVNQ7z|*za(T^**yrnePfWR&bgVbyQOb~p1?LXQf}d`xo03}~*;L8^qs>Un z7$Kv{@ED$+4VcgaIud%o3IK#7F*hSj^nk1r=`o~qJn%H75z>n14`(;w z!Tv!pkqJ@JDqwGs_!*L2AY{+5-hUDE58*b5nRc-|bA?hN&J+E(pK{8a0dbYHaa~w7 znNj*PysR9DPL0_mqK8j|gToGz3w8Fx$oWR3N8yjC-t|1(n3zSD`FDy;%`Xv~Tt|WB z^#>3|p2sJ597V5^$PrUqGtpCIx%=-lcMzo`UumMD2g$ zhSkPlB7r#$&Ldtyn^ljnw{HTa4eBs){r=5r^QXOEZ!!$T?S)b6}{1Y--TMHVFKf&MqD}k`JAYvFh zpcB%_V+*(Q-ef7^VQUKPK2uHYDV&S<)mftB#`T1C@jGJPg-68H)5G{9QWJ5t`8WLJ z`8HGo|9dLZ!k0J)N|>Az1grX7sKKk3f!Wa$yg}a}^x|J^vjr4^#b;9pon>m^^)?;t zRHhL_=64ZB*{$$sS~FUzaSz207NDr`J9x?W>u{!@4G4Bno%$$Dh8Lfy+`Fk+6c4qErhRDj_@sWiTb*BIu%eIsaa57 zp*=0a2NfsY#zSSp`To)c!nc>Cgxq>1Sr#)NC%tq=SAV)YsKD5TX#3QKp&aHJY}ak& zyCrL=?S9{l2ASj_OYJP4(EAA;b(a>byndW6^BRU8G-^_Fb47xl(8Ws4~T_e0- zY)8e4s&H0Ii>CJ$1r;eN03)-d{6s1ds^9O#b5}I-*J)5Z-SKHCsd^c@lr~OX^N}ee zP4eU|x$lI0_U%CaU+#g;T?U|3iiHLq!)WuYOrfaxI9}}c1L}60K##oh2!#pFgcoXT z_;a?b(ToxuCF&lGMz2mqk4UpBF8suP z@MfIn73OWzJI) z7brV7Bl6H?lCXHOU8gI5iXc699cm@&`KB|!M_)X-346CY5&8x{126h1f6eeSP!SP_ zt1>@D^*c4yZ99%ogFYV%t({3YW|lvi?wTN!-MRxkLlcc-z3OoM`U?J{od%TSB4-?s z7(iBj$*5f(F@^6C;!0i<6!BwUccJUYR%5;F25psB z5fW+FqZIXUxSD^T*vT_cPu12zJL0zyD-Ya3?aFN4=p&wBxA0qDzFM8=z=IqxE1e)} zu+;-=;T+WW!*d+Ge2YQzs`W08LM9%SI9z+LlYh0I&N8PiSw`ddkIrI(OgBe7#IWhiAA;^!*}c~bCfBP(>%q+k_b zT4I;CdJRDjHt}i5TeFt0IoxHJU4BZcNMaRsqalCodJRKGnnYgkWxAG$XNgZ^LF&dK z7J=t>$qUzQ(w!LBd!w;%eWrn33N2L}NXr`Gvem@5=|K86V~+xt97WNF9ix3}!n&G@ zHfEbIY)X^bAI!+H4(lxOZ9ceZ&jjdhk=o^u6Xsl@7CGF`%kX|La66bmDjkI)-s)Ut}4HI zz2*qFT$k#C){PSD#kbE@7oN>DwBMlTT63Unn~7K1ytbO6^Vt?b?YXYC2QTiKsC<08 zt+x2`e!I0oeYd)jjzX7hrt{nDO0O1s6uRwoJ5qM7+^2T&o%SQ;-yB}pyg`53SBJW4 z0Y86*)HK+qQLI2+( z878hWkk#wUE~)6(Z(HgQ$%7p;@dLl3LoiSXZm>Kl5OFOC@U(y7c+yAAPD0;MH7S6E z9?;=<((g}M2fwgAtYUkWqyCI0X?!3}ks;Bc@%;?&F%mySvRo6{Ib)V6{oehb1pOnd z(v0yS;VFZ{LZSi~k|l~p_&0phNYDL{mK{qOVLX<6TyTchulzBOSzuNpeM~+(|AEmD`7^&J?V|bZ%!a zm%DN+D-W!!q}nPu51b?U%;lra%_p)O#~Ci1Y@fuSq-mHFnZnHnmhb8q5I2I9|hRn?m9tT)|AS5P$f(rqWT|uQ_Nl?~+2r5f!l}$j_1_Wz0EEbB2xD*7d zsGuk=c$MO<1*{70A_^*kMas2Ww_00q7xm5wm$rUw|M;Hgd%io#nVEOyeShyVlPA9< zP!Z%Hrp1BcnJTf47={8^ZgM&lWX{bGo_gdJ#2!z}nfWUJ;ZCpEnCX+;UitY5M9Oh* z&TsJPSsf|{u|#ZSE!HD1j+kTO43@mXJt=Qc8vq2BnDlyxqM%Y=eiNC9X_*sn^kh2PFhRvp4%?4vTLPQ`HTgJj$4pKpTmbs4xtwoE!Z*T9(Pe&E1bybMrSMTBgbK> zcymfLyr|z!@hzt?yEnAKzN0VEC{SUSc~fD;R6l6EFq|?h zdWFoFeFeJHUctZS?gPrzo=OqB18-pg{mr~$&darF@Z*VG`10fdX5+RG@a8%P)L>Ch zjoe8BE1S2M?%Im-Q+Nwg26PD2{FY>0xayWY&u>`%b@P$NhG 
zDnflv$z_YE0{i#fD5Fn=5Zu2UnU|h`+kVX`eJCGAA8}=_+1aDj;m*J>F#yJL4z`$5 z%S4?DXM7u_u&#DCT4&9`*vm%Dt}}1Zo?&Mw>pg3zPdmNf8}%&Q;OB)ZzMfC_i(8qm zzw&1y4kYST?x`{KO9(|fGrI8z`6%w@cuUmZDx(zRm(aUY=HSd{Ug*-R9{q>8r}+78 z^Z5Tj4@AYrC%C>EUE!B+dy&572IOG4mZKALpO@6b<1guG;O!0`i5kz@;}13o_-fm+ zoI2ea{9TVZUOLrKKUJ_Fum9*SFJz69Nt9NCx%DD$29p5?-mb?Ve0q$Vugh@ip1hzl z4}5}J3nqy4Zd*a-B#_6DqPl8l(Im zUhEw!xPSYYPV>-Ha3pXOw;&>NMD=?XF#hHP#$fzfRKaz`#>T5DZvI2?-joeid3!jj z^bdKdeHwn?y~%>(qq2rD%Y-n5qNy``I=D|lfSIRsfuHJ9xNU2$K*Pi{`0&~`YX1~7 zUGr6?wBw9xRHcPC)!KhZ&*fl|-WG{F)9e||F?K3I9n)qpDI2|IGw+sCv2lQUSI!&a zF|-KdFOslLHwMpiZ{d66z4$llD|P;bE^@-n{cv9*1A>1%VG1tAQ3a}S?%AAb5Wats zj-EOU_jp;Nx-UI-ZD!=qyuJ>K*JZ@#-uQ~TTGNUp2U2*$tE<4|kqE*oTd1>LcAPY- zg>m;ar}tiu~z#HQf2@O_^!)y>LWE z1ikMwvGBrZH}2FJANmTvikteP3w3;xjZI3A8aiKI1s}KXH}f?gkE9z;Qs2(3qQX|( zK{JQbyeY54m^F>}P=8q}on$Zq;yiYugy{#U*yBKz{$LB|7nkEuhfd)?hpBn*hZ~`= zL_W0Z?m|lq3Muz}2d>mbp<>IrsGQkQXS4L-R0oQEZoFPp& z=Ock(J<1iu!=;KF)OwDoD9_Lcm85?`E!ln@olPs{j5`s=EC zM_o-|yv}TN_t!@_>7@WNr-&p=ip8kw*?s)>qkO)9kC6KHR4N+YyAhcND{ zpR>(y8j20lFy!m6MNfQM$6A~{3;UFKeqnOGUfJywWGb)S%x)jU;c$fV5Rp4Wz~w3aiO(w zT}@RhOYP=0705hx6mA$F*IpggykphIiO|b>=WB!o&F)2f4ehbhQv=eXa^k5wOsp}< zu~@y?_*3;Ar3T;^qPCP^iZu4m3EsIa1P_c1s`WK`o@)qHCDHlJQ3JH_OGXpW5BaQ& z`jO_BrFp{P%3|-fQJ$+z+;R#z<#n0URZ#P6!@M~9cZ#u8PhedLM4;U4Q%U5@k5Z*p5E@xL}|q!i>vTy{0z%CZ#tmn*N>Z}FI|`-E8KmZ2<0V`6=Sg#6NHs;vfub<3~p zMXz0ZomY@Zjivi;Oj1kEmp8~XzvSrt&{$|RQT@}MUq_!_GsuY7_51 zGH)q!jBVc=-zRU~LMbJ-GkOuJ>pz*T6RM zx&4jfZ_=ozX_P#FoQW6@5K8ZN$rnEi)(y6JwHN(+7BTA(r$b`5_zmMQg$^|BU}P zUjHg>S)mh1IcZUGF-Z|(n$T@?{0G)+l;0)5tlsaZBF18l z%9XXIrP>o>nLQ>R22aY|h~S&C_*%#RGg*-6$xLLb|Kpz&|0Op6|8kH2Yd1+!Y%kH1 z^CUlyH3X2ENQH`^py?ql0~brWm9}G>lR~LbC}xm{ZGH+xra~F3@Kh*b6&|tLH6>B- zXJ;M?g%3Ltd2l3>cACZ_ut19Gi^tj7O(K(N0p6_ak;(Sc*p5Ie6e_k;hOwe{Ut!P6 z36YKg4v`_V7e;$dP6!JPievYa0!e(IEFhg0&qE7Bf*@@GPdjDz-my;$4{=J4_KHC3fD0FP literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909 GIT binary patch literal 4341 zcmc&&X;f3!7T)I$$xQ%>l3)TT$Pkek1XMsuf-RpYYQp@b*jap*jBCfX$9ro8;Y&3?T@$CTd#X1XWGNr-##a~U*ak8G8Yg6 z2f+*v0VRMj09whpi^mVWOwzBOKJL|&T>m{MUw@Ije0LtNw8Ca?-;#A#&j>)k7YL1L zd!B$LU};!`J~g5fQX|#?z}3a*mO>cb3*ru>As|Fnz~YI7oB&y5;F1toY={6;rBqvx z>SZJj5mSE`N@PUm2m%b~hDUb*@|>rB$eBRELBQg$9YBfJGQu0~7?+Q-6;dM_A%%Xk zQKb?5d6icFh5(Zz=p;TM&{qnKS{5G>6dAW9P!=as3Z)03flHP}g~vxk$}#mL)$ftG z6&F9pH2#Y%{}Sl~bwR(nhu{rMen8KaUOi(uWvql1RvV$JO9EX@%fYC`5Wc&m2(Py2 z!IQR0u*`su4jqnyoW*?5+t~(D-|7LFs=$j;807vK3VtWv!Oi#OzzVj3!#*M?ea?e5 zyCi5`j3Tx|WCP8|%;4f4XV8u?h8J_*qpzw>AYVg>CN=5;@zfNkD0gs3C7`2P7dR8O zK!xuB?JIb&)z2EjvK?Vo*9usaqzRk?6}UaS3J#1)gz=^|=vtKq(rKyyezptfG+l$} zI!lQBNfjDLErRXKH6Um3B+M)ff~7~b;krKy_H`Xc54i|xuPQ*#XccH_o&ZGg3^1#G z32Cl6FuS@0L>1<+AJd0o^;$?7s)k1}9Xg!_*uzb>uw@P(M4vlDTK{)wMf_+uM(9FB z8VB-?NN7880*q3e!1FB&>T)N6IF|)S3uSQWzA{|6LBfr_X2`9)1axDsz`z|jI4C0YE|%1O9op94L6=LOB8#!n z*xsl>Sou>r#?iMYD~j5o`s`bz4IY?Hs~IYHRYG%H8lg!XgU+RPVC(B~*pjpm`_cL( zCOqnl?`u|s+JkLyKeGUaa%1obrt7e$%hwU1`Rc6x^EJpk^bD~#jm0AM{jrJmHRQ4j ze&n8(XHYlG6H$G3n4%?y#|s+3S?3L0vgqYLelmsR)1S+!$!FkAg)1oe&V=ys#@MsP z^^kS>41Q~#J!Cd7$6hv&=*FxIkYBR^)oH$jXysF=Ds2XgpWzOTUyZ}{Hl6_GnFH|c zl4`*1%%P$R~MOXW9`!f^R zI%x@5<=9L3T7PTOsB#+~Zsv<$gjldCS_bQ$xR7H~SAo%&jjYkP5qkMA96(kEW}i&Q z)mABjuSyYjP;j06;>0Xcc08O{T3N++5A#8X6YgN+#oFv8qQ%_7>mpn-Ihm-4^2ZWi zJEI!|_ePgabips#x(LV_i!`^Zx3S&UDl6_9dVogj7a?Yr 
z3g;TI6@D+!1Gg!Z?%h5V!ZqSvt+8e=*tVV*q3|VM`^XTzIUPZWzPbaR)6-!7lb!gL z@9I&z>@@_306J@+gqk~ium?FyM>&q!f|We7z_`C$gP>opqc;!32&`<4=$`#GVwC>YoCXzuo&#vR$ElRgQsNIUPq4p)CAf(60xPNJ=84Mc&_-H zyU;s6VPf>WS}ZQ3ioJY?COLDtGbT%r5he|V{Ch(eSe$paIkz}v?C4*+QRk^tOk?L+ zzV*$`u(IQrR(P)}CM-IS-;c{DeX}2<`Qr%6{+%zmt+5{s7Df{hD&t|lZ3Y=O=QtkR z3iv?EIXKhjjP~v>R6aR2hN`Ud$9=;&(4kz8QdDwrhiZgze-M-E7i+LN?Uj7+KF76s z#b+(6@jwle-l3e9EGm2c5iH?c80$W%h6QkS@M~#t&{*>( z6cJi{y2IapjgUPFuZEb_R5rm`2^b}G*q^!{r^e{}oZY{EBb%q~Qs_{!Ax%MO5z=+G zWMhW1mNZi+DczK%qVILYr@1sOXRKMk#X`w}^v&v1V;*-kAIR9M>9BzkNe^b`>A2+U z`?eg+`fQ?SsY{WxEW5zarzWJkr7UNs>C%==k>jDJ@wyOszi-Q;QblrE^ zWm%M6E-+eHpUjzj=xUK=SWjctS=q;osD!C`_rob1Lp>PgBm6&+j!Q54?S+ zF!k7j=1mIf_PJA!KWxiYw_U3^t)}+>SRy<$HjW(>9>mZ& zn!Ba4RR?ud-{^|J))u$Q+3g>%*^m@`7;@% zt>q%5Mr;@|`sY>J=p!=v+~n`bsPYGD8XgoG95+m{X*$}E#P^KF>Bcl2{d4%g+4Qf_ zigK+$$ckMS5)>g55Hu6l!~Q_%^)x(=syNWUBftt1eq3?^YuNZ>w#?Z5ei#Bw!1AzT zya}QFU>MSm(L0PBke(VHUyF$^-}pblf+{7_5PAHMe{G2W8S4LkxxV6m>L#h?8>muZ zitgxdZIOmhQR3w_$J=K3VhQ&Xri@rgq!Ni_F8yZ2T_Q=8NP{I#5=pScF<8DPr442= zBS(qEmFemENKen?!&oK)6G$>=wV|o$RC+LTwhJ?NOk~=bOhbp3NIaNE>dQ>!dWjh` z4-2rEZ64sgAU@D(T9~hoR|unz@TBuwwSsQs@f_tr=p<+vjwc^7x{v&0{k*Lr1D%5< zGR9xNJq?Ez-;uf3r=K!&H>MeeE>C?ppD=n&8YlC%it_cD9p&dEhfeP|cNm}B{EzdY zi$%*H@?lCTjSh5P5XUGat{#lN+$eEm|GBR6deg?8*{ye^0C8{rgB z?<2>__|x%eSLSGjEkURiEsI$h9~5W2EIQg)784VdsFe^D6B`;8sby|$Vs2t4^3VYI N@PbAHz?1&NkVV0zh$QTbia~{75!r;u7Fo3_1r&`tAd0wy z8>neokpnB z0fkXQ^6y;nEvOs;7XfO~q&5JQIZytOGmZlm1Z)=52@Ip8G-;?d+T^2bdE}6UN8-Qh z=uj2>^_CL;UI9{8(1U-dz?2gnAC?>&5tkGk9+DKokx>U(cyL&JOmb|TvZev3a}Nre zG2v5W@Lz=dWTn}<+?~2U+&hGFKD9CRCy_0kB``%5^+^Q4X(cbuQ@TTS<#xivzN(;Kc(fYU082F&6&DOBQlLKcr)`4$? z39u=>8)ejB;Hs51+!AOH9@`HhUb#EAuR$HKwv7PQ2V208if!Qi#sqA%)tAU&@)Sar ztIrr{t%5ePv-sXD27@pSLMDz?L|9uOvAgjJ@T%j1eQ&*x`b2~!b8CV7nAf1)_Kw&6(US!=U|q*q>^nb4plDcx25Sj;c}5$^t(pf<8V-VL z?G#jzYsmy= zSM-Qo2j-)_)i$u!qy{T#e-A%De;;!^JDNErD+QJ5wqt2Q_Jl=QHWp(o#m<35Fts2I zeA(|oXr(U$7N_eN!@q}c@D~g$?{x>()2Cs2%hW;PsYUoZZYOc-#0(;&GDfqgtU}v6 zIshI?xsFB&jhTC8rtn=S9}`VjftSVyq10FI@ba%WhCj({W3B00$T|iebMo|Cm|kfb z>O0@xgu_h>ppAA8L(Tso8GD0dt>~#GOZ`T}x^Jz~{M~F;!gw=AwMGTHc0+(xx@ZrV zvyY?g3;Id_Y%vkTD*^(GrOXr}1ysJ@j^;0KV6M?17}YP|;Hj0%V5edtNAtcpNSz{K zEV<fpfwSlv~ zyP1lR&{4&onScp*9up%b4fq7OL!zbx;Qa+f*@*SAMZZ`gM z7_mwn48w4&y{w1%(jO2D#Z$qf>Ud^X_IIFmQak!4rwcpoXrv+7l#koazKZ<>FEIiI z-l)jL0~?GXKt!{?3%E#0Xoes(aN;hC%^ zy!HdQGb+VI?pKYH)>beVZ8s!b7rCR5lo0&rYOealG1HhX5uW&WtV2xsn?Bg{bviQ0 zuh+J}vI#8dK5pbCm;iYNt=O44hcM~-2XM|voOJw`l*q0dfbVw5@z@cgK$Ps^(qGS@g-oXG&P@4z)pFKyhZ`mN#k)yLBj|=-=Jw)#p ztYOW#p^iOklf#iWcR@oxG3pBv;$LXbU>wkPg^`jp>Jyz?MRK!A3vN2|kFQglZaojD zhfh%fZlq0Z*l(90bR@uwVUXg|nK)|+KqSv;e|iQ+YMBPq@6TAp)HL?k;Z&HJ#pc;Y z_0|`zTB~6sR`5hctJjS%l@tdw6lHDDwGKVELv&!xCjH3?4|*F8tleVhlu7c%2NgME zJaSE?jR)6lHSsO-C=ef7zinJVRa9T&p$+*~3mX-D=ffKdCPZ}?OPdaF+BHdj>s*2J zkLE%Pg?DIOl+Swz20*S74uIg^j~c$%Rd%epUKj9E8kHWb}rX!ertKb ziO5Sup1a&CcAkvxt&&}9t=M%cUS8kMa#uws($Y^)PjAi9pK~;ObmDz2GjSUgJ86K5 zUA;M|*8Arh4cCUJsjqi0&Eup!J>|8Rig~YV*W6>xY2H&Fcq-xXz{_#H$3xq4Eu(+2xLUpV`>NQ7zgS*BE$kHLQ6`mj6`M!{Y77t!T0niA5sOZ0esTZ;>cf zB#sohiA0ei=Sbz9m{OQQpPWS^PkN{Lp`GHC$3)r#?MO81i*Z&~lc|I2Ob?oOPNi2@ zdZAp4L_YK)meQL%d|}T$=Zy7gg!X)K&RJQ+MK|5P!RPX~4|* zzyM|GRJ_@$`n=|RoDbD3ivE-j-Ab`M+vZJ(c~YnoZGA>qR0L z`sf}*_dp)#Hf3?B?NX{I6KN%?$*FYzsT5U1q3*;=ODc1!I!Z1TiBnZ`QYK4VQ92@J zCsjtuB31Xw(xtI($y6L=owPmWpE9LKqtXOCBY8-|lH`aa%P_gzG9)1(KGi5CA|X*0 cA7^A^Ki+1%HQ&bofDeD5j{5cLOZ|NRH{sIBfdBvi literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f892d384dcee4c386d866900d99cf672068e5bc4 GIT binary patch literal 4860 zcmc&&c|4VA`@imIIU9!~=ja@SlC6}bsE{1HNTRZ}SVPwAGZk8tN{VSAZA95aOl7Gk zN{faz+7~9YP1{Tp>3yDK>YaWw^T+S=c|Y%aInTXb_w~KM*ZrK2`+Ho(t_A`^;3Eid z5=aYR6o7t1MnIyG^~8rC6&vGskz)kU);Sbfn--)Ve;z&LH#i0zHP_R@#J^rV`m*)S z^ZY9d*9btMAW+m5@TfnAfT3&*+N>`zgsd-K03fA}Q6&%Ic-mBB&38yyK!}Wh!4nDD ze!dX_OM`u5f(4iYPoW1XJV%;oNR#zX!9=>$u0enSRq&_^fa<3pfrk^-rD<3Y2-pa4 zHj@C7%K`@VNK+4&%QE9p?zG8`RCJ7Q2nbnUW`r#ChmQh9>aU~J_+AvXq5Lr(rx0&-8#5=?E4n1awx62ey_P$n!0rV>u7Dc^O0KW-FM} zwHy{DssekbJlt|FheI+6prTieu9Pby^~Mt5Z?*^Z#w!q2I~5{cDnNtGBG|D^8MX}l z2sYV);8QjZ+7~n6Kvxxd%t26dSqg5;$wPC~SRnRUgMQ6hNOe#L=ZZWKl^DQ5OdIwo z)xhe{70?Sa;ijDcd%WHhGUoC@bkq(~2cDqiadJ>ejEC@4He~6N&~oe;=&rT}myZmn zP1gaL#_sfC@!x4}e>kiqehv0L3 zE9|x_f&;V7;qe{}4JJQ@iD_eCT)83q@_H>CzNH1@BSxWj2Y*G10lg?z8$ncrI@oS= zhUkj_`R+GXKS02 zB{>~Xapof$2TqvDIek>@z(w<#8=z4$8l6q)#Mac|kdf$(b-LNW3*)z#u*?oW&?F7U z$J|j(K?^)c+XK~xZu!qyvKaJ!A7pAW|4IfGzfa_@kVJefbS4-+()5X*H z4L5U0Yg&fAttZj7Ip@L5OoOOZeG4`Gc&t3t8dR(uq2cNnoS!j>G}avhS(`z4w6p?n zGkrW#U;(VSG~%Ss4#vmK82GtmE$nHnB6AOUWBY3iP`y?iKIO!(4RFESNu*I z+=e!xr;cY)>?k%0IBiCrj9!3|+KVwQ^J>!nycd~Oy%6bKS0szuKBLmnZMbgfcD&xv z4z296gIzIc_~^ymc%tkB6drO9Tk&!O_H0=)Q9H*4v7VY?QL-n|=ZD43(mxm<0`T)2G=k6rJE)$~q8A5Mi6qN{hoWo9bO@86AIdQyive1C(WU_fUk za8XmI2ljA_kBrr*3=I1r9OJyd0)fA`qYsZm39N9X=)U+J>7>v18lsO@Rrir3|Oo zMXac|No<#a8B#rU`5bbcG|1ot-d1{(`Q9B!A4?N>UK7de*nR`*r?p{?nH_kgxtgrT z*4>2RyxX{^d=6)+XbGpy)LuYZFH)VIBrDz9#3Q};O0hG~o#Vxm@sLpU9g2xj;HbKc zWra(f!1H2qj~2Z^r*yZMY+{ru^Y>o%XX=fHp}cV-*{hQa(y=c{^#=y_NQCy8|*@6 z)bHKs=E)RHdG{H<@%62+qO(#htVaP8=CtAuVzWrk&3$P87=mT~$&=jPFo1@#qlj>M z6*y?Rkqn(%g$JDjd~o$yIQ`TP?a$4Y{c&_OtF(48?it30PT69#T0R}Os6ZI!xhAP} zp&FapQOXDRvmB#$e1?Cu6ROwwgtjzqVr`yZgvFl?Wjr7iF+YwvekC;)Ztg2V9U*>D z_{l~3_RkZr2gc{nHPKq^;#!f^W<@iSb1;pmmbF->xn~)Y;@6?I+z_~y_ZZ*BQ01ob z717SvAMjNL_t3?d-HeH~u23Z4vGb&BqzZa-fK~QcuGY1PHHEVf^}l|DIZbSa1aq#! zsvQE<{o6Ba;QRp0xi5`hYOW$=_P+*IH%qL0u_p07e-7gi-v$M_CXv)NPu35+A zjkC|T$XlB#B{U4~I+M3Tr6_l;F-H z&$EZO=1z-xd?Cl`NX9z3DD#lr_Hhn-tc$Y> z9kUHXZl5du;mF*BmYWo9N_G^x6}c9BwUq2Eb+7bm+hbF@tK9oUbpP#^(%nawo>|LQ zvMtN5^1qO!s-amIv0wV6HaT35I3KP0e(k~lm z0>8g|@#NOk2eVQTDKp;Vr-$P8Y{BG!uAd%RrO9bH7kG3$wn-NmMf7xaJh9t8)+S@T zXJ?PY4xI%R0o|Rwj@kYOf$asJ*M6R}cShXHp6+W;=N=TN%4m39@0(ZPuyb?H6ia`JRy@|IXju(R2wGvF_4=qUANl1 z110kxQJ@AdQ8IB+gPY81FT8`2$t1XCND+A@YRx)J0^OcN}qA;=PP?nR}M>?)bw=}IOdvfHK`rA zQRZEr#5Mi#d3R0dol{#)kH5UvuySyiYj)z7N9VRE&B`!4`KI@Bu4%H&%({2|*Gs)m zXUwes^~K%#m7itI8$P`5xs*F=n|b5M_ff;|u{ZBA&6+970&GNI*nfK!XNQMG&{`uf zh6oFZiDd?c1=81a>Q0^_qp(X~p-5LVTT?TgNsEs^fBpXU;_V}F{wkeV_0m8hA`l}f z)zQUe5&nqHGxPgHL-f&KV9Nd~H>hP26SBVe&~oFiqm}_W%3r3 z3h8btCjZY~C50Me1)e5LVXq+{K%^{`7Q4F6bvGG#vxEl;J=%=KmSVAZ z9`(@XC>AG(ErZ0iVsVhzDo8SBNjc1+TUKJR16@=7FE!PZG-K!pbRhBE?)l8_u3mr(Z%w~eE4Bs}T$RD5bFeKaGUAXJO;jb0HK7(2;7 pDr%B%baZ5bT6|!1Oh{yent}0TgUR|LCuM-IA7~^1T&Ul@{sjY`8d(4U literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a786300b7ea789c918d25ecc282aa7737b43bf2 GIT binary patch literal 3554 zcmc&%d010d7C-mC_mYcLo_o*sJNJA! 
zC*gdA8%J|~+!!s#a?l6BwjgJ6)wJs~Pv+Az>f*J1r#!^yP;%e0z;n$_KNtRC5M zz*%w%7f!~Rb0SV;;|We|uZ)82l>z|iU`%Ei!qgf{)sAd9TIm5IbEQHOr%O+mk*v#3 z=CE8QZ$|PnNR_N&-=s?ELVN@c1DVXp0zl|kE=Ws!5PVkC6i5)z5-|;oDGMg1tdERv z3B$s;2%>VX-|t>4*xnljOZ~^eU(K&uBMs zc5S7W2fqz&9oHd^K8qVFj-khn=dgRmZSkD8RyeV+3!T}12f15i;w>2o@VIs-C2{g) z_OEY)Lyb?6J!rA_=W4Vs*c44@u7?Iy7HV10ju+NaP&Q{OZl5?7?s`7JijM-RtxYoU zJQ|5=cAkLmN-E%aX%=PYT8MqTMlmU?tVOp^RwK8R*JyDRuI!?!CY`TRxh&n2B8N(lVsEF0`9398#j-;Ab9XMzi zIy(yO$?L)cRQBR6DNd-TRY~!~3+VkB({cWzK=k#~D;D<_osukRiD=FxxWA{(AL-l9XcVhCP12S= zr{H+_CUJ=_ZouAm9AWsiyG-BVTsE zK~!tcF>~*a%gndR{g{@31d+8z3F;U-naS7~q?~lCl1fSjbah$mTmF5@FkYR8hjw8I z$O)3YGdxgot+q-^e(oA&+%~Zb*C_q{cbXR5SI`188QVm*1IR)>`#VB28MwcU5RFn= zQV5t;_-$LTkYf8dMKx|)SR^*L4_fE9eNnNA!acdOaeL8+X11CV1z)vzX6DuzT= zCtqydwXD)Lwz)*_>Wb#{ zHVYkBY_Xn~byK4UxKG&T6P>1dbvMy>b}P{b{rc{;(3=OWB&wXW&Jg4g4*l@mfMV8WEtE(IHba5dh?p#uT^22edc2>%DEaE~tTw*>>z1&<8`pvNS8VY{Fa&4zN?nG&S|FznS?lZsKG`dJW?$U@^7e1Qa zvUc3Jp6QqC=TlZ;U0zw&&u#Xv47==|ceksd$ZGr*pAWnVq=?bPXy`t8NSS#3t z^fu1l5n~VU7Fp=&?<8Ry0Bq| z@GKwyoR3u-2HJ4gSYv};E=-cNlysrqCT7#ADcL#V#MDHgXpsulD3xF8FaKPjsfBHljOB!$dB`S(Jiov7#IYdq~hw3kW>KizpW$%U+NPSY^NdZ02FL5uTXJcJtazh z>m9J6cQ|AJrh;qsqWq>NrYGeX`-$5~{sU4Svr}EOB_;P&^=~lxI;_f44^W7*+z|+Tzdr?sIg!>!kfs%eXAwFaOf2+1 zk3&DA-&W9XF#MljK`v9;D7F9N*6L4cZ~iaItNODtNxo{3T&7~>KlZl-P}(SDd_=_f zNN;0eDYntB!tmfVJkL)cUtxsud_J#9;sba-iT6)3oM{NdIAQ0{^TEPO=-!o3hHbW> zK~Thx|6qu#s~6df4GR)z|9oK{D@;T+&ufKA6D^R2dtNQjv*X;u+~Ojq1P&R|y>!In$pRfB zOh)Ji*Ny(nCTE%)U8F}wbW~VIOq2mSu{XhpFJ#io{*Yji@dtl|P--$00;l8%7ksc* z;5RVk_u8KjY=}1rHW|I4=lLB^I|iJBJkd`?g{Lucg2Faam-T*b sV$RU`%*>&>tgMWD+q}fA?39djTQ|>PZo||{tqs7757g5D;pE@luYn)u8UO$Q literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bc20a7699b64f53ab5cc24074d8f61c1e997b00d GIT binary patch literal 4872 zcmc&&cT`l@*5Bur=?Dsg3?QJ$h|-%zMZ*Bnn`ori06{>ibh}`HL9r20P(g|)7Kl=! zQlyCm5Cd4iC>SiL#2Cdq6;R*3Gr2Ozgg&OlOp5Jrz)9m1N)uYmgYUFOYN`5U7^|RV|ul$X>sWNr$?SF zyRp0UY-uka_=pIt zvHg6*{Z|J0MhEdRMV{g#r1%RGijk1@PtF9oRMf!7fNJEZ4nUHH-RCR_1Z)HhHj@Au z85WF_VVTQPTC~VXJKFS%xRqFAJ4!6{hmInZ%U@Ti%|CL%q`9P%pL5Zc@sC&)8x{~A zv(n!;##bho8U*{VTon--8x}6fE*(>RhlKBN;XYjTe}Vb8Dc7s>ThyKSzcD3e_dxbd zAJZmvEvzxy1|{7h=&oH2y7@Zr@D>l=?9zf~&C_Ak3~ zcr^}#^cTV4UH$=Xemnt;z&UW-Qve0O$-#z$BD5unhpiFJfrc}NaP_b)ObOG2SFRt? 
zjZ%HcP*pm{m4v-Tt?Y_|i=+UpQmX$s-LDniY;Ww39xD(o2j0TyHhfY0eE z(7v1jN4n3Vehz|)HZI&9FAt3klYz*w2E&RWNVeAmm(qL?oHT+wOdE1kDqz#6Qg{sW z;I=Iv>;HBx>~d8A!S}Y1{QMzW6FVNx5K|#6nGG4bBs7iElZ7i?By{B(B8L>6Zb0AHyLb#sjjF3*l4TWjJ743`ZR1 zL;pbxjcgeJ9q}ZXQeq6hy-I>(cc#JA@CoQm-oKEt|6>%RjUX~y6Kr<7z@pJKv_D#m zr=R!;Z%a+lMZGL8YlO*zS9^&8$0n2@+lO}APY0vhk0Fd` z#cDIpqnHV76c&6ROXz5a=EQrbC4UeZPmIKJBm80QuPGQ?dp>zGy8}w=htU*p!sc8u zL?`TJ(89(Vs1-(`rmdY=VigX%5|&~wW)5LWr)}{g4Lqng+6;Z-O!$-@g-!JqN@q938ZgIrrWhW}uLfJqcW}-4 z5$EZ%S)>BUz*igV;N3}kknvgop=x^A%N13S)>4PxS~MTT4Xd%CY7%uhUxAFWC8$z; z2qI-Kpps;3P_uS~nj4dFt!?EXyI=%`>uSEVsf4RZpwJPh9(imbDGw^Xn!vHIw78QxmV@o0rcdbx-ZVLk+#~s}K!yvRA>D zXLjU-t?NMdd<|p#9fXGdg#-8&f#KKlaOHJ8@RHBwjPl#bbLGyY@7Ykff>R|5jv=1t zc-%cqO*n5K~566~j5E$oP*1sTab zgYCU^NzP-BB^jzz09v{U%s4U*&VJgDE#1(-OpzrSDrRC7e|8;e7n{n+4a|V}+3t+B z{Wi!mHxsS=_7LpqQHSVgMUJ}LWR}6+-5hRHHMYk46?)(>i#7HB01D#ha@xvfGMDU0 zlndjY$15J|pm!I;2*Hhe;5IKA7C$?Hw?3>w9lmcNFbGin3>nnW>4`nq;WN%^!Y(ZT zu`$N^{W=7^Z%6O?LkO&BgP?c*bz+a3CdoaZiu;ZFV<^R)75H=p3=TAK^zLC$=XRJW z_VrUMo;wXp9=#ytP4=RErZJ|XvKwcnyac<$TVY1>DTY(bGFIf!X13e&+1y7}1srl4 z560mHe(lt4=IWOP!ifl&&zSY|D6s^RZ)Q!aAvULSmt0F?G3;OQjK%x< zGSOt?&+xaNi1pfPquUF3SB`OZu$g-;!VR{P_3D(eU9Q!T|WA!Mrnf>LsO^hAy zPDwkacs7DHskjJ!sfb{zzLo*4od*c-f;POR$AA%yw~-bOTEyua1a3IAPq`<59y={E z1vTSU%o(n~qD_N$A!nZrK1ux}Fv4Fl6Q^8;l8`9u% z_n_MswqmLW>J?_*+zD$t&uBPknINU?%lO5`MYz}Yr)cpcf;Inx7rCeAIU3E1B*Ns? zAjs-C3UNJ)2VMgFz(akwzCH)@H0tL4FfoetwYM_z3S~p5>&Y>2JLe?o2=)WB z8&1UCCo{0VnU_$PAPH+p5^%RGo0FV8F;gRB`MAbMs}WbB3MI(|Lsx!3zMr8kldPbO zGGo5OHy*u@TA~jybSmA!!sj6)pI5;>`ZybOPt4-ho2!BeX9;@t>NOTN%!T;*GKw4b z@lns)LG06t6t;6O4}Wzf5>0w=2&pf!#CnzsiS-K3j6#J4DA0W~IWn6Z?`z3q`&_!q z(yz`^=>2qs!TxZE9r{QS>x;=C^X8_KBb%C_Zon4h9?FtERk<8@?Kq_X9!(sxHwuiR z4{p3-z8sr%`qTJI_hOclP9r?O@*G+l&l2NuU(wf=dLBQQr^~PB{O86+&_}C8&_9Tj zAl5tRX zeg3voSq)3ElBnR@GSyyP+_3x4K0VOJI3ayjRmM zL))wIXxewv+zRZnEsM5i>Ufp~^)wdkIAGw@C>B^9OV2h5>MZtZI=1uBtjPYW*;dDQ z<;+cZ{nYDBs;m0fTY#KkDt4P}l`5y@ULjtnX}eovL568sMw-KZ{S)>3?7M_6Ip&)% zzQavz_adhp%S?w`GnSMrt4-oN-q!W5487l&>3Bzfh1>e~!=!H38Q)9W1#?q$v&#b7 z4$j`d&^uTWeDkDNb&B4hAHwccts7zJA3neKQA_sRRQ;T)*kA5m*uXT%ty%Zxc~5n! z!I6ubMu*>zFb(tS6A7-ex!5qjF`1__XCup~;8L1`j(3gNsPOVm)w%0OS;j}N?w#s* z&^*n!s3lW(Y1Kxy$+5O<(~!G0X(q?pbIdmkkFrgRf6TYup*(N9>50xFr$g(bIW)R! 
zs|0;c@xmgPO}1vaH7A!0E^ppWqkB~jkDSX4xs$kpyLsLp=t(l%F?8*?(=>X<2Pt~1 z1l^&0485E}x87k=(w?(s!|*b$1ignsuPFKHT;#)+9LIB|_p1|zp5Jdh_kCa8R-CKm zbpFiGO=5YS6>aCwKD@F=(=pfSLfPZiER&GlwhQG0H}b4@s5w_vJnPaaUUX{3^{UFj zJ7-q4<~mpZ@ZzWQ(SyC$tIxgctKY^|cd0r5>fzPzbXK<4To@YovC#2|OKsKfgWcsJ zeeJc?Z(q{z+>536fBOPn_u?3Qg8BQ`f55+Q9gxBYzR9yX?ooHqbfim)UOV#kPv}j3 ze??cTyAb_;sL=7aD}_GX{i08bE^ISVJUW4F{pjB%;I`)1f)YUsoxmFFDA*S^vaAWIr>K(17s37%9i1Sja~t+&WXZ-;81* zfA#+lsDGEYAl(d<7}2YO0>XUx1jWR(us<<6Emfz}@)i`c1T#$O%i{AHQspm660P^2 zW8q_bhLbsMO(;o@h$ej)g)nwhTFR7v4cfm1_*ZVLZ9l<)uLNf!RM z9wo&m#)>>4OR>UO0YIRt#1px@yL!x#K3Ph6gaIvPB1@4-w2=DJ;wTcui!1|0HX>1= z$SP2>W=SbH(<3X9$e!-0`B+cQB*SRh1MNuUx?ab?;45mdz{QT9TgB7u0=l7Gi$qRz zW9datCHo>ndLH6u>|*5Su_V^tW_E~|r+W~+ALd5Iw{8v9Nc>qzoKQhfBK0R3(tBUn zM|*pih5OqEihOB%2|Se!C0=85_iw(T=MHouO&Q<6G` zw@{j&!{RUFp^8PxpW>lQX&LEnyCjC*5ZOD?`Vyu1G5m%0l6q6crev)2B9S#cv<;Lt5&&-0e_;OxMuIRh literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..151008dc482639c3df72873480ca796f9ef31d42 GIT binary patch literal 3553 zcmc&%c~leE8o&3>OlA^-1kJ<*QK*)P>`GBVOPQd68WnJX@Ser62*@e~sE9eBfN>W$ zK-7l_O5LgyZ3Ul2#HB7M+M-xXU1(LbuUhM~ty+6$Qmj1nk9W>Huiqhe`+nbk&wQ6) zE_5hMvVQCsEz7W=Hvsmzt*PGE9rjKoHH$vr_uF{cbxzIf$k#s3By=9 zt8~G587pE%w!?uV(~VU!-6#Nn ziYBNy1=aCt=1r%(ID%JCG4qcIkPtx|`62>G%b3*Y%;eY^pL{%#((g->Rqos^pO6?1 zw`W~N1JncQ%?Xb1X^V>BJafq%sWVXa<3M=vNr!di;2@%?b*A_u_`T)Ye#hvRr=Q9; zJii959n0WQd9g?mc9%)+V8pp?4a|-yZ^1*SJ&@1-6|!`~qz(yxac;rGjy0&y?FaO>v z5L%OrB#+L5*k4-U&o>fDR6S4C;d_=`6+DQtD6u7?p2i?p6w1Wib^>>A9g(Bu04#ii0-h0jI*9BCX-?Oxi=EIJnE(qO-nI zLM@bnUIa;ezO#*f6b7i78Xs`4E|qRw^(AN+@j3cuaVxRc*WSuuc?mfzeo)<+p zOh6BlC}8o?Bg%9!o-k=6=u`PMK>FTsGGox>fxrXGE%)>THy}#+fzBUz9PCa^M|)R}g7xcVmZMwuQ`Xbmz_AP2C^aM!P8^ph zQGfL{=p2wWIBiNDGA!Cn&suLsjhz*U^jUgxPXQ}EH|Sk@Ozb%FOK~-w_Def#JD7)T zOB&_FzgQ0DT-a+r-oXIc)CHfGrjf~g27q|~?J#lj zJ|eCe5G6O=z}cKlD17K;>4CoK%zF_wP?sbI7py*n^ZP6#{PsX3xu>T3ovcOkRd;%u zT)z{y*2bdfS}klC@(eC*TFfk&T7$BVCW^kHY*3VB5OKE90NOU~fUOBp!0Viryz-qB z`et}Dyr^1$PAyPbEU_6)Nh(aV{puOLnmT7e3wb?UV3`0eZn;6M7ui`B%5C5}!x|#D z>T7r^qeSGqKNR@QyeZlut+S~5whXv@IKrZFv@P(IOow+K{D_i&wE)?^mWtf9ENp*# z7d?N!NIbDaN<27`2K!$thjvpmsC|Z-%#lwNZIzFOaiK=)i8p1f*U;jb%~u$=hEjRQ z^AjTRvrFQnP6hhLu$rnEwTOB$|0wwERv_G5UTU>-{|sVs>rOccJ1X(|NiM2>7A&n{ zWnyO6bL;(~HH_A|33Q$40;ikyk-auQ;J<>k44X&0;NPF0ZIUmolH@NwrZUoAjwj-` z^AP${;Qms)$)ic}9Do?5UqxO%!Sr?vZ>-2)NXrHWmHKU2P-vkX7Js>M%fdxg_8OCt zt1MXD$1$`f{77Zt(!L&1Cri1lMa%n*Ouv2k$ks(G?fe!nDovHCcu>%4M_p6Z;x&VV zD}%~3)l1enhu6lpH&rh!ai7^_Qu%LNR^}Ohp+ zYbbIpKahO2e(n>I+s04lbe<|3wa9H#edfI@2j|i5n-9%}M)?@`;>w(5Zbm@HxLX@# zT>}p1R*8+$!_%HbP(~LQW8LgW42h(GXA&L|QVVf8g&BrrYM}HH{Zz1yVBI7{2kodWu0k8t=)0 zZ)&_&FWPWYY)YKL{G2ei-UDx`Z`kA3`>Ogk7=2x=s%2h4DaweBk4@IIBz|=`pqCym z4z}7|ef)4s23|%IJDQg!l|otg2M)p(y36BWgZ10<`US)P z87xpRDqEHIfBdpi|25B?|I6i7|5Z0hvD!|-s2N39fBc12*(#-6Xz1iHAM?dhR*>#I zd2t$!<3jL{rwJUF&1vGe0FH~}{Nsc(4F*i)SN*buoZa0= z;>FlWLHyo7n;*yW16Iv(T7J;z_^rU_JotTL)UZiIqr#?V#sqjL>cT_gd46&*_HXWN zJP7*y1tr)COlEz;lIQi1&xi=~N{I=K=8qN!Ps(oi>f%kLL$-=z{BJe-iPT z#-I=LO4Wr=N{t8?LdW((%<+w%+S4DLEKD!?!>3Y{785w#z%#gUS{^Tevb*hvj1%&W zlZ~l6^&B^bUj`=eIY^5L@SYtt>;s&WcX$v^ayFlT?4r3S_)H^6V$X4On6KkB+2%S4 z&hkfF>8aWF hS+VIE38^XeLx;N!b@NbZZ2@@k2M&<{1ml11eg_+(_4xn* literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b485d3882928aa5ca25ac11f280ec415a3432930 GIT binary patch literal 4311 zcmc&&d0fp|8-LDkxl8M9`Ca#_AuYEeS}5z7?k(*|p$+ZYyJ?Scg^40jBgztGX`zU+ zm8A%cZDflv%otu`3`UmJ@SfjIhS$s=@8|P=-Z}L<=RD^;=lMS0=ltq^%TwxQ%}`7b z6XwCFF`x}#)p&aDzJ-&YT<8v{NM4#^?y$LRI6rGhcKO+pf#1nW0_PWQbvXU@*XEzE zTr@P?r}08}IRlK4>2JXZ@E3>S=-YuAuP=s(*XIQQs%8Z41&AcGC3ccdq|Z=dTi^)9 zB7R74d{|&)a8e{g2nE8sNcby~WFrah?}CXfaMr*OfI9&m0I;2j7+();vNY@|1bhS} zpGyIy&I1}h9Q>%N^Tw%TceZKu4s4812#9!nJ0c!>?<2&dK3~O~?@1Au)X6uJ68Zxv zGxaceXi8jod~#q|aB{GE6rMzd1%}FFQ{v*4G;0XqGD5P9lB_26{|n5&!Jvq;X7Tv3D0JQL_>F^AB>x@dd39G1`01(UTc zAph9}z)}aEX%LY6I12pt{sEWYRsttt43ztdq3o#u=B<;W1&L}zw0I0O?Y4r`8=S!~ zZYVsP{1#oP83qOVB4plZ4Adh_;K|*=K9hpB8e`y_8i0E|pjzx5`p5PC5hK^PIrwfhAlHDS+E=;~_oS2a2+`z`N9w zu-3T(Ho1+5JL?G4z2H6!$^HTit8L)vvqiA|nkg8^YopH1FH!%ndnnlqK|;I{IIWxn zi8WJ+y2x58xAHB#su_Wf3@uXS8M?#vLEEXI{+qyfrU~V_r4t=;Jp=8d@~Nu8Hr2tF z=gAeW=3ssG9>h^+h{J^kP?zah!bbj(n||>;v@E=dzAJr%Z2Bb-CGs%nI<%GKn~kTd ziZ4RVu{X#NJcuzTtWc$^I-1hl2!|z!=y+xuvG5QHdFcT}oA(5GZ1ch;dm>)QYlK%Q;8jLi{8-u z1ayZYdea*>LTv;=rZ5fQZ1fshZSJZ(=om@sf;x=)`Y61vas_q22@pGADDh<0A;|ge zD0#(uJY+Y`CZ0FY=+eYfFm9X)RX^}K)aj-Y)me@(z|kEVFML7j=5-^J#d|?>LO1*z zSOerZD^kwb11}|;IvBK?^CmwD{?)Pw)}5@S*KY|RO6sgpgXtl1c=;dXjI;O2@ka-7 zjj~dSTFYx(#Y{W;822|i*2<4O4M{MjI20CixX{{}8DMdsk)w4Dq3172fZ%Ol_0^YT z{|q(o?#V&--H)SWZ9WP+I*z_!X@zLBnS|;1J#^?Pf7-241OB`o3Xh*+R z(qhLda$dVL988=>G`KsXHA&fIznRy`bj{l+F6u{O&TmVIN3$1D^%Ff2@BTOo46 z<~4DbXwsaz7hP0ZZ3a4@Jwjb@e=wwt_TtRBml#@K-ivVK@IdA#Iwv_ zsiJzkfr$2hhJJDz$us`(K8jSaP&vEDjyrwTLIJd2FycYHIT(+B8FTH2Gyj={*}l<;6J&?>diO--)4!ZS%yp#-F2B zc^c8GYxT*Ho-hI}@!~~17z~f@H>n)gJ_ScTH*m9qLk3igHHG1KAJaO+^HC|+hS1Yn zNphDw0hbM#FgR-m$0KIuUI8uVf{&>$(Yb;nii3m>3^WMkmhIyv{^ zsw>bivXwZTf05ih-ayl2#ahZ{$_>&_r&uLWJWHj_)P-yv>%!2EGX~yk640B3_H_P< z69PTkY?PEFREbkPKu+ygOGZQj`e}KPhJ$tE6 z!1SnCFqs;Eek@aLSP{kkvczpW&%g{15^vM*hfr7E(CY(G}Ic&Vm=EL$Wk`zA-n%&Wq;sVr-GKdX?_Mba%xR_KjN ze9+#sW$DU+_KSF8+1BiQBbNd*zvitu-zGD5zggd8;9m?}Ij!pml!S&r?WN$~qSR;(&@$ksOZKT(mt#d(#X z>pI8Ef^F_aHc>ZDRDN4Nd9y61|Aea5mEILzmHsVNg*$w9hqSJnuwzYiz=6b$8!bE5 z?g~7%h_C0gv#2)obb)!m$(_Y}Bfc+lUGG%Au0E!HPt*@5tJm+BCmd`2fbMt9bb}II zdt}XL=s6p5FnVP>qcEO!{^mjN2Q2Ow1z2tSK?&xm1fvy^l^E*!Aq-W!{%TDh)VYl; z)H(&|R~GcfQCCoh^=JiDv?&0diWO=t>mAAm&5gXeEmUFZ9*ZNq+!UxXbPjfDSx;x`bua{rbi8SDHcg3rW6kM7khYmZd7TOMWc~ zam>*%pI(=)73#D?Z**$sYppPsd?VL2BQmtZ-3o{J?Mv>`p6yXQJnCv)hEBw!jbk%j zc6RARdX_rm2yN%;MtN;>FSbj0qZ>W7(z|kc{ahwydUe3T)E94I%5eD0potU0RA!e*IXC$1giG`bBy+X4oiz zH&s5B9e(U)kiwt~2i&(Sus-4K3*z?|grsw@K11~CN0(2f7C$a3p4F1!NmOi9QZhF@ zHk_4VSaW4+2y4uR4~!(c2TF3ethoI6N8&Gql_%rq!Kf%zl^L%=4QND#6Es$p?N~_$ zzn6z0@5j%TBtD^RB3_>_t0X^P#ZDh7$*tCsJ;VM^Nos!3m9gRR5y`!hxd#b5k)&j( z zf9zEfHd+b=5}vTlTo*vBFH)0wc}?~i)B9$LHdB^tvz5xEQt1@@u+3d6O_RzZq)t+4 zgw!EIxhBI76WN)ARO-r(*zOy#rJN?Q5!gV|$#aKTT8_e#36oq{+aZnZC$Jq3EtPt( zoy?ClmHSdF){Y6WnPeT}Gd(5DX>^RAuU90yALohl%ZSFEGMyZ6cY zB!3^<_%P=PX)t?U36JUE=GSL?O`FErZfw^}U731sJ~4PrmK^M3EBEuAB=`4KQpb2x zdil9c{WKq3EZjcIhb^TnAK|1uk=!>BRh4DWowY& z?=(6(#3mirWCZJki=4*RAE(%RDR@n$Jc%>M?C`#yRF>9zPs(K3Q`kqUtYq&YCCT1( zC3U|zrxc8%I7|U};(uxU8z)yG4gdfE literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0da33db3c308003647b082db9d88177f6ae33a22 GIT binary patch literal 3765 
zcmc&%c~nzZ8vpKn?bU}~p)$}v-dD@r~;I>zdxmpQ$?rsM0{*lD+e z(|h;ap34EJy2kk9?_JNV33TAdP}_0AxOc3kfGl2+3<{8YBor5-|-{ zau#HAc9fiG3DT{z6HH%8Tg7(ms95?J9VLPC`YPG{1qvomPGYBa`Xv;NJi!>BZAeVZ znx3G~(#uoGASGdXyfHP~kY)k50xO>(r`wd%BFg^1i21vauiJ1x*o1S3NHGZNN9U-llfcAHuQEaV>f;RE{E#Ho_-P8R*!;3%H=3g0J1QXmzX?Jn(A6s@=iV z=4Kg4*7bn8?I+;dl6CN6MF!Q+Egz2>>CL2+*@=EQxgWWw9HBQAibRZNDjqa?KNEi{ zmZ>`W5W*T&u(>@9%Z6hrn`?mJ0ngyH$4%+odm|WIki*CYN8njaD9CkzklM!;Kb%$% zOV1pkzMME3N}6Zlwgv`W9DfSR_D@C!`nExx?OeR4FaY`ljDyC{dsAgD3^*?S5Uc}# zg72p91!|O*GID;vW|z<(&sZyJFU^G8ClM*b`6!P)lr{9Cis-cL;k%Gru)KNa5>Z{viS<3wjU|TyAllA zF>g@4a%HeSvxMq7^(r;b`deg3`3BGVM-g5%ZwkIYGoL;%J_50KM&Wepk5Ny95rcjQ zZ2f2^ZRT2-k3JmF==Y^6s%!Vyj+-5W-ksZu`#AL%Z%WZ1bBmhdhb^FMj8k#W(_nP* zr)xG37oCz6ou46jA3cz-wL2jWGg-?jUR*~unsVfBTPl)9-)9ZiSjmFR4Q$QCeyH)R z7Eh_Nl4J}WEUL5Kga3BT5&tE?)+XKR1H69PJvMrcKa;Ae21l29;{;= ztO3u8{U^E}Hi8}a%^j31b&;Oi?Aw(U##GL)U z9-Y@eg_y%@sU~MRYQ7MIFIdO-u$ENgt+za|^vP$C_`C%@yD^)_JMz@mMt?@Hi5S3G zZLp`}%n2A3MX^bDo#FnSX6do+r{G9LmAFJ7*JqdSAQ*P@0n=kxDcUObz;<@4DRI$5 z2&q~K&V{w2@T|Al^fr?u;sS;A5!=nSf$vk^ka?%C5Y@iOVH)ONsO^FME%yi z4OCJxpzF(LbQ{}!6~^xvuxl%ZpsWy?@33IW&H8Gou1C3ax_X-Qv~LK<1ian%RZ-tPWB4R`_$yhmR^0!vfgv&=;au$8oqVl>w{wpD-mClc|#MSn-*HT_78C zl1>QdiLV7~P+dikwQFPn&9+^p*sHxH;u{}QU+rnZ%6&$*_pTja|1UYjR-dED&uEj3 z`;m3wnxwxz0JM;XiWU+jvPI<3hji4>Um!G-fk(^9q)AFk@&U6dzby+3D7L#MrfEyT zVzHus$VR`diwdn&9?4glwk|HVcJMDz@zqO~_RvJ_ifOJcT;5X~_vuD{ThU6pkr{Wd zG;b?j-PdmstM=bsQaT`{Ortxxed#-cBC12m{dX)|Hza0%^3|g|mTz#IakNA|cIS%n zVaXSE>5lDOSvdl5liF_!GQkbT1XYZQ*1k)cFRu1B!79$gTKQ^iOl;1hyJ04v%ii@e z0orCGCuUm{ieEYXx!2d?PlJ)a(2wmo#Mv#x&0;CFV8uky)l zr=8dCirgMn?o;Tzu4dBi*pKHkL)P!s)y=wEUpQpL-l>iApSLqE8$Zw=U#9jgaw*@R zc&@@bPvlxrmvX5_*HGkIdC<^WpZk+&Xw{)PH_w#&77zVveYTGYR5lGH858@xc<4|9 zsl&z$KQm^9O+!A|ab}unL}pn|6+K^Wh8vujl`mkBY2Xkc;Xo}(Se@_H0g`;KL!Pgd zR`?dTTyM@^plAP zH8RnWW{(b_mz{IW%i27AIYD}{_(AnSgQ_9-Ec+7agl+XJo%Z*)=Wr)Y2zm8wl^9ae z1j>+@NvEb{W{DG16NPt|JhDwv<+8rYGj>joRZe#;gygsP@poU+2rl3y^I@-GU=XcG z451J4ggV3_;T8i-|D}JOIuHwssMszYfjYdtN|b)34t|6MJ#zdb>R|nnNTepFC1rIo zBQsJyMoz5*oy-FWd3fFbA4L5<$m$iIpb}-qCnp;898Cz6BmNB!bF>fN-NTQN3h>0L zSL@D+I+b7LHiF)-C&FP)6h2C@rd5{H5x5#A5xP#vu@~|0Ao#Zg{!g%=l&S62;s0Zj z_$Lvy|H~cz({7UT_CTe~iB*>MvjtGwt7Lps)SJ;hofk{Bg?1CfllSL&K9YO|8OQTE zynhlO#Pdn~*d)uEKT#MjjK=bOsL+#nS5Ib^VW!|gaKyhke~6pgNHPeV5F*US<_K+| z&=A)=A1*ZhI$>(r=e5FocAUoq_qgcE*$F}3vvo01$-=%Ng5;Mwi!>I0V=Ycd5=1)v zS%$*iEB2YO(Vl4u!AZPcu(yOKaER!-=24R-3G*4a`cy)&QLWX(TIAMI(>#Y`~9 z##o?}c#)m>!rpo{A5ttLzsW}^rGI)t@Z>CEgAWZC^esv`UGXDBE%heFCel@Uo(~X) z!Kp$G(qn_XXT^EUBQ+T=D3K!P2=yl^b{+~@^S4-%%n3TOujBo5I?u_HtYAfScuOTa zjVvHL*DcU>hM;T`$AVL^C;o}4a5SBopmIppXUxe?%yNxSPj}U4WEgWC<|bxjrWn&4 Z+`Wdn57nx}?EzkXpppDNiy(hk{|!(~IQIYm literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1e1b4765c9bc16d822b1bac7fb9e6d103c2763c9 GIT binary patch literal 4158 zcmc&&cT|+u5}*5(WfvA{`{6F4AZ4jaQ4kT8rB^{vDMsnNtyB{g6oZ1INE47CARwIx zLR8d<2-q7o&?F}Il88o6j2h*A-|{f}V*YsNyz??=_uf1A&dhIS?mg`9;3@Jlp$N*F zvU8)@6et3yuQq6?w1{^Z=^I`!;X>d@*WeNT5o*FZrT)|lKlw%4$~03z@hH9_#iisa z7R6FE2VM5KEuTGZ1pv}o7(H?kh8IZ*2azg82+V*bFW_^6gW^L2B7>45DNLEG+>4Y? 
zAz`GD{db`RhIEcVVL%V^^aMcbfs~UQodc7zF@b=C0Ov>%K+3W~PL^#YPg^mfFg7u+ z%9P5VS@YTBCww;gw~aDg$zONrLm6F3B^Zup4`9Lv#QRcoWkVA}QsTnmlLJD7l7nQU z=tWd$KuAJtN?g38xLi#6cO<-w3!h@D{{`kJOWr?`>YeCDJs~Ake<$^*!!)gP#eT54zwraz}emw}$=b_#839xxN5421AA>k)&03#)MHUWd&U!uT& z?_0S1t`S%fmQe32fL%}JVbu;1%1Gp5(E>|2y2lt!eB}rlans9d%IRS;NI>dC2oOhnP*a;64-$pRH5_PO%bPThIzM6VhOc zQ3pEPs){rZHvw;x6KEbj3*xSs5dTCO4o&zBio#W4^B>=UeQ6j3wrIe)Wh|&1YDc%E z5pPTU?zMKPF zJ$NAa+7YrI-A2(V3b2RJg19UW6d01wzjrSfu5kd**DUDD)dOKJ3tCEp;N(p?IDUzQ zOFNB`OJg-?C7p&hs~kY{fe~B|E`Xcw;$da750qxrz-a0KlsGm(rOQ0HwF5(MGVX&x z_GHj#HHD|oGGX^M9ngwbL@%m-Ln@(nQL-)qal9rt72n#6CO#eym$`!*L{ahRX;|i6UErhgiv_ZU5j&c=aEfi7on}^HPQe#%(BlIH9E_p zg}sO1urLuFTRVuY>&9X0%B9$#w>|t~{v6}CIO3H@xnRE62X)r=!_Dks7|l(@ry6Bo zRu;3#sDeqXM*|(mBpV&Wex0a}Y-gtlI;hi)&!~Tke|F|RKJUm>Db1`@tlj9ERQfV=vPbF(8Efo^ zpMWH=EDM2*hfbv8+Rwpo{~?ybHH4o3h64oEf${vgxXR~T@cttQ-FH2Pk`*~9^oSLC zAaOB9>Mp}{=5>%E$NkBIjwML%vI^OFauhXBJc%1NZ^I9|I-*rWj&N$&h?Rt&A+24s ziI(#OZv4DRWw?4SCr7*i_2b=A`W{cvn#b3nqR0WCto9CA@gq{}GzOqGCK20{HwSfZ zmzTA=*hMNVo(}zkX;{L-0OanRA}ze}J=~d+sw4L9#F96*N`-G%BkjW-v7po-qU}qH zdtP&{lwFuJafMSSC4M!Gt{zy6sh0Hc%r9?&6@z=!U9@K)e%S#2?V>u|Z_@*`XfnZ` z_tuZxcIXlMqf|`9DNTV$+jb*sqdvn;E7Byh0$6{DFNfj(u zS`$B;l?+!an$X3lU@$xHMqIzAkKHuyLze`Z*r`l`%qA5pQo1TzO1)s&gx)*hNQT#q zGG(LSQuQr-J4;PAi>HE$lMC_Hwcn#tNhK_UE-$bSyv?fScFNS=Ed#^GSu#CVsxU)( z33~YKS1j(e45ZDIRbE|0q2ZU0vC-ulIPO1i@n^@yX!4KcNX^>@8(tk@#Tf zXwAkvHS0{az@|1kU(=~T*RQuWr%=apms6Qd-KJs#-;T)P-nz{tMuENA0^8lWWiuiN z8~l##-cmkGeCtG+ZT;4YIV*pCFnp{&uX6rcjH6=Lu&vrQTT#!yuOYw2ahrzo4!g#J zI@eOusOx==h4mg)HaRNxO+}5~4PK4@{Y}NqK6`>s?yzs(-nw*u;=}9x%_Uz4^ki}- zIkc3vhny(TTRPBEwm0I!F6VNG)*W3jLmg2!23pJaC5U@YzDM`#(fLY(t~jgh6X-cK zdgG96x?%pL#pk{|FotZI4#w3tBygS*I5PcdZ5@NwO{2Az546!}zc6TLEA}Of{McVO zD{uMSRWG03y|gbcV1C9Zuy_{R0!8bk6oiZRZws>BC^yxi$Ra=3e)B|=@XP!2Lmcze zL>qXs3qqX>br$Ykbf6&2wZtfBplEhsxO@4m756S5D2(u^oWG7mSr$butZ~d2F77Uh z@~-!&b||(ij$YjC(;j}MyEw+@YX<#g@$$KQB{%yu|&QSl6k3riwmqmq)P!eYZ1CPdSo9%JG$Q3(l&ooT3uYMq-|H~Wx$6k_hi@q{f$X2e^;Q(wrR|0&kS^Ek;si1Z2TBiaxOAv^q64N1t!5hOHx7|X28bWdh7Ms$!NtW@VZA6kzjyXw+9NU+a?ic5fLgPs4Wc=y)v@3HpW0t^I z7Y8M-NC``x79ti;3rb8(NK;P@OH7JNh*vi;pKdbUSm34#@csjb2mnv|KL`H;bMo-^ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e9af93b02ee386e9ac98f1c9aea4130450e7a99 GIT binary patch literal 3781 zcmc&%d03Ry8h_9EzHjE6VMd1K8)t+RR0c$lO%|1uZ`c(LL1l4)VG#%rVnAGK02Km3 z7R}W3qTrgAhT#&JY@(S)38myU^L8^tE4`0(UBc{qLu6j-kNZ6LxgD5u&U?=L{@&%B z;rs~aBiuNebLA$6aUu@P0aOdToonMS#q#;GaF27~Dhl0~HoxCs)t=7*XUVCYI0g9; zaUyFkaA1e_RP2y807%DT(kl?AV4S8MS#z}76GRHNN|K;kk`$M!)2DJ+sZbt9$|j_V z*06sDrFJ5*0f&Kf3Nio?xe#&DVMG*y^`j|}AfP2;8W=eXBJwdIADNu>l@oQLnGFsC zhLRicBzzS+w4-9FUv-ourDs>krip_wB}`GJrypu=(!b^8AWzCl%*#w(k{g$#%hk!# z$RsT(E-@=3FLQ}8%Y#_C7Hck2nrqnlzwr5ox%ZlJUzvq*M@dyk_TREUq|-rFvmoc- z9Nd`NKv%r^Gu%5k5uJ42Y{HJ70efus(D5U-!`Ov(bokC6P*dnR==G_g_s4acIJ$IE z>q5tY+r{gUNuR^-Z#<43*q_JlSvSSYE_6X>(G~Pz-7VxkJR8?$CBcKHU6jOODzksf z1vuFH5RHZ~Jo%I>dNWjxX1AY!_cb}Fqp%wnHBnHxY(DOe4218z?qbzj!PNFP8F;-L zg&KEt!k48R;c-O{W$RLaeSIb|Y3r>-H%}i%ZfPxaZLvrcX_zM6oNXn|7I2ZjzB>7cK{<<0nLBmk_%9Z*tjmyTNtZ^VEpt zGKd@~Lw1`wp>IY9$~Bjuq!wT1vWqLKiuD4?MhjpZ7GnFUhnd7PvCQ-nDp-Gb9$N3j zz_?FGFmIlHj1CMxOF12=rdIX_!+SY%ar2B|^t!&38ouyzYMJSmC^PK~{L=R&_}1bA z`pC?1#P<2(Y}5D9@XRb+cWf~|z;!Xl-kr(l8Zs312i~@vksgiqF1>F;;SIus(IwL8$5HC4|zLURZ)hV8CFqOUW_!=^E*n-?FYedqh 
zTWsc4R`T-2W_Ew1Ejn?|70=&hBFULBPSj}nHvaonJN!z3rCGMgJNWfQ{cKdVmdQ}n zgPqed@ls|fG(3JC&tKUlE-__9jXyr1^Bb0dwuHA^@)@+5! z&qu+YN+)_$xGn2aQzbR&XvQzb{tI23?#+(6*td_wril)vUxG4TH z3Yd89Wcao*OMIdFGidfci{G!gK)o~7#?)@zCfa@WWvbpjgz6eNu9$qJQn6j>$8-cG ziL5+J(Z#^IOxD&A^_>1qR7xtKYvu97{LL#d-jIo%uV4tu4UtXp2$p=^R4=#b5Rb*qX^ zRPL$0t#zwkHMP-}s`&afWfl$*`=Zt)N%$V+LenFSpiB zTEQ6YqLUvhICl8zu|@?Q@w(wkz%wRoMitpI8=QH)E4)a(81{r2Pxq z{ff;!*UlLixi@@w#F4dg$44EC?_BR$KF@jnvFz*JN6MpI;#vwB3$Kc3x5UIV)pn6b zD`GrSKHd?y!E0TtXZodgl6#J>o9~_VWoxm8cjbaf`o2$gM;@zOI3@q~wayLRRg0cq zdGF5ko?}&U(+Y_D2?e0)nMT|qOS1x%^f>WYbO;q#r^geAtnyQ{0@}j|6~GzJ;4~~~ zLrHb)ku}oci+i3kfQ~UZq#y6n8If1|2-Q*8${*2YZs?JsXIhyB&po1N&q0r5MTZM6EW^4awwCSQ4OQKyFaEe$#WzWoOXOdg`H`(sYlu=I|ho z2zmO=m1L$Z5jaJ%p3X?q=Zcdvl7;t}Jidj)l$B$Y53Mv;RGMBRVfp=C{lgbFoC^r! zq{ipfflbE1An28|v<@+*evpWS8xD~0>-a3C=rw)^2#aEMzv>8-;@MRK@`O@^)SGFl zHGf7aOn)O78OcjhatEo?08;*dGy`Kb+r|=7@htwIc>SZ4)fJwg66q6DlQVT3O&FFP z{vA=XvkrU9!jF&*@WiU8>&}S=m7nH3g5Hx8;V>r(^A)UVmGN{0GKYzWp;NLOLGWz^ ze52$4Ocs<1wY56zf6OBPA_Dh+xx;_iO;X-;P%1R6^02KXfZAFm<0B$oh?+cju~a)~ z7omCbTAt@;laJ75@O(b6P2q!hK85#BF|KKe!c1Z2&-01c|J_&v@wEc+~-{dIX%IBmRmy9yu75KN$D}s5vjs{W;ls& z`HQ48;`tjvND!nA;u)vH-c$Da*eK5>Nx><+POvw+C+UzDGbBgMoh!)Gg>Eo)W9WnN zq?0vmt}eeUKV1Co*JRJ`y7HPkWM<}H>J1KZxuCT#}h6(ycrTiiP zvqO#bCdDRgsPsG^AWVZZgc@YW22FY~!F?I2$wWbk6ggk0KM8U0P{^9rXh|X`*^&Ji zUYkF7PR3vbE26_2D>-OnOmcADn0icRP#*DP%u}!@_{3B=nn6ua*<|Z-Udl_(bxzFA ocGl(OWaZl|P0rD$Wi7FB^P1o`!Brh*4e;A9oS*^1$v?2a0DmGVAOHXW literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d112e179eb9bbb8372b524bcc40eea2e5cae3f76 GIT binary patch literal 3997 zcmc&%cUV*B8h^iYPLh+bA|z=7Q79t}QBcIK%?UC^K^zRV$dXMEP`tIa2%@NniUSk} zLnQ2NTSv^_2fT!~I8vS#hZwu`H>4z^DdPw!R9u|U7AQamQNg z+o!bh&k8UMzXP&%;&la9I-$yt<*{LLNr9n3NkKZ1^dd4eFhm)%JT^{UQ$G@Yk7NzF z>=q{dFCgDpX_H8JTI4Cbz)}~Rul3{r%VqsCh;mGW${q@OjxPn>f4I|CTd#US&?NC>EZ0bjo?1$OvkDDjg(;X^*G+C!mF z6ST1?$z*7*v4>N8UBNVV6byL1MCYqUL$*|mtdE%k@xTs9$%1dNC1jZQGDCkhs=GzRWY1GqA~5{h(_VT4^B>a3I^v*TqT*ysjk$2%duaU8_` zB!XkQi(to6DQtRv7@Tv%U`d53oLkI>13k5lCm+7k6FP!qXt<0stUe?Y0&K|#QN4wfh;cpNUB{Sz5h0fT0RVF2y=){=R&qE z3)<@I!FII^c>m6Z#!PFFWwN0nHwZdz>cPoNEV#7a9(j}&fO+B>c(%#~%q4vseKZGqJ+*O2N!%KUA{fIyORP%Gn=jg`S_2;FQD1 zM456imi((Ly7cVEutG~Wd|lTPZZ-N*=Wz@R&OQH}zvtC0q*eV3Ym8tUo9Fw0%(xVR zXLts1%I3b0j-9c`CO2mB6GmIJ8}ur%Yd0*g10p%=^cofH`tUyKyH&x85f_5?2kk_w zODH>Z#(UVpRn43mu2#s%F#~lw&u5*>7^lO(YXixXeAvtST#(=XohWc^9%S_z!=Wr& zqG6AVp53-BJgwHFSX96Oy5%u}H2>xHKx{kqD-g`NmzIt3W#S$Fu{KztJ*oIt^ z+_a61w%}JP&tYzRQ@|#@oGq$VlJSo};d=K^(i*NR23DFj=;8$4QQpn@#TQsh6qDhH z1|_F!>t#4Pp#wXx`Z?6{vfYfRgrGCBdP*)jzk*~QnH+SSM!Hh(maf+4`}!H9zUOw;g<&Cep6ICc3$&?a1G#aY3R}?{&A!Pp#DaNdcxQSNbnh!eU6H}y zc;1t^`n?TybNmT(Ns@}4PL*hFG@Q!fP+P9$6^=)-rhjdjum zhG=KfcKqXm-=NcpIqZ>*J|JImn_ZyYpmp%O-C$cfL91!16dZU9(EWi&SnTgwkUU*S z^zjZM>izWr_UglR+*voY@qv@^X!xx>WIR`a^)8kX9|>l$iv-Rn+~*V4vq`LBK?)9c z$%(7v=%cxU8?R2Xxi7A8WA2Esn@QQM{3+{M&sMj>S9e{}{=8hh^2WuuS68_Jd|P>r zPXz4Z7v9<`A)iZDyc*W%qar;=wm|>MemK)oOX%(&V7_(gNZ~=!R+z&1>uZu=u2xJi zA854fCQJnMgulFi&~z3&Q8A|}9D=(HFj6efPg#SL`c{5T`D@ZRd{ehvc|mHrmUwJL zPg6nKdOZ_GhL|c`yTQQ9N9EUCn7-*f``}Z#RMEQ4MiUe6^)wf)-(oCJB_)c38QaX< zvaS4E4sO_P>0RiyTT#4m=SaW0h~AdsO*wW;S~4U~hcb6NL|jz)w;tM@HzB_7)NZGe ztbJ2fKDyW2TC(N92PqiWaE5AYfm4RQb-)SLwj$T9rtW)Ylx7#t%pDtf^+f6R60dy4 z21Dnv9i?+sKBWO|Wjo7#Yl1uWIG69LT=;py{i|)|In{wpsazwMirm_eQ`y!F+bee0 
zhhHdk&vU8V(-_@T7kRC{GVe3xt(FZ&u2p-#NW6Ekbm7UWeMgfY_I2dBR_{N)>gl8V z*G^U+`07)@c~ZBU{N^;$z-G~@nu6B#d^3-|ZncGNo5T)L*H6_Jo!nyLv{C9_cd%o- z)!ed09d*TLa_mD+?{%*~)R{LivHyBUeaZO)@-*IXj|Np&k=u5g#itueFO@LpoW{Lh zsD_|hHV&dIZ7!G5=;)J&K9|#cHRw(ie6yvOk2IQI`4heTO7$D)sf*u6mmDtfstrlw zS)bl`YtUtxGN};LkR-RE6oVTZ8OIQ&utXvzGBJr078AzM51MPFa75RQMLou{ z{Nb_&H5+^LnZ1)x$c0m1Qz!DMHn0#8h*&gv`k8r`wA|B7?;uaKwOYjFpbbNw-ae&` zhLEQw>%SvUdatQeOjulal7RDkRr#U1zV&b7K7qT_RL)jf;_t#|zF(KP?D&tLv)fdB%RE$O#ydVoh zI=%@LUk&`9#e#^GNF|>CV|M%xQPuy;^OgNmFG!Xo&b`$A^^K3U}@084p&dflErYKKlQ1~-jwVtwP_R+y(XO9W?U9ddVWm2@ipHBp% zkM*YW`&Se_sN*@QgV0IPqKT(oGP)uDi2=TjaiOl^R1o8@hNsJ+#eZ<`Gk-p__h1H1 z>FU%q`9#xailiW4N2R~tY-NC-x^x zHcB4EpX09XH{EPn23t>2Gnl1o4AX=70GCNo!DCm_JsHOs(M?Wf`cJ2*xfJ?Lq4uOR zr>mp&{*)qFb5H7I886y~Qg>40q%Kl(t}fj_)@3=3qpp+jr{mME%++WtL2MEql(1}h tSkkDF`1nyl2?@$%lNDhJiIK`UlQH8*j~Q(*@st9*{)1x#fH(ca{0|mk!UF&R literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f3f7d2a7d0cc4ecb1608dee4e76b94cde19d2b83 GIT binary patch literal 4018 zcmc&%d011&7N41WZ*r4>M9obTKtL%0abXkOP)XPk6~PLq6vHAQTS{28?obpJ72E(p zR*_9m5LyM5T?My-&jq#0vseLN>-wI1efK8C>a+dhec$_D&zH>1nK|eD&N(yRmtQ=j zo)#2N*-;Me6iI;*02r)Z78|!}x1Rp2qp7})1!Ygd(-hlXZ{B>jHe|0&-nH;)HWZ)) zlxRG~rT7$!V(Hod6LQE#L=IU20JxtcW`qdq1Cnm2OW|T`z~YNVoB(-LkbkIL8A>4` zSJ(oDolp`gA^$9tcs!jWpb(%3K0N_I>mHPyJDmfQvk8s?4g{JUHV(8jNn1W`!N_p+ z#JCA5?cXyNkwYgU68p_YNLTUcE`7)|QB{F3EX4^(dhz#Vm}mw?2gXGPN5%RF$z$c3 zVe}#_$UiVTA}%sYRoELSd;}%uG07vO`(I%GVYy90s#EAreNU(=&gVVLBb;{31u<0% zP;F>6o?7-RcwRLLo|=%!Bgf1J2S*>omulyM&;5*WkD_O=-t`=~J|!J5_wVMJnq9=U zxsC-ESMPyH{2V&D=NNoxd>&aw53*NuT?FkLZo$ri2hj2}1uW~M0r*h zsA_rz#{hRUt<4;kxoX1sEhoT9NepaV-;FlZV_^G=#i-lM5&UfP6p0QwV|mRQz~-no ztUb^U?xgGi?^0v1(PrzA?UboR*iIeR;OQD@5!Q(3CbL*xYiDsY6*}Aqz1u`i%R?ZT z0ExU;Zb)Mi!s4g~;B5E;bXfLr`v)cy0-y<|tZoD^%3Xn`uOo=in}8mESr0aMHey%2 zW`dOFW$0-G0WZ%v18i-L@H+jcpjHr%Dw7?6p2KW#;?gKgu>CbON;(R(9AAUK`&R+X z)*Oqb>;M^;f*)VHoAoMP34Ur%1p7``6WK+JQEsgTY%s3JtV-TuOU~cNW;Tvy8z#r2 zYO`zXwSG246Z;n;!rT`-3zT45Rv=h6;6jX8zY>f;c7iqh8iY^(ML#xqF)*Js1Jhoq z0enYhao`E&{KAS#!R&A!SQ38==}E@0{l#B$-&_=9QmfT?d9)u&c$J1o)@o_&d)Etxn`A)?K{|`;{eX$OWZ6M3NM#Fo} zd}479j}tR-EUQ+l65YIQgz|)Tgo1YjrMI>5z0+hwgs2c0jbFizC*nc%yS-@fs%G{^ zErO+EodOf8SHg=alQjAFO+ms`Pu5p=ouE(d9_ar~HrU>y50pwFSKnhaX_mf&%WG{w z%NIR{y>64q&wscNL%HL*=WA@(3)436BYDTL+I!>Ri&K%f_|h%lF(Vl)7|6iR{a6pX z}cc7E$ zU6|>q7Czs~fwY`|1MBQDV=1xogq@oae&`a8nLpjF-BU1wvstkbwqx~d(|NzZH4kop z{kxs8QTo3ER@6iGhB2o>Wq1ravST`|-^JyI3oRK_Uj#qs{XYPFGDZ z3aoL54P)QJtu33$EelFfd}}!C4xx<#xQ5t;v<53jo%a-0>SfnW8&3ZSb-< z5p^btd0Vt?32uH0+hC{Pu$I1MkSC~ziJD>Ha=~3}FH2uDS)dK~#O}gYANT=wDl=H) z>O6tn(jQp`8nwIw_p-qFvdO$ATU}tqT?hvrKS7bNcpzb>rf~Ic3idpIfZlz%ku&GE z2KM-j0*>mD>X*R+Dh`_3>oJ5#Ho&JUZ{RUrW0tz7G80#@-`4~yv?WDse=s%6Dzv-QxAow*?8%C|XS3`}w(p<5;z@r`YsrqhS?du; z+o3eAz&>Tf*hOun=|#?IV_f$+lH;)*R^AW#U!7q@$+zT!wjIXO8R9 zgBP-=C~aZ%&aTTjUiuNbaorGlwCR;{8Xc~E zLrwXE77HpnlZV2Of@)>VNncw3RdC~Vaix`a6~XeLuk&?OA7l&3;XpY^`j&gRn+Xm z=k|xh!j7_i&;7_?1PF)+K!R-t9y0HfmVWibJm-&ipNO$miHIDsVOZ3syR^}7To9?a z;NaLlBT-uKDOE&pR7k9v9}Ys{Ae3A;k*qhNY1F6T{{-}psuibN0})FZ7#bWYr*N7^ z8lex^sFAMwp^?EXKd%6}1M{l7e4$zS!7gsH|tu7nh(8VLXp>xwj_o}Tl(r>P&7 zC?7Xt#9AtoN~QDZFC(+1(gdk2MCv4!hDhy0RC_YoU=Fjgmr7lknbwD9T2n2ROavy7 zbl%tF%*>|H3&*)GjBcO6jE>Aehn7m+nL*~usH$_RIirUMSkAQw@Lm`f*gEt#{A{FRHmt)qgRL!@%XUjLyPZ_?)k+RjPAw^ z>e5xItMdt`_hhkhZ|i7ZpSjVCd{m{=c=Ofuxh?oOAG%qze2@>*N|_?ad0{MbAa!+T 
z>{UhyL-_MuRsE)$P0LX0rBVlG=^VlIK(WYa>hb`~6?9J~F-CNg6PW(fDXK4p-jk_3 z>CEZs=yPAGEJ1xws$>~2+D5ACq}oYUqFEz=f88vc-tXauN0lK?s72p#_5j#fpN80*V`F zuvS0?>#B+y(dVdEtJYezxZ&x$H;C3}d(M03yz}~<+2Q3U1ysi7;p<8Z=bKfX zyK(Fd4YYvnZbR$ShBSxfn7e|6>e30Rt|$PYE5VpiAK|!A<1?15=yy)6`)9QasVLLY1d44yAf8RZ8&7>=utbr&xuKjM+-MJuB+ z6fr5PsnK$kTrYuHBt%b*QYL38QnU!yV1issoJ)w8$ z1fJ<*u;_6Dgzx<)TzaX~4o*KfeRZ^6b#>WyxL!oh(1Dsgn4Sf{0@NCRG zbasawEHf9Pe)U#BJhBH$=@0I?1YFo*1-!l%U}EYH=MxQKNw_N{E%1cVt%)!xs~7M} zOyEjj4XoGCgr4?$(3u)@WZAG4Oc(foWy2Xrt#yW!-vm&vKM9t{n#015eK4vt1|qii zf#%5^Sl7B6-O)jyIjsxVj7*@Zu{RLqBf&w_2Km005V)ffL|Yx90+T?wnFi*(-vRev zIJ9`v*qyJ3LD3jf5bgAa{9k@Ti5W()i?9MkJ`a}JkZ^SGUa*wx^WRDxCdDQKVN1(px&;Zo!>xb-dtvQ!~Zn!g_2Pd^4Ly*I-; zzY%a}6^7b#?!y3e59m|l1W%vMg^gGGf>nw!dS3Aob&tM>R1yTKDVE^1BoM}R6e3Oh zZer1vckpV5GdgHns!R3phmE~A5)s|kfz@PdLbm=nY8_lm-1k3%`r9|-i+%fnW6M2I z5T~$)lKn_!%tO(K+{mo+&2TjDI{KmV5pwF5img>f!^S@u^??9v z==Tn2i?1FU+f)w?;xu$5_X3u80EcV#4(PFK8z$WDjjwCugJ#1~xTP+E_lwf--u5|| z+u$K&!ZI_?FUR&E$Am+~+I$X&luX9@j@Uy+9SodLgWP>xc23F+78=18>XM zaMJ0n&Vz@8NK?>*!Ltv+>#e?^7d8r#d)i`8rW}C69}eM{$Bh7WV=UHIN1}_PkHfM( z!Kk)Z8)!_YV>S6Bq31|{s6X2Sx1P5b3`VuX?WsEecXPm%v^!84YT~G1Q> zxzO)YcXIXmiP&0=BdY6r0N--*J(_g-K0e}5Z?0wjbZlFJ1KG4ad9B;6qW?MC`k?37DxZBJ2FU(Ta36 z-fi+#Jj>t~QY75Kru|lcJ&MgCYDdcub>9t3HTVv7Qz$WzS3~8uu|x;mOn$p}G%4Sm zY*@9e#?(J46m6P*9qTFX!(E#oK^@H^Tsm+zu~j)4%Y5aHF1Ft^dNTJoui$(HZzp=J zx7_R~*Du?Ezv}&u$VgI#98HTkIw6lJ#Z8JgyQPlWI<7aWKjnZ;T%*fNv+Kvv7}Q|@ zxM_{e8EI;is=EtYG36l@vebi27FL0^O%``LIURPtUx7`W)yOR{AUT>BZ-~s@Gf=bI zSa`uMa8+(b?-D8=enA`!12Eema28%U?n0!R7ej{(7kK zLMV2@Aj-smSA|vHbHa39oPn4(&FJ-=Bm%3RCAvA{46#&ZN$RdN$0IwUF;oyl#XaZ` zkM1|>95Fr)hh%HGYI$VO&BOY_z`Ku0lYzyklIw(-nJvM&1y8_dO)m7$-^K}0O`=lU zvU##!hUk9ZCr6*0#|M3!z)x;#;l2m~GQwjh{HjrM&o8|Ub%Rb~4aMj2T_Y?EtQW5& zoW@?mtNQxj%^mv;Tp^(cX`K05JEl*s(zT&0 z_@BO&3APBFIa3gIPF1e?Q|#g;zHBwMZU)UUW zWaHx1gHrFDDD&J@R6Z>0_XlCS=8x(1RSuBG&T3h2&-sSdK^pZ~OYbEXqn0@rE-Um~ zVYlV*a^H*Mz;d@a80~jS5>y>f?or}*xqooYq=vb)e~V3cZSsw#68|f9Q)DyWyd`Z) zcgeqBAR1O+Q?@7O^r|7VIJT=a3758p)fL#T-lzD_v#;*Vu9t0-fon70WHUEq19fX_ zm{B%AceUpx)FEqn$beWfxD!N?7DVy59v!fn?w~fNl^b;sKQ{+v13y3Tik)2tNUA?o zChB-vFuHu7;!$gD`YZh@ZZp~*T;7{$Fm=S7_SY}kUKvEV=ixkobC#jpBVSkKs(Ni0 zIkM2CU$7?2D9UTG*^ueaUmHdH6nk6xu5g}V9OG9qAZ(xNjd5&1*}#Mr%?y*c!17@; zUOs8#q~IDheUBXBtzahYv=>G>!e3`H z8F%WU9P_71Ov1*3TB97ov>oM$vL(m&seA>or}xC3ZL!M+P>|V@(X+-Yv~ef54`sTu zFR5MhPn2zSJ?+ISL4^DMX*|7bt2Czk!LJ5PF3jkW4H{9l`R^%AxySEhFhPjsdR~W0 zA!}f296d{=462@8=K8q(V~v-UEr-NQ2bj}XC!6&7oPF^W2`MbIj7cYw6Vg@OnB*9i znK3l1f-l(BS5Ran-qA}um&;PkKYl{~lxxXouK-#{OSB|Z9Un+Uv|=Qe0y*MQZc!g7 zp(Tb4jvSHcfm#2);CwA#mWh6dl_gC!F;3L;ej!B7Abu#DMJ^!Zq({ZaDC9K3@N8@B4}9I)JYc(tJHxoa1rvVWc$(8`{5kz) z?LJ)zjnSL{H`bdFYOjZ-^ca(&>zb^)Gx06h_}aw(4i*IZB6Cr||N5VazlqHMU!LN> z?NJhJw-xA%DZ$fzrT`*yAzvC4G$v$d=aVI@AnaLjk$Om_(y`2&6@RHTQ|b{X^^!{C zq@HowGY`gKG`sSYN`2Xx*>}y%PP5QjcvbxXw z)5AktQlh=%q;l3@o1THgNLbfCXuefx)Wf#YBQ@GPSj8%&z5%Sg)+n4`II#56gR?LSju=cO=b9$HVPat0lv50iRicHWb=Sk{ZNk!m~H>7)hO zd9H;Xrtr#O@@R3g{!DzvmA#ryOAuP5%G0K0#HeheQd4c^X=%z#i|H|G=?Tgd3rANw VM>_{mfH}a&7c>w6GUgYKe*tj26RQ9K literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + 
"end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 deleted file mode 100644 index c92d73bfb11dd463e1c278b8eee0d36cfefa5610..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 263 zcmZo*o%)9X0&1sdba7@(>=kV`bvef1&Dh29FUL4&+AEz55IfyAV%q7pHHtts=WI`J z%P;F+?F1>})EAxM+v^^@9;_oYv!e0!op|2ZDH`5P-mIxzoKG7a*UsA^w1)vM#8Gr~ z#<6)hD@))aoH_IV-HAJN=mCbuia5^XM{0ZQ6-xqXWA$C00kwNmHX5JDVPeDN}2o}7L#eWiK;B|Tnp diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 deleted file mode 100644 index c3966bec265a8da7ff350586855e53ad69f800ed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 31 gcmZo*ohrfr0ku;!yqUdOyqUaNy&1j!13{@C0Ar^J*8l(j diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 deleted file mode 100644 index e419c951696721f426be5649d731716ef4543547..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2840 zcmeH|?N5_e7{;HzO=!iJszYRGDLN`>M?$8|>_x$lj7L!7jIZf684iQt?-1xvMY+amz4s^hfY8<)=xSp$N#`a-<0sYWlult;H zo%=bx&;1X-{@G;OLDZ|V!D$|ryiWS9Lr8w|Ysok%j&AfdD+G0(9SwVqPOIvAC zLl`r-W+({Cz0$@YKh+{9Xc-=gB6=b^J)E$@k+Otvm%k^BusJEXX;Z3ZKGE!Lzafyw zu)=xxwPP$2&pMYI)URKiNAydN_wfneANRmd@6m_j zP+}i_j zrx52*Q9BRww(7f9kvNZW5c^GZwIkksmJ6-B`xzOg4nI1Ab69%l_Hv>JRnf@PRHZ_k zHgixn0lrsSS)vRSiJ7@Sg+5YiqB8?^#WFKqr86=z-h3AL;7dxC()rRYHME(LMwbRA zNtr6vKZUgu$3{{0r(PlZQRmIbi<&szjR(?SUW2$t6b`MDbwLXs-UQp;iN^w?>NfP) zMl)Fbb}Q&w5ev2)I0xG9tOJdc+2986ark~2xd+;kZ~BNd;}Km`UUd}1>3=} zKh9&{ky{-Y-@#Xcd80nCZV~E-YD^MiK6^$5>cTxBwqEWg?qMH#+>0HJ z@EboVLoY}@y9arFu{RRWF;gYRI!nbF>~pO-4)fJB+FO##ls$o~`Zg*H3(pYGd? 
z^}MC`kmg!hFM3y~_guAa!gzxx73=<6A`RUw*mr#*3-fNYHbEPzzs3I$DiK554E%?u z6Oc!>I}_)4qWmc04LM5?&sg~eSa_oxe%94dm{)!=559KFh5Q~5urc0o;%oG%f0qb* fu0$wmu0;N?L^!icWbQ?ihFduIqM7xgDHHz Date: Fri, 18 Oct 2024 15:29:50 -0400 Subject: [PATCH 042/105] Updated ray tests Signed-off-by: Constantin M Adam --- .../test_cluster_analysis_transform_ray.py | 52 ++++++++++++++++ .../test/test_data_cleaning_transform_ray.py | 61 +++++++++++++++++++ .../universal/fdedup/ray/test/test_fdedup.py | 18 ------ .../fdedup/ray/test/test_fdedup_ray.py | 60 ------------------ .../test_get_duplicate_list_transform_ray.py | 45 ++++++++++++++ .../test/test_signature_calc_transform_ray.py | 46 ++++++++++++++ 6 files changed, 204 insertions(+), 78 deletions(-) create mode 100644 transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py delete mode 100644 transforms/universal/fdedup/ray/test/test_fdedup.py delete mode 100644 transforms/universal/fdedup/ray/test/test_fdedup_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py diff --git a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a3771fbd8 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import ( + jaccard_similarity_threshold_cli_param, + num_bands_cli_param, + num_segments_cli_param, + sort_output_cli_param, +) +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        config = {
+            "run_locally": True,
+            num_bands_cli_param: 14,
+            num_segments_cli_param: 2,
+            jaccard_similarity_threshold_cli_param: 0.7,
+            sort_output_cli_param: True,
+        }
+        launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration())
+        fixtures = [
+            (
+                launcher,
+                config,
+                os.path.join(basedir, "expected", "signature_calc", "bands"),
+                os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"),
+            )
+        ]
+        return fixtures
diff --git a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py
new file mode 100644
index 000000000..a62105b2c
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py
@@ -0,0 +1,61 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_cleaning_transform import (
+    document_id_column_cli_param,
+    duplicate_list_location_cli_param,
+    operation_mode_cli_param,
+)
+from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing_ray.runtime.ray import RayTransformLauncher
+
+
+class TestRayDataCleaningTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        duplicate_location = os.path.abspath(
+            os.path.join(
+                os.path.dirname(__file__),
+                "..",
+                "test-data",
+                "expected",
+                "get_list_transform",
+                "docs_to_remove_consolidated",
+                "docs_to_remove_consolidated.parquet",
+            )
+        )
+        config = {
+            "run_locally": True,
+            document_id_column_cli_param: "int_id_column",
+            duplicate_list_location_cli_param: duplicate_location,
+            operation_mode_cli_param: "annotate",
+        }
+        launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration())
+        fixtures = [
+            (
+                launcher,
+                config,
+                os.path.join(basedir, "input"),
+                os.path.join(basedir, "expected", "data_cleaning", "annotated"),
+            )
+        ]
+        return fixtures
diff --git a/transforms/universal/fdedup/ray/test/test_fdedup.py b/transforms/universal/fdedup/ray/test/test_fdedup.py
deleted file mode 100644
index fa46fb071..000000000
--- a/transforms/universal/fdedup/ray/test/test_fdedup.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an “AS IS” BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-# There is no local test for fdedup
-# This is just a place holder t satisfy overall framework
-
-
-def test_fdedup():
-    pass
diff --git a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py b/transforms/universal/fdedup/ray/test/test_fdedup_ray.py
deleted file mode 100644
index 78ee7cc04..000000000
--- a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an “AS IS” BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-import os
-
-from data_processing.test_support.launch.transform_test import (
-    AbstractTransformLauncherTest,
-)
-from data_processing_ray.runtime.ray import RayTransformLauncher
-from fdedup_transform_ray import FdedupRayTransformConfiguration
-
-
-class TestRayFdedupTransform(AbstractTransformLauncherTest):
-    """
-    Extends the super-class to define the test data for the tests defined there.
-    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
-    """
-
-    def get_test_transform_fixtures(self) -> list[tuple]:
-        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
-        config = {
-            "run_locally": True,
-            # When running in ray, our Runtime's get_transform_config() method will load the domains using
-            # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration.
-            # columns used
-            "fdedup_doc_column": "contents",
-            "fdedup_id_column": "int_id_column",
-            "fdedup_cluster_column": "cluster",
-            # infrastructure
-            "fdedup_bucket_cpu": 0.5,
-            "fdedup_doc_cpu": 0.5,
-            "fdedup_mhash_cpu": 0.5,
-            "fdedup_num_doc_actors": 1,
-            "fdedup_num_bucket_actors": 1,
-            "fdedup_num_minhash_actors": 1,
-            "fdedup_num_preprocessors": 1,
-            # fuzzy parameters
-            "fdedup_num_permutations": 64,
-            "fdedup_threshold": 0.8,
-            "fdedup_shingles_size": 5,
-            "fdedup_delimiters": " ",
-            # Random delay between reads
-            "fdedup_random_delay_limit": 5,
-            # snapshotting
-            "fdedup_snapshot_delay": 1,
-            "fdedup_use_doc_snapshot": False,
-            "fdedup_use_bucket_snapshot": False,
-        }
-        launcher = RayTransformLauncher(FdedupRayTransformConfiguration())
-        fixtures = [(launcher, config, basedir + "/input", basedir + "/expected")]
-        return fixtures
diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py
new file mode 100644
index 000000000..4b59e3a7a
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py
@@ -0,0 +1,45 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from get_duplicate_list_transform import sort_output_cli_param
+from get_duplicate_list_transform_python import (
+    GetDuplicateListPythonTransformConfiguration,
+)
+
+
+class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        config = {
+            sort_output_cli_param: True,
+        }
+        launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration())
+        fixtures = [
+            (
+                launcher,
+                config,
+                os.path.join(basedir, "expected", "cluster_analysis"),
+                os.path.join(basedir, "expected", "get_list_transform"),
+            )
+        ]
+        return fixtures
diff --git a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py
new file mode 100644
index 000000000..34f3ee403
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py
@@ -0,0 +1,46 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform import ( + num_bands_cli_param, + num_permutations_cli_param, + num_segments_cli_param, +) +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_permutations_cli_param: 112, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + } + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures From 954dffddc11070366fdf56efe2229a412f8501f4 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:31:46 -0400 Subject: [PATCH 043/105] Spark test data and tests Signed-off-by: Constantin M Adam --- .../docs_to_remove_consolidated.parquet | Bin 663 -> 663 bytes .../python/test-data/expected/metadata.json | 16 ++--- .../docs_to_remove/band_0_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_0_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_10_segment_0.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_10_segment_1.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_1.parquet | Bin 0 -> 1532 bytes .../docs_to_remove/band_13_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_13_segment_1.parquet | Bin 0 -> 1526 bytes .../docs_to_remove/band_1_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_1_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_3_segment_0.parquet | Bin 0 -> 1510 bytes .../docs_to_remove/band_3_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_7_segment_0.parquet | Bin 0 -> 1497 bytes 
.../docs_to_remove/band_7_segment_1.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_8_segment_0.parquet | Bin 0 -> 1530 bytes .../docs_to_remove/band_8_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/metadata.json | 58 +++++++++++++++++ .../data_cleaning/annotated/df1.parquet | Bin 0 -> 6923 bytes .../data_cleaning/annotated/metadata.json | 56 +++++++++++++++++ .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes .../data_cleaning/cleaned/metadata.json | 59 ++++++++++++++++++ .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../expected/get_list_transform/metadata.json | 48 ++++++++++++++ .../spark/test-data/expected/metadata.json | 49 +++++++++++++++ .../bands/band=0/segment=0/df1.parquet | Bin 0 -> 3984 bytes .../bands/band=0/segment=1/df1.parquet | Bin 0 -> 4763 bytes .../bands/band=1/segment=0/df1.parquet | Bin 0 -> 3695 bytes .../bands/band=1/segment=1/df1.parquet | Bin 0 -> 3684 bytes .../bands/band=10/segment=0/df1.parquet | Bin 0 -> 3305 bytes .../bands/band=10/segment=1/df1.parquet | Bin 0 -> 4466 bytes .../bands/band=11/segment=0/df1.parquet | Bin 0 -> 4906 bytes .../bands/band=11/segment=1/df1.parquet | Bin 0 -> 3317 bytes .../bands/band=12/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=12/segment=1/df1.parquet | Bin 0 -> 5020 bytes .../bands/band=13/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=13/segment=1/df1.parquet | Bin 0 -> 5244 bytes .../bands/band=2/segment=0/df1.parquet | Bin 0 -> 4782 bytes .../bands/band=2/segment=1/df1.parquet | Bin 0 -> 3988 bytes .../bands/band=3/segment=0/df1.parquet | Bin 0 -> 4323 bytes .../bands/band=3/segment=1/df1.parquet | Bin 0 -> 4341 bytes .../bands/band=4/segment=0/df1.parquet | Bin 0 -> 4035 bytes .../bands/band=4/segment=1/df1.parquet | Bin 0 -> 4860 bytes .../bands/band=5/segment=0/df1.parquet | Bin 0 -> 3554 bytes .../bands/band=5/segment=1/df1.parquet | Bin 0 -> 4872 bytes .../bands/band=6/segment=0/df1.parquet | Bin 0 -> 3553 bytes .../bands/band=6/segment=1/df1.parquet | Bin 0 -> 4311 bytes .../bands/band=7/segment=0/df1.parquet | Bin 0 -> 3765 bytes .../bands/band=7/segment=1/df1.parquet | Bin 0 -> 4158 bytes .../bands/band=8/segment=0/df1.parquet | Bin 0 -> 3781 bytes .../bands/band=8/segment=1/df1.parquet | Bin 0 -> 3997 bytes .../bands/band=9/segment=0/df1.parquet | Bin 0 -> 4018 bytes .../bands/band=9/segment=1/df1.parquet | Bin 0 -> 4326 bytes .../expected/signature_calc/metadata.json | 48 ++++++++++++++ .../test_cluster_analysis_transform_spark.py | 46 ++++++++++++++ .../test_data_cleaning_transform_spark.py | 58 +++++++++++++++++ ...test_get_duplicate_list_transform_spark.py | 45 +++++++++++++ .../test_signature_calc_transform_spark.py | 42 +++++++++++++ 73 files changed, 517 insertions(+), 8 deletions(-) create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet create mode 100644 
transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json 
create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet create mode 100644 
transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json create mode 100644 transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py diff --git a/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet index 557f866a7c3a83d68e8842afec48e1c9af5e5cf1..edbd80b43e1a3e1ede5676006a991cffc1396238 100644 GIT binary patch delta 26 hcmbQvI-PZbEI%8A00T3FAOkA{Hv=yN-$tEmCIB{E1C#&& delta 26 hcmbQvI-PZbEI%KE00S=r8v{23D}x{d^G2O)CIB|<1C#&& diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json index bf26b5228..ba1f5b0a6 100644 --- a/transforms/universal/fdedup/python/test-data/expected/metadata.json +++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "fdlist", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 11:20:38", - "end_time": "2024-10-18 11:20:38", + "start_time": "2024-10-18 13:22:42", + "end_time": "2024-10-18 13:22:42", "status": "success" }, "code": null, @@ -21,16 +21,16 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 136.2, + "cpus": 32.5, "gpus": 0, - "memory": 23.89, + "memory": 13.31, "object_store": 0, - "execution time, min": 0.0 + "execution time, min": 0.001 }, "job_output_stats": { "result_files": 1, "result_size": 663, - "processing_time": 0.021, + "processing_time": 0.047, "input_files": 28, "input_bytes": 38040, "input_rows": 44, @@ -39,11 +39,11 @@ "consolidated_rows": 8 }, "source": { - "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", "type": "path" }, "target": { - "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", "type": "path" } } diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file 
mode 100644 index 0000000000000000000000000000000000000000..79fe53b6203893a919b73fc9a93777fa66f579a6 GIT binary patch literal 1513 zcmcgsL2nXK5T0EY+$N2|#<$r`I1r^1mfJaXzCeFOwl`6}@OD8b%X5Q>K-?#5&tGsS8g=N`p1p{mVY<}AL zl}*0`NI+mp7#Xk)cK1HErCExlGP)Y==xWrf!DdVq0R-w=89>HFk)(ud{*W?EG4>#p zHuSaLi9eaPx}L+a~1#=*`?K>fS5n=LG6oMqM#0R$rRksW5qQa4H9oR_cm&bTwMh ztZ2Qc#^_SO{Gu*&vA=b-PTf&|IuPAEYJvDI#?y${L zc%Jv>w`yj#W5s9$T~Su+i(-jk9LHF8T*m!4Er;4%&H?*UEygAn}Jf z=Lf{suFqcQk9u9Z&~t{3X(!)2MSc%MKAa1t_)xJ%dTh=een8+z?vNr`i2o!=geO;` zsdjVhjE-`zc5%*}s(pGMsW^leTEG|d?6~c}J#(j9&Un1l_I+=jow@$x#2aO;2iw+m S&e+cY1V1>$PjeMN^1lIVM)pPk literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9df2f3bd54e13d5078be076585302c2d0f4e93af GIT binary patch literal 1497 zcmcgs&5qJg6h18sb|Nuk;%(Z51u@#pVt%M(jF{ORn3)D0N5IHvHv~$7F$2ZIK;{K} z0vA4laj)@Nj0+bodtj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(A|*aNsX9Z9<<${SOFtdE(DeZY6^=Bh4kp3 z@E3S6-c9-oj6cGI2T#Ty;Kjt5x2#m42QN%u=FRusoB2NGg%YpWOkoLjr^sXmn*dv% zc7LZd4xm724A>^SdtaPVnVK}TXlJ(4VQUGB_eHh!0)UK(B1sBZ-=!o|%zaQ3rm@~W z3x=b5%Xis^uIN{i{#CM$E$x4rm_!Yj4CoRV05DIny&}Pppu}5%c&G=|X&!&y!K9Ws%zO&4iPF=5B-UWib&PVCM{MVlq86>j`Kpc8ToVkIRUE3s(@ z;&_BI1rU7Dco^$sb8>9Kjw%N$)+<<#qXDs%Y%n%K0{xpj=0*PIRk^UctwEd9doL(o z1^x4~{(2edO&6cjauz);w-2h+x7v26MkE~S9dY8mc>X^p>;-4T14Yte+TjAOpT4j= zu<)rU-@}Te%gU#v4FD~sMrg74E>di-#A~ tc(3NZy0iy7R_BWU5^g~{&Bn|ZS{zu%je&GVwc!Ysk|b4+5e2C(^V z_h(#v3s4O~7;bOnswf(OdF)mp0@r=W zbJ?x>3920NSH$jW6gOwF`w*G1yNSF8m7!DV+Z|_QPf>gm)<1@j(7(wOUhHpPl{35B8n#_}?*-+n zpnqP{UoRoOtHtNEoJCK|?SU!{thU{$F$sryTb#Ntp8wAYd%@Z8K#{bVcCdi!r!VXd zEPN`;_pl=Avhpdi4xq)<6<#bpKH2w%q+W`Ke?DYsN6OQq3C^R^pwNK+$UN@UAs>ZN6*5_KbWQ9dz7jZUdq zH%sT`WYNuU=Usf5iEvOZ0>6-T@wy=XKC!XEGK4Xfu=zD}>Q9WNgyr9N= swf?Jfd$?`&dfT;r-xT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5601f5cb07d71179df35855265cde6b0634c130f GIT binary patch 
literal 1532 zcmcgsy>HV{5I@_A>!N_ts?V~e3>m=c(2t~15velpl9nhCN)jN24CKT4Dr!DR(nKBE z85kH@85!9a8TcC*PzNT&j2HlM_iVR?qytM&$vfZ2yZim#Jv+w>QRZV2wv}ZngGGSl zck92x#v6c&0zL?UBQjKurHaIWEir3_X)K88YRJ&M1VdBYR2`OQcrhp{U&;f>XlRnG zyaMaD6k$_-kV|YIe_GXrh_2CQ(0tJ&(6<_7@C($J*U06 zSrDAhc=MAK)%vNO@?(GVD<3qvtx;Rh{3z+)B^>PkrP8UC?F$`m&Am-^pxT}LmF*QqeY?AL-OCJMU+Zqs5BZ$ z-WJXtt<=bsBI!~({?v}fBeiJ!sI(Uy*E_{#EnPe;#R?+7ofG&l6XBp7d3HV{@Cpzo z&^g~Gyy?RDacs9)Psf{fr#Py`N(YGVhKq-DffP3-^Ta33xy`q!ID*?DP2$$S?HFRE zB}HA+rM}(WjXc`GJ`?h`Xg@-+sh%qiUNFM_>fqU7W3+1b`>WN#pf?GR8-w9~uN#hD VUyH6q%tQ#l`N0l;sdM<<{{tHM{bm3F literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..02bedff1c8ca8f026eb6a3d7664517b0f74cbb2d GIT binary patch literal 1526 zcmcgsy>HV{5I-lW>q3Oms?V~e3>k{mp&xOhB2smENlO$6B?%%eS@PjHNm28GlP2nj z#2>&v!Op_MU%=|B zyB@0mR4=8fgnL$*F-dOOOVE^{_eJEUqqr!@-6{Cw0>L#fJ}g|byNA36m0_>q*qz?U z78Jip>Tk(-7cxEtk>qsQTbj)zIcs{vpG0yE9mEG8y`Rc8KlU%b z%9-754cilw{JLnoT|}DG#cydji>9921x+4UZM#!rG7k5CVB+4m{*9A%gR|j*D(g{w ze-77hes{r`_f*vHAyqaM^~=I4fF9MtyjXm8yyp%{y9{iTjCqQ4Uci^+3FjDS@kgAu zc)BHGDNJ}uUun*>WYPC6dbE^&LGkCLM5%O^N~4qFZ4vB|rbeEW$d)t7SC*NK*UjWb z`PdvaI;CbkQ#vmviXy+A7x*v};h7)1#llY#9vt+|M-6aMPB*y literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bf131f43cbf10180944b4906799b7d6288c54724 GIT binary patch literal 1523 zcmcgs%Wl(95FI;->!J$Ms@JlmEFx5_F8YXxib&PXO5R0x6Yjz-J{2+lO3pO3uGk4~WXU@5EHHxnkQ{^B*l|z220-JN9xan5DQ2-FpkpwZc z0s1c?#=_bjNKs8)Zk{=VVWrWt*oq=4Uj*fcpuf=N|LLM_Asb*Kpo*daSj27>A`tZ< zFJ!msC#Z77UlF_6C~nSU_aQQ2cN2LHD#KpIu{*tyJw@?NSpOJ8lCxzOG@DB@%5LYF zI2obL0R$g39>yBmm>yfOqsqaGdIjrAG$6K|4aO!&pnsDmyx8BoD(7~$HEg@|-V4fC zLI1p@zg|Lmv&9#*oJUW~?SU!{thU{$F$sryN1VDZp8wAYd%^keK#{bVc5n^XPrtT1 zaP3o3zK0b_mz7VEbpS1<)aaC& zb+dF)P8Qw#cHYH@nFt5vGVlvo7q1KA4|L8C39nfkJx?At8)m9$bxOl}vV4yC?)!K+ zcah>p=>qXFIk)&B6-RJKWJ&z`4?V}#q%zdiEcdPMapLqY_8C*OMc*S7i|YB};RQ9` suQ{(S?BTZ6?{C)}r#Fs|Y-e!R>&A_HJH}2zJBR^z4;m>l00$Gw|Fn&mB5!d%2k!`pB#MZqFOLbF6;GjE^zmb+hY|S^vySTFNV`XiDVqtjM2b7xVh8%y@~vdDSo6 zerMF3qxb%n@s;n!&E`u+E~2NO?1PpVI$gKd5D7ZRd-B{B3FB*DI7}{v2db>6^@A1M zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0mccu3lJd5>j+1w^dS%kl(~5*<87>|n7I zqt%AK}bR=(}@YNKY(z94)L zr+5$pq_n9}VLle3gNNLlz#Xw=nf8Ai27zW(6)1_&QGiUE}Ar~Vt0tMqdK literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2838dd9727770220dd6b3f3ae9f0d4bdaefd8ec2 GIT binary patch literal 1510 zcmcgsKX21e5I;MKaZ!cRs?V~e3=t|;hyIC)ib&PvC2g%hC`k}0WXT`LNs5{{IBBA8 zd=wTI1_mS~J_8doV(7p}Kx|0dd$yYdhk+$0`|jPndw0K|?uhZFiRp*LY0G^=}LjE1yMZDtF7b#M06xUkU;+-BuG@-2Pv+p zEA4Z4II6WAi>xY=@>x*63;KyJ|4$Qb8wmgr0aY9g0QxC%P(ZXdgw%Hc^)-k<%p-+( z>eoVDz%v^}pCz~EC#Z77UompwC~k6cw@U#zF|rBz+t`%d9pp8vjhvcm_nfiKDSi`_ zUxNN7s=tpQ$#B_!G+Rh=(bNLa;i&~X^Tfi@fY|)hzRFWxWS&>;!tQrQ-I)n~yrutS z7bRSr(F;dm5k0N252`e@x^}NYBpm8Japt~w{$F3%3oeESilim9!zEn5<=p{&*;7%z zL={Pwl@GBE04<@e(sKFf*?~7C^;+CxGND;YX$fB@k154Khn`T{p}7u^WijCiTjeOt zGmCy;(PLY==Pdq=DPAg4=OtVJ5RG6)rnOByPNi~hkRrSmmw|bR!)2v)p z(`BCDEpmLAiEvP^0>6~!csYn0=#(BYyk>d)GJV=^nVGiLtBjiI>ILF^=;NW3BgI46 z660f1YSANB9Kjtk%fzq$$aA=6D#yB-)q&MNO})5}eI_j0VdoKw#p?Ou;RQ80Xt=L0 o?a_`k80<7$*O?^8wmUp``bp!#uCbfa4if<01IPGNE8;)=AFJv2-2eap literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ 
z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7cb2cbac4ca976304da8c64e8db39c678260db2c GIT binary patch literal 1513 zcmcgsPj3=Y5T9L^b(=H>8{cL(;XsUTdax~9Qd3MX4_Y@UR=`M=3xQ>UHigB7LVEP( zC-D<#()bDb2|Ri519t?-)3B$i>jWel(Zu=#1{ zS0?oiAOV3T!^nVbu=@|OEyZLer!^(o(UhoHh0T~M0{EzFB>)i5Q*S(Jc3hjStCD&xsNV&nZYcjVL?>#%L_n9q0I)>ZoluY*#zkRKho(e5By6#Y z05QToBfTGn=LG6oL|rl~R$q$UD>HF^a0&+ymgjjIym|GeR#7*7vWNl)tsE4X>a%I?6*Per|!RLM}(kIB0LdRklOwc3l5$H5_K z-^3lV882|otN15-#yJ*x{0ZkhUhMg`0yaFSRgv=&Ma-j!8Eq9`QT-)3AF5a)XiN&c z?-yP2G$^z>#fFtXv(0?AZRRfsCU~|{i2bp`ThNh&ksA14%%guS4%!$pTr;P zoF5Wjt2TX;JMMO@eAgb-C+%G09Qo~sd^q=+;zNZh>9IJs`5}QLxg&~XA^y`K@jbaB zO|=>$dw86Ey^C{ZRPE98NW~_+&;q`oWkxOU?S(VhvPYw>mgl*%%+&G5r|vLgKH4_7 Sv-&|AAo#%%ewr)zk^c>+^Y&B# literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79fe53b6203893a919b73fc9a93777fa66f579a6 GIT binary patch literal 1513 zcmcgsL2nXK5T0EY+$N2|#<$r`I1r^1mfJaXzCeFOwl`6}@OD8b%X5Q>K-?#5&tGsS8g=N`p1p{mVY<}AL zl}*0`NI+mp7#Xk)cK1HErCExlGP)Y==xWrf!DdVq0R-w=89>HFk)(ud{*W?EG4>#p zHuSaLi9eaPx}L+a~1#=*`?K>fS5n=LG6oMqM#0R$rRksW5qQa4H9oR_cm&bTwMh ztZ2Qc#^_SO{Gu*&vA=b-PTf&|IuPAEYJvDI#?y${L zc%Jv>w`yj#W5s9$T~Su+i(-jk9LHF8T*m!4Er;4%&H?*UEygAn}Jf z=Lf{suFqcQk9u9Z&~t{3X(!)2MSc%MKAa1t_)xJ%dTh=een8+z?vNr`i2o!=geO;` zsdjVhjE-`zc5%*}s(pGMsW^leTEG|d?6~c}J#(j9&Un1l_I+=jow@$x#2aO;2iw+m S&e+cY1V1>$PjeMN^1lIVM)pPk literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR 
zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9de62574605a07b28b991ff2c736fbf6e3a7f45b GIT binary patch literal 1513 zcmcgs&2G~`5FR_Oait1{inVMhhX{ey18w6*MFjP-NsAQ-B?%%0U!ufulAltmS!@OGnx|bXiD6x!d5~R0Rq(ZGJuGQBnTBA`oS09YdIRwPJ{;-av)LsQ}&61G@H zfCOP5*5<-)qNH(i;xⅅ7#3Y>fTGn=M?H&L|rl~QD2(fsW5SVa0&+ymga%I?6*Per|wRLM}(k88I9^o+K_>-Fa+kHSOJ zzKOeJGhXDJ*YHpFlyfZf_+!p{ywnS7MQnIRs}kpBikQa{GukS>r1}eT0aU3>(3li? zKPYXms0F(f#1fE=p zrdsyM86M|e?ctmmReSV2QgH|`vXC!m*-^`Xd+tuQozZB!<@??&J9Yi>sW;4;4|dF* SoPLl22!C*dpXMrltj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(M%SvVG!3&e!nK$$1&3xa^%a(c7VhT&LJ0&JF*Z|o4 zwDUWya)3Al!hmhCyZ1#_f+bSA8t%+j8f-3!;(1YRtq35aqezkh=65N@6k`t(Nkd=j zocg12qwP6tT~oA6N&709N2dBeO$>qtOa@d*Gyv$Q*=~u#A)(}3fLj3{h|O%2SeT)! zVGa?CMzO#mv6yVWi_eMOL`tK^*lYN1-%D_qZ`czBT)4(zi!=f(cz)i`qp-Er^A@IKryf6yI?7Qd$DGI~a75A?*y z>AC$TOW-{3$ycsQ;4ggPAh;YJXo`_C_E&K8R(A(hJ{9d!(G*kF-p6kP7%6?7SF6uX z9tA_vzsWl!Q=aFXSMViy!Z`-I{D|`|FLXsLj|op{Rp7ix7X8qoM_Yv#6n{=ilqwXd zG&*_S6G4wOHJV9{Y~9YicC1{cW#umFC)T9huXS2>?Yy3?iu_(#;KNLWgK`o2m7>5a zK>UTy`2pdztCLsR<4)Vob)0@}+{)I^5Z}WY9?k_)Je02xADeTBA5d`wcSx3GR{wF} z2u-R$UG4hN860O`?qZ)QMZ0t!p*U1;CIMd1)5E6!=G+}`Im6*r)Azk;dgA({Q*V&A W9&B6N8Dl>M5Ik^*f3PzC-2VX7ZuN)& literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..37aea5168fab44a7bd45091bfdfe5871bee8d360 GIT binary patch literal 1530 zcmcgsPmj`25T8=$b`cG`@ilG2frB<2_77dMM$8_3u)7V4E1%K)i& z>%Wrm*8pMgB<5QJWSQJrAyE=15|I+B9IU8vkgGsyMidM1s*44H5IRvo5aarHAwk00 zCP;BjU2Gq^!%@BESR|%Il~025P0*{l{69^!6~=&s0970f0Qx0zzley>h^0OPyjlVg zA|SzapIP{62t)z5DI|4~-;%$s%A)_p_)SM~k@LHD5bzVlH@l#}kDN2Sf!v1mkyCf= zo-?*N&7XquThQNz^$#KvovwRDx0yu$H9^5gKRv-Ry=Umj(TF0clPbzoO+Vnp$xaRB32+?OuaOIN;l%3unjk|N7Hja5g+pqFO@R zn!{0?-&xS-Jr(6NqC|CB`5d_kpe588tyCWG-}8o~ehsZNna~`iw2Uv4hm>NVLw70d z(0qr-a+vUledQ@FFpGX*(PK;bXDt4NDPAgHV5QN?(Jl`50~csYnO=#*|Vyk=#5klAau%xv50RY%QC z?FjMR_3=>3k>aLYnej0xwdgi0j^K8fW#ZR=;5l3~m1kYe+Q91XrJrwLp9zb0*nWg! 
zv3kCEctK4L8t%(ud$ei|2CEI%btcKN?G6u}e$u$JW~`;PtptGgzz*JJCA{_j0OaiY Ae*gdg literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3d1f158e9e79bac193f88f94d2b548b79827778b GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJS(ac$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1h@mC2oUSy{w9@^A|VnfO^GT~Plb&+S=t|TTP*^}6v&b!gYiR3iMYNCiKMQrw$A*) zu-5b(v8KxE7fJml8ApclKST6QA^<`LQb{rZ3kW4cKD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t uf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEVGN)|KlR_NHuSUr literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJRuAc$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A2STKw42<95IbXmGq9LDE<4uke-BpVv@1u5v%nlfGtK{}wpq0xsL^P|%? 
z)It!iTE~h-=XyZ8#$1jerle+L(%BMRu2v{nOg4x8R)@Vqhqr{rWBh*%avcFxh@}I8 z${0K#2(%w(vj#>BS?oW!CwW+_r$Tc!^>TgvR;;Ehjb^W>rC%DTW!j(jqK~RLg;%D@XTAQ>+`N|aoE_9OjR;7<+T*=-d1vynh=sAc-ahFVQ5f)upxq6#H=1VDIdoGJb%!`WYZzwmjfD_T zOz_%x%S@-?hGJfr?)zht%aCjGZheNAJ2PoqML)5Z|Z&)_}IN>8nA0bGHQ|Gj%c#d!l8VIWqgpA56-?pJ(8pCJnZX$71T-V^r zYi5lnendLSYnNsYD|hui&$Uj7i{|$}4DAi)6Kk7(gYO=Zxp!r(Uvsf~aNX^=`j}TG z2Us3I6)W01Uk{tk2&{ACXQ#f8d>9rI^kjb@{l(@=VT8|^x;8XvcdP%4xKo3H4r>n* z_sbMYl$h`kKI-Vxaz}l~SeYVPkePm*?->xXJy)5t((TF8bsK9^Z^fM+kT*r6myekGFC+lnWE?|Zl-k99L zKrVzlUEE04M(U19TC#{?Ze7GMYO3s;S(vtW>BKclyCxcSGw69wh>?>xIr?WhwvG8# zpDr+lqvzF@>Q}Xw2wz&)|9YA599s}s`n>NAY_rlrwDaCwlR2uc@AG=(QOC-+T4vnl z28i0=;EUVe|Fo`=^TwlZokUEPRS`*V$MP27-j!~(d-iqM3lm<+{d3$3VjTEKub(Uu z_c|w3Dsp2IU!A{6*E5UVb?7OXcG_$&>(_6=*Gwz%tgUl<=7`W!0fBA}T*U|Fko!WU9Bg3IOcP!l(X|{UZ$nTP*@(tkeXf*yAOZY$@dBhKGZhX zA;#UBUZiJfYim0p!uHfYbzf0afKKkc7xN+_Mt%Eko=-T^v)&{w?~p-kb9LxV=jt)G zZqM^V^Rf?*J~6`I+BGOmA*SALuF)AaL3ic;?8MvF)v3w*haJd&DYJ24{bWq z$F{YFjs@9)+Hj2;OLUVD9tzkq%{1x4gca!X(>uJfJM5OGD$Wi@@x9rF+zpB64F~mA zEy+Tovqx9nj;ot{rbkbbu43OiJ0s{>{n%pejnD-P*0i^Oa%O-p!fnCFEc9>MCehHlnoVed&ole&a(MmrVnn z-EIDD7eWi{FOubC7sE6)bA;!zw5p55Q1$b31{3}3<`*QqT6cK{-tRYrYu7|yJbKq% z&UAgQ_`-#P=7Le7-U7PbkjSXai$z6fB49z zMv*)vX1~rw3sb+qVUK=s8Ft(C=)Qq4_RD7-i`o}8FjxOR;l%FyXP#^mA+7<&Q*uvV zh%_9(nPr)oU;e|d<_RX>o5q{2NOBs-$UWCr$852`bg+9#n()V2KbM|p^83){ymS6k zYJ@tajyYqGe&bJ7-CYmpMwi6A0esDo4Y8AjaeeteEc}k|jC<|O94@u73cr;(i|}*e z4Ov1zE#7M+e%Dl7@2+!TV*4ns%e1a58qXi$M_p+0sAQ^(T`h1=eb z+)W!Ut9-l2>UcpU#W61Hc(_`3(VqRLtNE>OeiIF__kF0d+u}rwt_tWH7Z+WnH2LAa zkz|*n`;?Pjdy&flZl(KsUv6(0pGBzgQ|p#b2=jRn;VZ5=6reU=JAa(n1sBF7+N%!{ zk}z&9&Cp}?mJcnv&+3JY@r@1i<*|K_+%?EDpz}C1^^n6}vxSBwn;b@>wj$~%Q zaygfl`@UeWSj(C2MrQroz#f0#yN}DoTaQz%~pbq^96>;A9x1UES&V~b#G)xLdprl-(27v_3M1lHW z62h1Y7!%_NDRq(nc33h7!mdQsBreBgTDJuO9{~UWzlNMZ0ah_spl|^2IV#gGq43Wx z`Up%j>Gy(-Kk50izDJ;Y#h&H~!b0SQV zg2@o2e!N-&K#5UOyLg3|m_%SHi4!1(l*${n52frijw3NuD){P>_#9M1+3WwWM@tKE zwCtT^#bSOsS~5UFpa2||_EgzgNMk?n;uVhL{WiqvYYO_Bg1)ApuPNx?O+jD0Y%*ny z5^C+|2bxci#LziuSIw7L1$z0KE+#6x zKQIEo1AK7;|MZWzwVfX)`wLREFSw@u3mz?W1n8T|m#5C;ho^}{Vkohezf2IQ)hGD` zMKcpv00rWJ+X8Z~8l{{h1mT2`Bsjr<6Rk>fcy#r2c6D~)`hXv@j~_^H7z9Otzrp?q Du#EMC literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + 
"filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d67b5bcf87dc622ba80e30d210bc1b9b4429fbbd GIT binary patch literal 14933 zcmeHOYgiLkw+@Ji5meA1DB=bYY@39OqT-cs6OfyLPz9`$$&idpX5!3*K(&@CUfOCE zue2&v@Jef~TCG}ZRn&T`7p#g}QB*#w^@dv89<|@M_9P&BJkN8!?>W!;mHZ%?Jv)1^ zz1I7_YwgJ z@p5UtC-t8B=<&s=EL1E3;f91H(Rh zUiYNzNoFarKGHSEA=x3v!Rx~H`w2#fw)o?VFtZTUGy? zN!4WPt}&-gmlmCm7mi68c;xD!4{QDKq4b>d0f*eTmHEaFn(t+#Fs zzj^6w;i&!JqncS?9ADS>Noi`2sg>;A;ya_?if;6sn)ozg$^78z<%5SWEU*1+@~xds zUmWXF%dBildvE6KTHldxOiliL^O$ZwZrtcn=kD%a78CPlxYzpu35wghoh#~|99}S> z*RI{e^6IN@4)L+fcbGnX#JHVB+WG1SPQ7c|&g`J*5twE$eYXEW=8bRGMIT(8w`1Dz zZN`E@R}$}J%70b&cfDFVk19&MA*VevP#bi&mMDxdZ8)D z=yreTy7LuP5AyPxes*)~?leKOBl(}_ruz7H?=(+1ryiHbX6J+*F0buQog@M$abq8U z_jp%L-c#CbTwcYh>Zv||pL#s~M8JTacVq9pu{k2%DW_+jPJdK%v;>B(*y6%|weIqp zsYPQlYLu7Pe!j|e@coEg3g5rq_IP;rQ2^i0yC~}Vh=A`NCtdrm+m#cxnlJK0g1-Fl z+~gHY%PR~uiHk?96L;i1_1(Jmn_l5>mQFZOnB05TtV7XSf9t0mQy))IvgiU{~EcM)QT65yO^Q|y_LH*k$+a6Mt z2Q#YqDZL~8wf@kkvSU1Zu5Z?3 z=8NR%YcfB+a zYt#?URZXY*Pbu$rVB_{_yO;Mb9aKn_udUcrJ#K2*Wd6vTJN7Qo_uVo7p2^kyjlk2a z@z%YILQLX?Lf_cy4er0!S3O;xAZ9K7{>LW6mGT2F?7It!m_6TIiN5(HqV2R!-ICTX z>h(tBPq%-W>UOB6msf{O-QEYW?~m3r#dKM?G5uNBcXy3E8>=3cernByE;;Ulx;rlm z9)HF=^~C7do?>F#xwFDz$38N@d(Q8tp!RpD@`II@+%w(w{5mgU&7y0x{9Z(tsS2+# zyMB9k%e1dP!-r1x`?lX6&Aw|##{a?p>^n1Pj@OJ)4`Onh_KfPeW9!yy0KPj%0{xx2~e_ISK}^XJo_=e=>7UcXJOK|y=Q_PgWzcewuP z@u+unCCsPDYt z)~KF_U&bFOry6=5ubk9j+@Rp&qgHC#$U;gCjp)>SzEsG)yRN4@MCR_VTE=$JbVwbx zwcUkdY3JIwHHweCR?6Bt`o_$jU3Ast^I^wE!>qw|Wj@~>pR_v8=fUO`yUZctt-Yfg z%bpHJF8tNV{bw%6|GGOxXZrnA!_1jgN2mX><$T5XlXo2wt7x(R&|P=(5>FQmsvcVS z>B8>Cmp%vm)5`t(ib_D}g}OZg^M$&K|5<>`s;+WBSPW!}lPa~wjVqGl7WoX3Qf z50@`;IeRK;K@|1Rh=&7Ig~b=Al|LJi*1z8%s>Uf}p7`a0gsxkDUbLqlda5#g@?`G8 zGOvYonv#-FS9QGnC)pIbxP#Y0yV~mqy(TpZ0up)!AFgKTNhvT!M~n{+38Sv}ZxM?}QuM{6otg&HMP|mXPhw zG`0IGKl9yuefZHG`&Qp?)4AKHe(QfatGF^acv`wg&Y8PApLvYvVeD`(FOIL7@ZO^n z3nN17+mf==*GIcJ8tNARnB99vRI%UIDE$pTw?L-u57f=PpocA@ys0G@^1SNPINf_(N{;m%ssGX|F@$Y?~FSdlKxi-f3vM;kkTU{ zzWr6du3y#pxE;|qg?2dG$*ZixmD6kX2J7#R@|0CNIs_P(kNC68CrbH4&)$c&$&xST zG9Pngzh9{we(nI77A%@}&p2fY4CE#T3D-I7WxkDUqNVxUFGH1IG&}o8xp0 z!RlaYhCzvDK}4x$QJ{4c8p-4KbizdIgduX6;Kk(894peSnG(Q4o)9TLO`;UZE`mX^ zMJ9q3u#IUjvLKJqWW}bWrKV>pQ37Wnl-NXdi+w0KExp^5b(|@aG(WNr1d5?ZDLh<0(n|hg2_uEDBLx#h z(gY)dwI&EK#VCS7Z6if}%funRh zfig%=6rsh)!9tAS1;l{`yr6y&5%3bh2pqBiJeXn?Pw8P?2uQOl1ha|aY48gPIO2=| zGlAj@X%Z%)S&}hJ))E-IA_dD4Rr%N!(m?Qfc!RMdUvNev1Qqb5M?3}VW|4_wXtG!d ztA_5_JIKO=6LAl`0I69p(AE~)l!98GiLn8(#IV7A$rjX#9MYO;;Dgx&FvX5Gg1M|{ zHX;ZdzRo9%G%OJU4eM(K3v8+gO>8sT^*m=04G<`a2gR+tWkV~*NIuO2v9f6aOb`H@ zt#Jh;9Vjh>0B^y4pd!seq&Sgywcj9Xz-SQ}AkKJZNe8TK02&HNOBLf0@SMrC&H_HB zz<(I%CLX-rIu4l7V>!M+i8KtMC5(h*L}~#;7&s%LMjD-t#!v=b0M}!m*xVpZZoW3`qlA}N#ANQf@eihMvV2@XmGplb#SM3~}109i>~Iq93gh?g}GgB-%oii9A7 zthIy+gWN1Fxfw@N1n}4#sJ4~utyEd*j|76&p+wq<3SsG3+m*mz3X3afI{>X&>eR?T z4iowX5V16vfJ_3i5KIBw;5oD2AWfv>K)02M1RqEo0X~^oo-QO9fMxTt5mkh#NUZ;)_A(UIj0(uzhHUjT*E(fK$7 zyo5$jFJKx3(PkUK$4CPmKo}v9ON@OP)Hg`nYpWgah-wd!Uw 
z^&ksFn$yw{=$A5+(JTPrrR7N0;{pQ$pqKKv3A%HTQ^2H`B4o4K0?5RKf*gW9g(#zm z;ff)@V7!Nhhd=-Y4)Q;U^bjl0iHVT$LFcRq#|XfJ5wu{nnPF5S4ak4t7m0SDOIogw zQW*)pfPxfA=y5v7NCr}XplG2u%EzI_Igx;*DS&<#!U`dj`A}d;T8=Cr`G{f*X`Yj0 zP>te2z9c^5fLjFuvn?4C36xe~egjHp7{lcwb5%4IX_yRi~3}tX}W5HR6fC^Nq29dQIl#(QhRl%rO zEN}%t+7gig=#Hra17r6>orUW)h!f&i28t@EB_OFo6V?TgdLt;k8q`TGj|SI@P=28d zBg`o$ZI_WeWio?F5D{GJv$)EF{_-puax$Po4-96cas}2k*dU6g!NI{6i$!hXfjd&2 zkNXGX3#h`N4iqRM7-j}+n0b}1Wdos->PHpkg`kp3S}TQNVigU&Z5s$xzmSlKegi`X z4Nx0IBO@PYCJQJYb`g-;Sd>IVZUYB!CaKmPjDiwGpap?=S|}*S=$Z&cZ;)4ATZtVb zq1b{vnMCJP@?@e2C7JaQJ)V+}<;)OKHV9WQz)k=bpk2#BeL^7Hm!vjW83pqK{^KB} za~5QvplpVS0m=l@n$sqNC&5Rw0EkGj6*4FjNP|ltL4-OS0s}g27Gam8M44F#unh%> zH5=fOShEQ914md>r3m@?g{@d_v3{8-qCm<-SPvx0ra@Zt1`&cU2eH&yc?0Pm6BhFm zKyJZ!ff~(7*-9h^ixEQ?KnL4G)$)GfgMoM!lnsl6HAEA<7UNSQ5>(aDz@m79H$sU9 zqet;jw4f=N2T)O@Q9NTbi=ct9w~3Lzt`R4F0-Os;T!pg?PA#xz%FJ^Nr3BY8dvxYkaMG+1dDuR&$dH0FksLNDg4W)T%^~ra=~= zz?YIK#1)UrV19}(f`Szq2dUX$5*lo=SHBRCe}AX^A0b=*`ritUYkd^1w%-a|a7_qg z6{MdD>fZDG=$*j*y#x2R@v-mit%7CW+e?VH@9ph-d;8uV^1M~`?R$H0gne&s-`hhC zWZ&E4=SB9ty?t-r^3(^O9@_Wz_Psqkv9j;&TQ>amz5R%5gxM!WG(Dw*7r%Nj~tDz|+aw$6NXTbc!(0{s6*$xWIn6 z;H95%?1u~NhYRe73+#srS{+@pA1<&TE|AVuzH*ihKY)yGIqzpbTwp(3U_V@7KU{zh z%-IhYv^;CDA1=Trt1|v~$MFBp9zb}meBaY4(8cq~gsw7~w~v>rCOthhd$9F+gjb1U zFn$fxYv45+|AE&ijmDyh(re;08oef3x}}MV(rA+L&1j7#0l&j_^E+JOk5MnKQ(|Dy zLC_XECKmqUFTqRtD1Cx)q5ni}Sa`^|?C=oMBDKQ((Rf^GaF`s%8>@-3Sm%{)O5LD~ zLHfYix>{Q`niTwjfbrfAV+!65ODWXGXNg+Le!V6}8jrM@f_uUY@RE5UI5#!POo=Me nWKt1u%Fu$~jFI$D2cZMhq3RHCB|MM8k7Rywm&wxLA4>lPWZ^x$ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..267e78385aa2df9afa5ad324b6090abc65a6912d GIT binary patch literal 3068 zcmeHJU2GIp6u!GGYZrVOL`@AE!v;$(w4Rmamqs6uq~w)C{zP?XK!bQ&dyBd zr)%p2Ubx#!+mwyv0QF_?+Op+i{+J=%~>8 zMYnY96?fnF)mfi?iRbJb{AJViM(yS2e|zuF+(*!2RE4(PI4!hy_Pm;mM_%hGs@}N! 
zT+gohy}f4!viskNbvSSEu)nM*aAEfQKb9CzleH($duI6Eo2O2_uy4+h z+s*|WD)t;|Us&|Rm1M;`JN6vkb-Lu1aOcgp=+`T}^Xfi2TpjC~7r*uJ#{+M-1W&#vp;ydxAStid{*#OOZlQJKfgGmzqNAk zldhaoEl!8**-E20aZqSnfu}2{dgvpvZrV%KwwY3QVSHtP7N?I)siTO6-l{6k_1CXb4y)hN%$=H<=n1 zbzD+bP*RQ!F-3KpCR&PW!}|jV65<0uE7jm9P+}TllPTeZX6+W{HA;b9QJTjh3URHe zL99B2I`Z#yK~Yu9V~)y@1vqLZG^Mqf;Nl7#4u`R$Zwx35Deu1M^|QO;^<5i^#8S> z|MTehwlh9gD8u*Nj9CZ;g8odlu5QJe%Hfmb-|Q^1WC72Dth5@CN06_2;}M3Z&(*wsh4}XswSx?4dmZUchfmL`uW!)|7_1 zw7m-Yl@_l}4#t6REnAfywreYG7(fJjpo~C=S6Q~k8W7-Xh)B&1k($=TW2=pX?LW^( zZ9Z*Ab4(47jbd%8_xTj?;P%JFsDFc^j4ttIwch50n#t|Pjl@Z;5h{{%m9Hn zNRWg%h%mb^f+SF&DotmwhftZyhNh&Zc}UR;)ehtIz*79ySESPK{RMg zuYgQ+EqUVihQ}lSIB@&^*rf@Ts86DLFIs`6{Dn);;B`PnKtWIeAV;PGnmp{Cy8RB% z#1_S$grE}L(2f7viQh&n1@Bt`&?UFNY zm$ojtt(D>tZiIZui}NNCUsxY9MV+9TgHCXD?T)g&!64fS0)JwT-C%g;Uz&yGTp_ol O8wNhkqX5vrFZl)Zqj*Ab4(K7jbd%8_svn^E-#6&YV1r(OZGmYLrk|07$*$KF#%afM*G2K!!NX zK@teeLxkA_5hQ^ERcR`XJ%q|sHZ*0L*A(fyQiaqnQA~ys@go2c9U@55-P?!tAR08L zmq8}FmOSx$!}iE;2X5aVyELH^^+{CkMJuqBzi8=cybh=cXb2JjYNX4c$-~~M+wbs9 zY)Skn2rAJH-T1Ga_-({e@V-R=-Ov)uarT>Q(TduukmD$M4AI~QvBTJe*&f?q%wv@> z#cZ3gU7oBkw#6C!DWg~Ap+j!)h3xVEO+E+F;KdQDm9R1z8a|_V3CTY17WQshMQ7Y9 zZeMhptA%6S2=!1G=S?ELus&3ZIzclBo#5))9c6lhL8cP~{=^)+!SKw#H1jLje0Etk N4E#8c0ze&q$uINqc(?!n literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/metadata.json new file mode 100644 index 000000000..a0b26f931 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + 
"sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 4.5, + "gpus": 0, + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50 GIT binary patch literal 3984 zcmc&%XH-r&>{ns3qCWSl>YEfiY;u0EYF)g*Cg|rUM zp*e<5V8I)dNqK|X03aBJnN^4|Q9#Jsks(dW9D$=Flk%gZl4It?N2SNpSR#}(B1tQf z$IE$tK9>Ya!~>Ct=d8nMv7|e&v8EDQ z|F^8Ayg`YSNBpWIVJi9KEhE&j&{Tp6BJV7g_mJ;Ou@J?i%*#xQP0pAT6O|DqN?;BN zF>~gnBxWWhYl_=|B`=Zu8X-gszsT=vGQob%EB8+bGY?0g!fVI+c`f1aNk1{`(j&s<%y90=+$>yWbCau% zbfW6H-%*LS;lz1Jhe;*#;N!>M)X?RN!RoV<9Nn7;J^z^*wy6DJ`@s}Kf3XmP`_>|} zq851M*Mu^L@=?qgXX>_%E!rI61pGohpe!P>xl1)Q?_31sdr}Jd)gdU~ih|MC^r?dv z-lD_9E)Z6Sw-8HidBNvtGjN@s7Yf%a5%kelsDB>!r=qG7bt(=Y75gm=L!V@IVq^IT z?i|@{LEjY_p>SAA9!iPC*)P1%)z{s+73SW=g~{GDzqmFkdcsP2{phL~vrNvJhJDO9YeH4G$ZHD(qT&Yy8Px02~W}VP2 zN-9xW0cKWhOYy@6lb~@w`!8k5Ih8N^q&#i5s$Ig-(*zXGG0| zG3e##BvST8Cj?H(h3Surh>Ks>qPD0P5El=q-cp1b+QabO^>c@~4c&yxAJ}8TPnRL~ z)fM#e-a-=ZUn1*vxlC>e97$;v84}TbF&M25;>Gn?!jnf00_#o;X99O|^P-}SD<_+S z!~Jh5J%_ERoNJE_3^o(o)lb2D*K)ATJ$B+qPVXy73qAsjY2<<>^Kp9S3)w{j}ReYljUdow!7~`kIkPz95O$ z6Di$}@+thasjE?xlFPT8`W;&KZ=ZPw3_1Ba-LRA5LvK z*^By$Q^_PfVQL1-r*CGM0g?}+O-d&WqKP3 z*JB6^zL8S~=d1D5w!>lwZ4x;CDCW$oR-wAl{b+sTTHd=k>UxF9w2>rB@R z=~X-{x?b?x-$;^ey_94JiJ>x%sWGz=Z~leQddj#kvhX^wc!f$&7}rQ(t4C?fM<-_ zGRCJ!ZTph(cX1En5A^jab=5kaTNqarjLs^1HITv`lscdfRPFEqkIdkA{z^QuF3;;J1@hu?=_2{{i@x{Lf`9_AxCGO%Ax&kSVf#j>}o9Z zyE%6Dsipl`qPk@p-Lk>RZM&+&e*WcMQ&vj+Z;xAW{YYffcK1ryPddEfG-`7t_1Mj-#P$FaZ;=r$zt3^gGtEAKJ zxv!(GXEzs5``UAp(G)c)3%c)}Z{}a>UJ~@ccl#LY$kVH2!4FkC9Fw}6OM)K-lulXp zhSZd)w%erU$g*)2O}JPA1#c}h(ggnScL4R5mX57}7$ zvzZ}{2N03>`!Ir^l#tA#qu6vZF(EyJ8=Dx*LJS5^)C(l-BPC@f@c!%wHu^1=>w-|gBjo2{+=o#2peDofwlk#q91{FZc8`NR((I0Oa2frdctU-DQ zM*kH&(tZb)5@VC&G6v9JFOob&@&XHay*UGr{^t7T2KZ^y4P-ZAo{f~k0-vmbgFT*SUyCzApZnQ)w=Sj|w5(CH#r9wqe(A3aL1A`?k zBW+l6R45e+#Wd#2lAl76tx(1(JQa#Kg?G)5#Y`8ZrSYW!7hwzg+j$H%5av|$Q8CMzcAW9z%Dv8Bs0cy;==H-pm&qC%-nGqH0m=YcqkP;E5DV>QoZJ<8i>F?*mG>f6%<-@j8nHu92lEDfTJ}OpU zqm(@uf0~b`-%PU^8f?8n;m#ht64@T4MtDwK5N*GR>B%@&iD_~++kYm-fk$EPlp0GW zbEY~*9;9gUDc#>x(6Gu}gYtQ&+OxdRy&;+SzYE;^X unXwtx^HNi-qten+vQ4vM)6x@Cl1=TL#@dawm8lE?-hJRC2@uHqFaB>f=8;VR literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4 GIT binary patch literal 4763 zcmc&&d036<`@f&(ec!XEsFQP&igZN#YK!TV_K<89(P>Z1NsDX^MRiKH2q6?j$Rssi 
[GIT binary patch payloads omitted. This part of the patch adds the following new
parquet fixtures under transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/,
each as "new file mode 100644" with a base85-encoded binary literal:
  band=0/segment=0/df1.parquet   (index ..c7d3d807242ca605dd7ae80e4807895ad4d48c50, literal 3984)
  band=0/segment=1/df1.parquet   (index ..c355b299a2b6bafc20dea943a5a9833b966e68b4, literal 4763)
  band=1/segment=0/df1.parquet   (index ..ad59ee31cdea4b0c0bd77befe4a698181186db33, literal 3695)
  band=1/segment=1/df1.parquet   (index ..fb2a0b13d44e39718519f4b481204d15e4dbf2ba, literal 3684)
  band=10/segment=1/df1.parquet  (index ..1a46cb40fef1eae7a5d0207565c796ee63a4a4fe, literal 4466)
  band=11/segment=0/df1.parquet  (index ..56934cab8de9fd4346c856340c7d0894b2552537, literal 4906)
  band=12/segment=0/df1.parquet  (index ..842ce2caacab4c58384e71071de8b40ef03e2e4b, literal 3138)
  band=13/segment=0/df1.parquet  (index ..84c399e67f3a1f307e88c5325c849ac627b14610, literal 3138)
  band=13/segment=1/df1.parquet  (index ..79a6f24b35cf797574a2db8d32609d38de32a0f9, literal 5244)
  band=2/segment=0/df1.parquet   (index ..e67164596565a48f5cd69702114b2db7228ee7eb, literal 4782)
  band=3/segment=1/df1.parquet   (index ..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909, literal 4341)
  band=4/segment=1/df1.parquet   (index ..f892d384dcee4c386d866900d99cf672068e5bc4, literal 4860)
  band=5/segment=0/df1.parquet   (index ..1a786300b7ea789c918d25ecc282aa7737b43bf2, literal 3554)
  band=5/segment=1/df1.parquet   (index ..bc20a7699b64f53ab5cc24074d8f61c1e997b00d, literal 4872)
  band=6/segment=0/df1.parquet   (index ..151008dc482639c3df72873480ca796f9ef31d42, literal 3553)
  band=6/segment=1/df1.parquet   (index ..b485d3882928aa5ca25ac11f280ec415a3432930, literal 4311)
  band=7/segment=0/df1.parquet   (index ..0da33db3c308003647b082db9d88177f6ae33a22, literal 3765)
  band=7/segment=1/df1.parquet   (index ..1e1b4765c9bc16d822b1bac7fb9e6d103c2763c9, literal 4158)
  band=8/segment=0/df1.parquet   (index ..7e9af93b02ee386e9ac98f1c9aea4130450e7a99, literal 3781)
  band=8/segment=1/df1.parquet   (index ..d112e179eb9bbb8372b524bcc40eea2e5cae3f76, literal 3997)]
zhhHdk&vU8V(-_@T7kRC{GVe3xt(FZ&u2p-#NW6Ekbm7UWeMgfY_I2dBR_{N)>gl8V z*G^U+`07)@c~ZBU{N^;$z-G~@nu6B#d^3-|ZncGNo5T)L*H6_Jo!nyLv{C9_cd%o- z)!ed09d*TLa_mD+?{%*~)R{LivHyBUeaZO)@-*IXj|Np&k=u5g#itueFO@LpoW{Lh zsD_|hHV&dIZ7!G5=;)J&K9|#cHRw(ie6yvOk2IQI`4heTO7$D)sf*u6mmDtfstrlw zS)bl`YtUtxGN};LkR-RE6oVTZ8OIQ&utXvzGBJr078AzM51MPFa75RQMLou{ z{Nb_&H5+^LnZ1)x$c0m1Qz!DMHn0#8h*&gv`k8r`wA|B7?;uaKwOYjFpbbNw-ae&` zhLEQw>%SvUdatQeOjulal7RDkRr#U1zV&b7K7qT_RL)jf;_t#|zF(KP?D&tLv)fdB%RE$O#ydVoh zI=%@LUk&`9#e#^GNF|>CV|M%xQPuy;^OgNmFG!Xo&b`$A^^K3U}@084p&dflErYKKlQ1~-jwVtwP_R+y(XO9W?U9ddVWm2@ipHBp% zkM*YW`&Se_sN*@QgV0IPqKT(oGP)uDi2=TjaiOl^R1o8@hNsJ+#eZ<`Gk-p__h1H1 z>FU%q`9#xailiW4N2R~tY-NC-x^x zHcB4EpX09XH{EPn23t>2Gnl1o4AX=70GCNo!DCm_JsHOs(M?Wf`cJ2*xfJ?Lq4uOR zr>mp&{*)qFb5H7I886y~Qg>40q%Kl(t}fj_)@3=3qpp+jr{mME%++WtL2MEql(1}h tSkkDF`1nyl2?@$%lNDhJiIK`UlQH8*j~Q(*@st9*{)1x#fH(ca{0|mk!UF&R literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f3f7d2a7d0cc4ecb1608dee4e76b94cde19d2b83 GIT binary patch literal 4018 zcmc&%d011&7N41WZ*r4>M9obTKtL%0abXkOP)XPk6~PLq6vHAQTS{28?obpJ72E(p zR*_9m5LyM5T?My-&jq#0vseLN>-wI1efK8C>a+dhec$_D&zH>1nK|eD&N(yRmtQ=j zo)#2N*-;Me6iI;*02r)Z78|!}x1Rp2qp7})1!Ygd(-hlXZ{B>jHe|0&-nH;)HWZ)) zlxRG~rT7$!V(Hod6LQE#L=IU20JxtcW`qdq1Cnm2OW|T`z~YNVoB(-LkbkIL8A>4` zSJ(oDolp`gA^$9tcs!jWpb(%3K0N_I>mHPyJDmfQvk8s?4g{JUHV(8jNn1W`!N_p+ z#JCA5?cXyNkwYgU68p_YNLTUcE`7)|QB{F3EX4^(dhz#Vm}mw?2gXGPN5%RF$z$c3 zVe}#_$UiVTA}%sYRoELSd;}%uG07vO`(I%GVYy90s#EAreNU(=&gVVLBb;{31u<0% zP;F>6o?7-RcwRLLo|=%!Bgf1J2S*>omulyM&;5*WkD_O=-t`=~J|!J5_wVMJnq9=U zxsC-ESMPyH{2V&D=NNoxd>&aw53*NuT?FkLZo$ri2hj2}1uW~M0r*h zsA_rz#{hRUt<4;kxoX1sEhoT9NepaV-;FlZV_^G=#i-lM5&UfP6p0QwV|mRQz~-no ztUb^U?xgGi?^0v1(PrzA?UboR*iIeR;OQD@5!Q(3CbL*xYiDsY6*}Aqz1u`i%R?ZT z0ExU;Zb)Mi!s4g~;B5E;bXfLr`v)cy0-y<|tZoD^%3Xn`uOo=in}8mESr0aMHey%2 zW`dOFW$0-G0WZ%v18i-L@H+jcpjHr%Dw7?6p2KW#;?gKgu>CbON;(R(9AAUK`&R+X z)*Oqb>;M^;f*)VHoAoMP34Ur%1p7``6WK+JQEsgTY%s3JtV-TuOU~cNW;Tvy8z#r2 zYO`zXwSG246Z;n;!rT`-3zT45Rv=h6;6jX8zY>f;c7iqh8iY^(ML#xqF)*Js1Jhoq z0enYhao`E&{KAS#!R&A!SQ38==}E@0{l#B$-&_=9QmfT?d9)u&c$J1o)@o_&d)Etxn`A)?K{|`;{eX$OWZ6M3NM#Fo} zd}479j}tR-EUQ+l65YIQgz|)Tgo1YjrMI>5z0+hwgs2c0jbFizC*nc%yS-@fs%G{^ zErO+EodOf8SHg=alQjAFO+ms`Pu5p=ouE(d9_ar~HrU>y50pwFSKnhaX_mf&%WG{w z%NIR{y>64q&wscNL%HL*=WA@(3)436BYDTL+I!>Ri&K%f_|h%lF(Vl)7|6iR{a6pX z}cc7E$ zU6|>q7Czs~fwY`|1MBQDV=1xogq@oae&`a8nLpjF-BU1wvstkbwqx~d(|NzZH4kop z{kxs8QTo3ER@6iGhB2o>Wq1ravST`|-^JyI3oRK_Uj#qs{XYPFGDZ z3aoL54P)QJtu33$EelFfd}}!C4xx<#xQ5t;v<53jo%a-0>SfnW8&3ZSb-< z5p^btd0Vt?32uH0+hC{Pu$I1MkSC~ziJD>Ha=~3}FH2uDS)dK~#O}gYANT=wDl=H) z>O6tn(jQp`8nwIw_p-qFvdO$ATU}tqT?hvrKS7bNcpzb>rf~Ic3idpIfZlz%ku&GE z2KM-j0*>mD>X*R+Dh`_3>oJ5#Ho&JUZ{RUrW0tz7G80#@-`4~yv?WDse=s%6Dzv-QxAow*?8%C|XS3`}w(p<5;z@r`YsrqhS?du; z+o3eAz&>Tf*hOun=|#?IV_f$+lH;)*R^AW#U!7q@$+zT!wjIXO8R9 zgBP-=C~aZ%&aTTjUiuNbaorGlwCR;{8Xc~E zLrwXE77HpnlZV2Of@)>VNncw3RdC~Vaix`a6~XeLuk&?OA7l&3;XpY^`j&gRn+Xm z=k|xh!j7_i&;7_?1PF)+K!R-t9y0HfmVWibJm-&ipNO$miHIDsVOZ3syR^}7To9?a z;NaLlBT-uKDOE&pR7k9v9}Ys{Ae3A;k*qhNY1F6T{{-}psuibN0})FZ7#bWYr*N7^ z8lex^sFAMwp^?EXKd%6}1M{l7e4$zS!7gsH|tu7nh(8VLXp>xwj_o}Tl(r>P&7 zC?7Xt#9AtoN~QDZFC(+1(gdk2MCv4!hDhy0RC_YoU=Fjgmr7lknbwD9T2n2ROavy7 zbl%tF%*>|H3&*)GjBcO6jE>Aehn7m+nL*~usH$_RIirUMSkAQw@Lm`f*gEt#{A{FRHmt)qgRL!@%XUjLyPZ_?)k+RjPAw^ z>e5xItMdt`_hhkhZ|i7ZpSjVCd{m{=c=Ofuxh?oOAG%qze2@>*N|_?ad0{MbAa!+T 
z>{UhyL-_MuRsE)$P0LX0rBVlG=^VlIK(WYa>hb`~6?9J~F-CNg6PW(fDXK4p-jk_3 z>CEZs=yPAGEJ1xws$>~2+D5ACq}oYUqFEz=f88vc-tXauN0lK?s72p#_5j#fpN80*V`F zuvS0?>#B+y(dVdEtJYezxZ&x$H;C3}d(M03yz}~<+2Q3U1ysi7;p<8Z=bKfX zyK(Fd4YYvnZbR$ShBSxfn7e|6>e30Rt|$PYE5VpiAK|!A<1?15=yy)6`)9QasVLLY1d44yAf8RZ8&7>=utbr&xuKjM+-MJuB+ z6fr5PsnK$kTrYuHBt%b*QYL38QnU!yV1issoJ)w8$ z1fJ<*u;_6Dgzx<)TzaX~4o*KfeRZ^6b#>WyxL!oh(1Dsgn4Sf{0@NCRG zbasawEHf9Pe)U#BJhBH$=@0I?1YFo*1-!l%U}EYH=MxQKNw_N{E%1cVt%)!xs~7M} zOyEjj4XoGCgr4?$(3u)@WZAG4Oc(foWy2Xrt#yW!-vm&vKM9t{n#015eK4vt1|qii zf#%5^Sl7B6-O)jyIjsxVj7*@Zu{RLqBf&w_2Km005V)ffL|Yx90+T?wnFi*(-vRev zIJ9`v*qyJ3LD3jf5bgAa{9k@Ti5W()i?9MkJ`a}JkZ^SGUa*wx^WRDxCdDQKVN1(px&;Zo!>xb-dtvQ!~Zn!g_2Pd^4Ly*I-; zzY%a}6^7b#?!y3e59m|l1W%vMg^gGGf>nw!dS3Aob&tM>R1yTKDVE^1BoM}R6e3Oh zZer1vckpV5GdgHns!R3phmE~A5)s|kfz@PdLbm=nY8_lm-1k3%`r9|-i+%fnW6M2I z5T~$)lKn_!%tO(K+{mo+&2TjDI{KmV5pwF5img>f!^S@u^??9v z==Tn2i?1FU+f)w?;xu$5_X3u80EcV#4(PFK8z$WDjjwCugJ#1~xTP+E_lwf--u5|| z+u$K&!ZI_?FUR&E$Am+~+I$X&luX9@j@Uy+9SodLgWP>xc23F+78=18>XM zaMJ0n&Vz@8NK?>*!Ltv+>#e?^7d8r#d)i`8rW}C69}eM{$Bh7WV=UHIN1}_PkHfM( z!Kk)Z8)!_YV>S6Bq31|{s6X2Sx1P5b3`VuX?WsEecXPm%v^!84YT~G1Q> zxzO)YcXIXmiP&0=BdY6r0N--*J(_g-K0e}5Z?0wjbZlFJ1KG4ad9B;6qW?MC`k?37DxZBJ2FU(Ta36 z-fi+#Jj>t~QY75Kru|lcJ&MgCYDdcub>9t3HTVv7Qz$WzS3~8uu|x;mOn$p}G%4Sm zY*@9e#?(J46m6P*9qTFX!(E#oK^@H^Tsm+zu~j)4%Y5aHF1Ft^dNTJoui$(HZzp=J zx7_R~*Du?Ezv}&u$VgI#98HTkIw6lJ#Z8JgyQPlWI<7aWKjnZ;T%*fNv+Kvv7}Q|@ zxM_{e8EI;is=EtYG36l@vebi27FL0^O%``LIURPtUx7`W)yOR{AUT>BZ-~s@Gf=bI zSa`uMa8+(b?-D8=enA`!12Eema28%U?n0!R7ej{(7kK zLMV2@Aj-smSA|vHbHa39oPn4(&FJ-=Bm%3RCAvA{46#&ZN$RdN$0IwUF;oyl#XaZ` zkM1|>95Fr)hh%HGYI$VO&BOY_z`Ku0lYzyklIw(-nJvM&1y8_dO)m7$-^K}0O`=lU zvU##!hUk9ZCr6*0#|M3!z)x;#;l2m~GQwjh{HjrM&o8|Ub%Rb~4aMj2T_Y?EtQW5& zoW@?mtNQxj%^mv;Tp^(cX`K05JEl*s(zT&0 z_@BO&3APBFIa3gIPF1e?Q|#g;zHBwMZU)UUW zWaHx1gHrFDDD&J@R6Z>0_XlCS=8x(1RSuBG&T3h2&-sSdK^pZ~OYbEXqn0@rE-Um~ zVYlV*a^H*Mz;d@a80~jS5>y>f?or}*xqooYq=vb)e~V3cZSsw#68|f9Q)DyWyd`Z) zcgeqBAR1O+Q?@7O^r|7VIJT=a3758p)fL#T-lzD_v#;*Vu9t0-fon70WHUEq19fX_ zm{B%AceUpx)FEqn$beWfxD!N?7DVy59v!fn?w~fNl^b;sKQ{+v13y3Tik)2tNUA?o zChB-vFuHu7;!$gD`YZh@ZZp~*T;7{$Fm=S7_SY}kUKvEV=ixkobC#jpBVSkKs(Ni0 zIkM2CU$7?2D9UTG*^ueaUmHdH6nk6xu5g}V9OG9qAZ(xNjd5&1*}#Mr%?y*c!17@; zUOs8#q~IDheUBXBtzahYv=>G>!e3`H z8F%WU9P_71Ov1*3TB97ov>oM$vL(m&seA>or}xC3ZL!M+P>|V@(X+-Yv~ef54`sTu zFR5MhPn2zSJ?+ISL4^DMX*|7bt2Czk!LJ5PF3jkW4H{9l`R^%AxySEhFhPjsdR~W0 zA!}f296d{=462@8=K8q(V~v-UEr-NQ2bj}XC!6&7oPF^W2`MbIj7cYw6Vg@OnB*9i znK3l1f-l(BS5Ran-qA}um&;PkKYl{~lxxXouK-#{OSB|Z9Un+Uv|=Qe0y*MQZc!g7 zp(Tb4jvSHcfm#2);CwA#mWh6dl_gC!F;3L;ej!B7Abu#DMJ^!Zq({ZaDC9K3@N8@B4}9I)JYc(tJHxoa1rvVWc$(8`{5kz) z?LJ)zjnSL{H`bdFYOjZ-^ca(&>zb^)Gx06h_}aw(4i*IZB6Cr||N5VazlqHMU!LN> z?NJhJw-xA%DZ$fzrT`*yAzvC4G$v$d=aVI@AnaLjk$Om_(y`2&6@RHTQ|b{X^^!{C zq@HowGY`gKG`sSYN`2Xx*>}y%PP5QjcvbxXw z)5AktQlh=%q;l3@o1THgNLbfCXuefx)Wf#YBQ@GPSj8%&z5%Sg)+n4`II#56gR?LSju=cO=b9$HVPat0lv50iRicHWb=Sk{ZNk!m~H>7)hO zd9H;Xrtr#O@@R3g{!DzvmA#ryOAuP5%G0K0#HeheQd4c^X=%z#i|H|G=?Tgd3rANw VM>_{mfH}a&7c>w6GUgYKe*tj26RQ9K literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", 
+ "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py new file mode 100644 index 000000000..294c86f25 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = SparkTransformLauncher(ClusterAnalysisSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py new file mode 100644 index 000000000..919857e23 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py @@ -0,0 +1,58 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = SparkTransformLauncher(DataCleaningSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py new file mode 100644 index 000000000..6d93dc7a9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +class TestSparkSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = SparkTransformLauncher(SignatureCalculationSparkTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures From 77d85fde33e2905a19a5195adf64fecc5d88be9b Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 16:13:42 -0400 Subject: [PATCH 044/105] Adjust to file naming changes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 27d101bb8..1265e8ee3 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -22,10 +22,7 @@ COPY --chown=ray:users images/ images/ RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/fdedup_transform_ray.py . - -# copy some of the samples in -COPY src/fdedup_local_ray.py local/ +COPY ./src/fuzzy_dedup_ray.py . # copy test COPY test/ test/ From 310d8139ca2bd52afd2e987fc52c6b530d4c2888 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 18:03:34 -0400 Subject: [PATCH 045/105] Create python Dockerfile Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 43 +++++++++++++++++++ .../universal/fdedup/python/requirements.txt | 10 +++++ 2 files changed, 53 insertions(+) create mode 100644 transforms/universal/fdedup/python/Dockerfile create mode 100644 transforms/universal/fdedup/python/requirements.txt diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile new file mode 100644 index 000000000..a0a557060 --- /dev/null +++ b/transforms/universal/fdedup/python/Dockerfile @@ -0,0 +1,43 @@ +FROM docker.io/python:3.10.14-slim-bullseye + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest + +# Create a user and use it to run the transform +RUN useradd -ms /bin/bash dpk +USER dpk +WORKDIR /home/dpk + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . + +COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt + +RUN pip install --no-cache-dir -e . + +# copy source data +COPY src/ src/ + +# copy source data +COPY ./src/signature_calc_transform_python.py fdedup_transform_python.py +COPY ./src/signature_calc_local_python.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/dpk + +# Put these at the end since they seem to upset the docker cache. 
+ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..85806f809 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +pyarrow==16.1.0 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars>=1.6.0 +disjoint-set>=0.8.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 +scipy>=1.12.0, <2.0.0 From 7d97cef7c741a703b27b2d5f17467b998bc2794b Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sat, 19 Oct 2024 14:51:24 -0400 Subject: [PATCH 046/105] Ray bug fixes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 10 +++++++--- .../fdedup/ray/src/cluster_analysis_local_ray.py | 2 +- .../fdedup/ray/src/signature_calc_transform_ray.py | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 1265e8ee3..ec2c56f28 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -2,6 +2,8 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 FROM ${BASE_IMAGE} +USER ray + RUN pip install --upgrade --no-cache-dir pip # install pytest @@ -13,16 +15,18 @@ COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . +COPY --chown=ray:users python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . # Install ray project source COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md -COPY --chown=ray:users images/ images/ RUN pip install --no-cache-dir -e . -# copy the main() entry point to the image -COPY ./src/fuzzy_dedup_ray.py . 
+# copy source files needed by test-image +COPY ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py index 25b96788d..c078746ce 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -19,7 +19,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py index bc3c0d991..678d953f2 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py @@ -14,6 +14,7 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) +from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher from signature_calc_transform import SignatureCalculationTransformConfiguration From 87902ac1f8aa4eff51e127df76ea44fd86a632e0 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sat, 19 Oct 2024 17:03:09 -0400 Subject: [PATCH 047/105] Fix spark image to support testing Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile index 523b94c06..a36a7cef7 100644 --- a/transforms/universal/fdedup/spark/Dockerfile +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -36,7 +36,8 @@ RUN pip install --no-cache-dir -e . COPY ./src/signature_calc_spark.py . # copy some of the samples in -# COPY src/filter_local_spark.py local/ +COPY src/signature_calc_transform_spark.py fdedup_transform_spark.py +COPY src/signature_calc_spark.py local/fdedup_local_spark.py # copy test COPY test/ test/ @@ -46,6 +47,7 @@ USER spark # Set environment ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} +ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH} # Put these at the end since they seem to upset the docker cache. 
ARG BUILD_DATE From c84792452619fc57ce2ebeee8f872ef2b67deb82 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 07:57:22 -0400 Subject: [PATCH 048/105] Removed file copy utils Signed-off-by: Constantin M Adam --- .../fdedup/python/src/file_copy_util.py | 158 ----------- .../fdedup/spark/src/file_copy_util_spark.py | 261 ------------------ 2 files changed, 419 deletions(-) delete mode 100644 transforms/universal/fdedup/python/src/file_copy_util.py delete mode 100644 transforms/universal/fdedup/spark/src/file_copy_util_spark.py diff --git a/transforms/universal/fdedup/python/src/file_copy_util.py b/transforms/universal/fdedup/python/src/file_copy_util.py deleted file mode 100644 index 87867e532..000000000 --- a/transforms/universal/fdedup/python/src/file_copy_util.py +++ /dev/null @@ -1,158 +0,0 @@ -import argparse -import io -import os -import re - -import polars as pl -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.utils import ParamsUtils, get_logger - - -""" -This class reads all the parquet files inside an `input_folder` of the type -`.../bands/band=b/segment=s`, concatenates those files, and writes them into a -file called `.../consolidated_bands/band_b_segment_s.parquet` -""" - - -class FileCopyUtil: - def __init__( - self, - data_access_factory: DataAccessFactoryBase, - config: dict, - stats: dict, - ): - self.data_access_factory = data_access_factory - self.root_folder = config.get("root_folder") - self.logger = get_logger(__name__, level="INFO") - - def copy_data(self, subfolder_name: str, data_type: str): - self.logger.info(f"copy_data(): subfolder_name = {subfolder_name}, data_type = {data_type}") - if self.data_access_factory.s3_config is not None: - _, root_folder = self.root_folder.split("://") - else: - root_folder = self.root_folder - self.logger.debug(f"copy_data(): root_folder = {root_folder}") - if data_type == "bands": - match = re.match(r"^band=(\d+)/segment=(\d+)$", subfolder_name) - if match: - band = int(match.group(1)) - segment = int(match.group(2)) - else: - raise ValueError(f"Wrong subfolder_name {subfolder_name}, should be band=b/segment=s") - input_folder = os.path.join( - root_folder, - "bands", - f"band={band}", - f"segment={segment}/", - ) - output_path = os.path.join( - root_folder, - "bands_consolidated", - f"band_{band}_segment_{segment}.parquet", - ) - elif data_type == "docs_to_remove": - input_folder = os.path.join( - root_folder, - f"{subfolder_name}/", - ) - output_path = os.path.join( - root_folder, - "docs_to_remove_consolidated", - f"docs_to_remove_consolidated.parquet", - ) - self.logger.debug(f"copy_data(): input_folder = {input_folder}, output_path = {output_path}") - - data_access = self.data_access_factory.create_data_access() - self.logger.debug(f"copy_data(): getting the data from the input_folder {input_folder}") - file_dict, status = data_access.get_folder_files( - input_folder, - extensions=[".parquet"], - return_data=True, - ) - self.logger.info(f"Found {len(file_dict)} files in input folder {input_folder}") - consolidated_df = pl.DataFrame() - for fname, contents in file_dict.items(): - df = pl.read_parquet(io.BytesIO(contents)) - # self.logger.info(f"{fname} has {len(df)} rows") - consolidated_df = consolidated_df.vstack(df) - if "docs_to_remove" in consolidated_df.columns: - consolidated_df = consolidated_df.select("docs_to_remove").unique() - output_table = consolidated_df.to_arrow() - self.logger.info( - f"Writing to {output_path} table with 
{output_table.num_rows} rows and {output_table.nbytes:,d} bytes" - ) - stats = { - "input_files": len(file_dict), - "input_bytes": sum(len(v) for v in file_dict.values()), - "input_rows": output_table.num_rows, - "output_files": 1, - "output_bytes": output_table.nbytes, - "output_rows": output_table.num_rows, - } - data_access.save_table(output_path, output_table) - return stats - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--root_folder", - type=str, - default=os.getenv("HOME", os.path.join(os.sep)), - help="root folder", - ) - parser.add_argument( - "--subfolder_name", - type=str, - default=os.path.join("band=0", "segment=0"), - help="subfolder name", - ) - parser.add_argument( - "--data_type", - type=str, - default="docs_to_remove", - help="Processing either bands or docs_to_remove", - ) - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - args = parser.parse_args() - root_folder = args.root_folder - config = {"root_folder": args.root_folder} - input_folder = args.root_folder - output_folder = args.root_folder - data_type = args.data_type - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - if args.use_s3: - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - local_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - stats = {} - fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) - fcu.copy_data(args.subfolder_name, args.data_type) diff --git a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py deleted file mode 100644 index 58a43a736..000000000 --- a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py +++ /dev/null @@ -1,261 +0,0 @@ -import argparse -import os -import socket -import time -import traceback -from datetime import datetime - -import polars as pl -import yaml -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.utils import ParamsUtils, get_logger -from file_copy_util import FileCopyUtil -from pyspark.sql import SparkSession - - -logger = get_logger(__name__) - - -class FileCopySpark: - def __init__(self, root_folder: str, num_bands: int, num_segments: int, use_s3: bool): - self.root_folder = root_folder - self.num_bands = num_bands - self.num_segments = num_segments - self.use_s3 = use_s3 - self.subdirs = [f"band={b}/segment={s}" for b in range(num_bands) for s in range(num_segments)] - - def _init_spark(self, app_name: str = "copy-app") -> SparkSession: - server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1")) - if server_port_https == -1: - # we are running locally - spark_config = {"spark.driver.host": "127.0.0.1"} - return 
SparkSession.builder.appName(app_name).config(map=spark_config).getOrCreate() - else: - # we are running in Kubernetes, use spark_profile.yml and - # environment variables for configuration - - server_port = os.environ["KUBERNETES_SERVICE_PORT"] - master_url = f"k8s://https://kubernetes.default:{server_port}" - - # Read Spark configuration profile - config_filepath = os.path.abspath( - os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml") - ) - with open(config_filepath, "r") as config_fp: - spark_config = yaml.safe_load(os.path.expandvars(config_fp.read())) - spark_config["spark.submit.deployMode"] = "client" - - # configure the executor pods from template - executor_pod_template_file = os.path.join( - os.getenv("SPARK_HOME"), - "work-dir", - "src", - "templates", - "spark-executor-pod-template.yml", - ) - spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file - spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always" - - # Pass the driver IP address to the workers for callback - myservice_url = socket.gethostbyname(socket.gethostname()) - spark_config["spark.driver.host"] = myservice_url - spark_config["spark.driver.bindAddress"] = "0.0.0.0" - - spark_config["spark.decommission.enabled"] = True - logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}") - app_name = spark_config.get("spark.app.name", "my-spark-app") - return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate() - - def create_data_access_factory(self, root_folder: str, use_s3: bool) -> DataAccessFactoryBase: - input_folder = root_folder - output_folder = root_folder - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - if use_s3: - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - local_config = { - "input_folder": root_folder, - "output_folder": os.path.join(root_folder, "bands_consolidated"), - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - - return data_access_factory - - def orchestrate( - self, runtime_config: dict, execution_config: dict, data_access_factory: DataAccessFactoryBase, data_type: str - ) -> int: - """ - orchestrator for transformer execution - :param execution_config: orchestrator configuration - :param data_access_factory: data access factory - :param runtime_config: transformer runtime configuration - :return: 0 - success or 1 - failure - """ - start_time = time.time() - start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - logger.info(f"orchestrator started at {start_ts}") - data_access = data_access_factory.create_data_access() - # initialize Spark - spark_session = self._init_spark() - sc = spark_session.sparkContext - transform_config = sc.broadcast(runtime_config) - daf = sc.broadcast(data_access_factory) - data_type = data_type - 
print("data_type") - print(data_type) - - def process_partition(iterator): - """ - process partitions - :param iterator: iterator of records - :return: - """ - # local statistics dictionary - stats = {} - # create file processor - file_processor = FileCopyUtil( - data_access_factory=daf.value, - config=transform_config.value, - stats=stats, - ) - for f in iterator: - stats = file_processor.copy_data(subfolder_name=f[0], data_type=data_type) - # return partition's statistics - return list(stats.items()) - - num_partitions = 0 - try: - if data_type == "bands": - # Get files to process - files = [ - f"band={band}/segment={segment}" - for band in range(self.num_bands) - for segment in range(self.num_segments) - ] - elif data_type == "docs_to_remove": - files = ["docs_to_remove"] - print(data_type) - - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}") - # process data - logger.debug("Begin processing files") - source_rdd = sc.parallelize(files, execution_config.get("parallelization")) - num_partitions = source_rdd.getNumPartitions() - logger.info(f"Parallelizing execution. Using {num_partitions} partitions") - stats_rdd = source_rdd.zipWithIndex().mapPartitions(process_partition) - # build overall statistics - stats = dict(stats_rdd.reduceByKey(lambda a, b: a + b).collect()) - return_code = 0 - status = "success" - except Exception as e: - # process execution exception - logger.error(f"Exception during execution {e}: {traceback.print_exc()}") - return_code = 1 - status = "failure" - stats = {} - try: - # build and save metadata - logger.debug("Building job metadata") - input_params = runtime_config - # input_params = runtime_config.get_transform_metadata() | execution_config.get_input_params() - metadata = { - "job details": { - "start_time": start_ts, - "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "status": status, - }, - "job_input_params": input_params | data_access_factory.get_input_params(), - "execution_stats": { - "num partitions": num_partitions, - "execution time, min": (time.time() - start_time) / 60, - }, - "job_output_stats": stats, - } - logger.debug(f"Saving job metadata: {metadata}.") - - if data_access_factory.s3_config is not None: - _, root_folder = self.root_folder.split("://") - in_path = os.path.join(root_folder, "bands") - out_path = os.path.join(root_folder, "bands_consolidated") - data_access.input_folder = f"{in_path}{os.sep}" - data_access.output_folder = f"{out_path}{os.sep}" - else: - data_access.input_folder = os.path.join(self.root_folder, "bands") - data_access.output_folder = os.path.join(self.root_folder, "bands_consolidated") - data_access.save_job_metadata(metadata) - logger.debug("Saved job metadata.") - return return_code - except Exception as e: - logger.error(f"Exception during execution {e}: {traceback.print_exc()}") - return 1 - finally: - # stop spark context at the end. 
Required for running multiple tests - spark_session.stop() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--root_folder", - type=str, - default="/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_second/", - help="root folder", - ) - parser.add_argument( - "--num_bands", - type=int, - default=14, - help="number of bands", - ) - parser.add_argument( - "--num_segments", - type=int, - default=2, - help="number of segments", - ) - parser.add_argument( - "--data_type", - type=str, - default="docs_to_remove", - help="bands or doc2remove", - ) - parser.add_argument( - "--parallelization", - type=int, - default=-1, - help="spark parallelization", - ) - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - args = parser.parse_args() - fcs = FileCopySpark(args.root_folder, args.num_bands, args.num_segments, args.use_s3) - data_access_factory = fcs.create_data_access_factory(args.root_folder, args.use_s3) - app_config = {"root_folder": args.root_folder} - execution_config = {"parallelization": args.parallelization} if args.parallelization > 0 else {} - status = fcs.orchestrate(app_config, execution_config, data_access_factory, args.data_type) - print(f"Orchestrate concluded with status {status}") From ba9b07ca0a9a4821df0f38cf488db4fc8db7408e Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:00:13 -0400 Subject: [PATCH 049/105] Add fdedup to kfp black list until we get kfp integration Signed-off-by: Constantin M Adam --- scripts/check-workflows.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh index d1f934368..d431f6fbd 100755 --- a/scripts/check-workflows.sh +++ b/scripts/check-workflows.sh @@ -17,7 +17,7 @@ if [ ! 
-d transforms ]; then echo Please run this script from the top of the repository exit 1 fi -KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering" +KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering fdedup" while [ $# -ne 0 ]; do case $1 in -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0; From f1879487bc4106f1b776ed6529e8e706096c4bc9 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:05:15 -0400 Subject: [PATCH 050/105] Freeze polars version to 1.9.0 for now Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/pyproject.toml | 4 ++-- .../universal/fdedup/python/requirements.txt | 2 +- .../universal/fdedup/spark/requirements.txt | 20 +++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index fa815441c..f46c8e8c4 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -15,9 +15,9 @@ dependencies = [ "pyyaml>=6.0.2", "boto3>=1.34.69", "kubernetes>=30.1.0", - "polars>=1.6.0", + "polars==1.9.0", "disjoint-set>=0.8.0", - "scipy>=1.14.1", + "scipy>=1.14.1, <2.0.0", "numpy<1.29.0", "sentencepiece>=0.2.0", "mmh3>=4.1.0", diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt index 85806f809..576c028a8 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/python/requirements.txt @@ -2,7 +2,7 @@ pyarrow==16.1.0 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 -polars>=1.6.0 +polars==1.9.0 disjoint-set>=0.8.0 numpy<1.29.0 sentencepiece>=0.2.0 diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index 10f3e129b..576c028a8 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,10 +1,10 @@ -pyarrow -pyyaml -boto3 -kubernetes -polars -disjoint-set -scipy -numpy -sentencepiece -mmh3 +pyarrow==16.1.0 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 +scipy>=1.12.0, <2.0.0 From 84b9104a7791661d368345d3c5b8e8cd02a67a19 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:08:47 -0400 Subject: [PATCH 051/105] Fixed duplicate_list_location bug Signed-off-by: Constantin M Adam --- .../python/src/data_cleaning_transform_python.py | 15 +++++++++++---- .../fdedup/ray/src/data_cleaning_transform_ray.py | 8 +++++--- .../spark/src/data_cleaning_transform_spark.py | 15 +++++++++++---- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index e5c1e5025..9c60ecbba 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -13,7 +13,11 @@ import os from typing import Any -from data_cleaning_transform import DataCleaningTransformConfiguration +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + duplicate_list_location_default, + duplicate_list_location_key, +) from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime.pure_python 
import PythonTransformLauncher from data_processing.runtime.pure_python.runtime_configuration import ( @@ -53,9 +57,12 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) - ) + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index e83960c24..5ed2cecbe 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -91,9 +91,11 @@ def get_transform_config( """ data_access = data_access_factory.create_data_access() duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", duplicate_list_location) - ) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 29890d05f..56c10d801 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -13,7 +13,11 @@ import os from typing import Any -from data_cleaning_transform import DataCleaningTransformConfiguration +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + duplicate_list_location_default, + duplicate_list_location_key, +) from data_processing.data_access import DataAccessFactoryBase from data_processing.transform import TransformStatistics from data_processing.utils import get_logger @@ -53,9 +57,12 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) - ) + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if 
duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) From 08ff0069f00d0a84c8ef6cd3e2f55eefc098b2fb Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:10:01 -0400 Subject: [PATCH 052/105] Allow input of s3 credentials on command line Signed-off-by: Constantin M Adam --- .../fdedup/python/src/fuzzy_dedup_python.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index acb1be3bb..054447e70 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -1,4 +1,5 @@ import argparse +import ast import os import sys @@ -119,8 +120,17 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: "output_folder": output_folder, } if in_args.use_s3: - sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + if in_args.s3_cred is not None: + s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) + sys_argv.append("--data_s3_cred") + sys_argv.append(s3_cred_ast) + elif ( + s3_creds.get("access_key") is not None + and s3_creds.get("secret_key") is not None + and s3_creds.get("url") is not None + ): + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) sys_argv.append("--data_s3_config") else: sys_argv.append("--data_local_config") @@ -207,6 +217,13 @@ def parse_args() -> argparse.Namespace: help="use s3", ) + parser.add_argument( + "--s3_cred", + type=ast.literal_eval, + default=None, + help="ast string of options for s3 credentials", + ) + return parser.parse_args() From d0c6f8a72efe75ccbfce0d89fd56b9b06dac4cb1 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:53:10 -0400 Subject: [PATCH 053/105] Added license Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/src/Murmur_MH.py | 13 +++++++++++++ .../fdedup/python/src/fuzzy_dedup_python.py | 12 ++++++++++++ .../universal/fdedup/ray/src/fuzzy_dedup_ray.py | 12 ++++++++++++ .../universal/fdedup/spark/src/fuzzy_dedup_spark.py | 12 ++++++++++++ 4 files changed, 49 insertions(+) diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py index e3442ba02..03d5047ea 100644 --- a/transforms/universal/fdedup/python/src/Murmur_MH.py +++ b/transforms/universal/fdedup/python/src/Murmur_MH.py @@ -1,3 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + + import logging import os from typing import List, Set diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index 054447e70..bdd78c7da 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import argparse import ast import os diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py index 0b9be33ca..0d4c2954f 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import argparse import os import sys diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py index 5217f2f7b..58688de42 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import argparse import os import sys From 63e11eb729a85f3a1cf349b21e19a680f300ec10 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 11:49:22 -0400 Subject: [PATCH 054/105] Use str2bool for use_s3 argument Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/fuzzy_dedup_python.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index bdd78c7da..7135054d2 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -24,7 +24,7 @@ ) from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils, get_logger +from data_processing.utils import ParamsUtils, get_logger, str2bool from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) @@ -159,6 +159,10 @@ def execute_service(self, service_short_name: str, params: list) -> int: launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) elif service_short_name == "fdclean": launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + else: + err_msg = f"Unknown service {service_short_name} specified. Must be one of {SERVICE_DICT.values()}" + self.logger.error(err_msg) + raise ValueError(err_msg) status = launcher.launch() return status @@ -225,7 +229,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--use_s3", - action="store_true", + type=lambda x: bool(str2bool(x)), + default=False, help="use s3", ) From bf550fde9ad3d1d9e8f7bd0f7f75b25df12d24a2 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Tue, 29 Oct 2024 19:22:55 -0400 Subject: [PATCH 055/105] Add overwrite output path argument Signed-off-by: Constantin M Adam --- .../python/src/signature_calc_transform.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 7c4dd391c..03f9bc9b4 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -48,6 +48,8 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" +overwrite_output_path_key = "overwrite_output_path" +""" This key holds the overwrite output path""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -68,6 +70,8 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" +overwrite_output_path_cli_param = f"{cli_prefix}{overwrite_output_path_key}" +""" The overwrite output path""" captured_arg_keys = [ document_id_column_key, @@ -79,6 +83,7 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, + overwrite_output_path_key, ] # defaults @@ -100,6 +105,8 @@ """ Default 
Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)"""
 num_segments_default = 1
 """ Default number of segments across which we divide the hashing space for each band"""
+overwrite_output_path_default = None
+""" Default overwrite output path (no overwrite)"""


 NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?")
@@ -136,7 +143,8 @@ class SignatureCalculationTransform(AbstractTableTransform):
         num_minhashes_per_band: number of minhashes to use in each band
         jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates
         word_shingle_size: the size of the word shingles calculated for each document
-        num_segments the number of segments across which we divide the hashing space for each band
+        num_segments: the number of segments across which we divide the hashing space for each band
+        overwrite_output_path: specify an output path other than the one used by the data_access
     """

     def __init__(self, config: dict[str, Any]):
@@ -158,6 +166,7 @@ def __init__(self, config: dict[str, Any]):
         self.num_segments = config.get(num_segments_key, num_segments_default)
         self.num_bands = config.get(num_bands_key, num_bands_default)
         self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default)
+        self.overwrite_output_path = config.get(overwrite_output_path_key, overwrite_output_path_default)
         # use this dataframe to store the minhashes and size for each document
         self.all_minhashes: pl.DataFrame = None
         # use this dataframe to store the band hashes for each document
@@ -311,7 +320,7 @@ def write_band_signatures(self):
         last_file_name_path = Path(self.last_file_name)
         suffix_path = last_file_name_path.relative_to(self.data_access.input_folder)
         save_path = os.path.join(
-            self.data_access.output_folder,
+            self.overwrite_output_path if self.overwrite_output_path else self.data_access.output_folder,
             "bands",
             f"band={band_ix}",
             f"segment={segment_index}",
@@ -470,6 +479,12 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=num_segments_default,
             help="the number of segments across which we divide the hashing space for each band",
         )
+        parser.add_argument(
+            f"--{overwrite_output_path_cli_param}",
+            type=str,
+            default=overwrite_output_path_default,
+            help="specify an output path other than the one used by the data_access",
+        )

     def apply_input_params(self, args: Namespace) -> bool:
         """

From 272be3697239019ad604badcaf4ae2d8fd3c654b Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Wed, 30 Oct 2024 16:40:40 -0400
Subject: [PATCH 056/105] Add separate data access objects for reading and
 writing files

Signed-off-by: Constantin M Adam
---
 .../python/src/signature_calc_local_python.py | 21 +++++++---
 .../python/src/signature_calc_transform.py    | 41 ++++++++++---------
 2 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py
index 062580f22..2800c70cd 100644
--- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py
+++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py
@@ -12,6 +12,7 @@

 import os
 import sys
+from ast import Param

 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
@@ -22,12 +23,23 @@

 # create parameters
 input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
-output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+output_folder =
os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "test_scdata")) local_conf = {"input_folder": input_folder, "output_folder": output_folder} code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} +s3_config = { + "input_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_data/", + "output_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_output_data/s3_test_3/", +} + params = { # Data access. Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", @@ -35,6 +47,8 @@ "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, + # "scdata_s3_cred": ParamsUtils.convert_to_ast(s3_creds), + # "scdata_s3_config": ParamsUtils.convert_to_ast(s3_config), } @@ -44,11 +58,6 @@ print(sys.argv) sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) # create launcher diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 03f9bc9b4..159697d19 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -20,10 +20,10 @@ import numpy as np import polars as pl import pyarrow as pa +from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider from Murmur_MH import Murmur_MH -from scipy.integrate import quad as integrate short_name = "minhash" @@ -48,8 +48,6 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" -overwrite_output_path_key = "overwrite_output_path" -""" This key holds the overwrite output path""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -70,8 +68,6 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" -overwrite_output_path_cli_param = f"{cli_prefix}{overwrite_output_path_key}" -""" The overwrite output path""" captured_arg_keys = [ document_id_column_key, @@ -83,7 +79,6 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, - overwrite_output_path_key, ] # defaults @@ -105,8 +100,10 @@ """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" num_segments_default = 1 """ Default number of segments across which we divide the hashing space for each band""" -overwrite_output_path_default = None -""" Default overwrite output path (no overwrite)""" + + +sigcalc_data_factory_key = "sc_data_factory" +sigcalc_data_access_key = "sc_data_access" NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") @@ -144,7 +141,6 @@ class 
SignatureCalculationTransform(AbstractTableTransform): jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates word_shingle_size: the size of the word shingles calculated for each document num_segments: the number of segments across which we divide the hashing space for each band - overwrite_output_path: specify an output path other than the one used by the data_access """ def __init__(self, config: dict[str, Any]): @@ -166,7 +162,6 @@ def __init__(self, config: dict[str, Any]): self.num_segments = config.get(num_segments_key, num_segments_default) self.num_bands = config.get(num_bands_key, num_bands_default) self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) - self.overwrite_output_path = config.get(overwrite_output_path_key, overwrite_output_path_default) # use this dataframe to store the minhashes and size for each document self.all_minhashes: pl.DataFrame = None # use this dataframe to store the band hashes for each document @@ -177,6 +172,12 @@ def __init__(self, config: dict[str, Any]): self.bytes_processed = 0 self.data_access = config.get("data_access") self.last_file_name = None + self.sc_data_access = config.get(sigcalc_data_access_key, None) + if self.sc_data_access is None: + self.sc_daf = config.get(sigcalc_data_factory_key, None) + if self.sc_daf is None: + raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") + self.sc_data_access = self.sc_daf.create_data_access() def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -319,15 +320,17 @@ def write_band_signatures(self): common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) last_file_name_path = Path(self.last_file_name) suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) + if self.sc_data_access.output_folder is None: + self.sc_data_access.output_folder = self.data_access.output_folder save_path = os.path.join( - self.overwrite_output_path if self.overwrite_output_path else self.data_access.output_folder, + self.sc_data_access.output_folder, "bands", f"band={band_ix}", f"segment={segment_index}", suffix_path, ) segment_band_minhash_table = segment_band_minhash_df.to_arrow() - bytes_written, _, _ = self.data_access.save_table(save_path, segment_band_minhash_table) + bytes_written, _, _ = self.sc_data_access.save_table(save_path, segment_band_minhash_table) if bytes_written > 0: num_tables_written += 1 num_docs_written += segment_band_minhash_table.num_rows @@ -412,8 +415,10 @@ def __init__(self): super().__init__( name=short_name, transform_class=SignatureCalculationTransform, - remove_from_metadata=[], + remove_from_metadata=[sigcalc_data_factory_key], ) + self.daf = DataAccessFactory(cli_arg_prefix="scdata_") + from data_processing.utils import get_logger self.logger = get_logger(__name__, level="INFO") @@ -479,12 +484,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=num_segments_default, help="the number of segments across which we divide the hashing space for each band", ) - parser.add_argument( - f"--{overwrite_output_path_cli_param}", - type=str, - default=overwrite_output_path_default, - help="overwrite of the output path", - ) + self.daf.add_input_params(parser=parser) def apply_input_params(self, args: Namespace) -> bool: """ @@ -495,4 +495,5 @@ def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) 
self.params = self.params | captured self.logger.info(f"{short_name} parameters are : {self.params}") - return True + self.params[sigcalc_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) From ee411e1bd7957a802857b2b1ac6703f0d50c2968 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 31 Oct 2024 16:46:09 -0400 Subject: [PATCH 057/105] Define 2 data access objects for data and duplicate list Signed-off-by: Constantin M Adam --- .../fdedup/python/src/data_cleaning_transform.py | 10 +++++++++- .../python/src/data_cleaning_transform_python.py | 14 ++++++++++++-- .../fdedup/ray/src/data_cleaning_transform_ray.py | 14 ++++++++++++-- .../spark/src/data_cleaning_transform_spark.py | 14 ++++++++++++-- 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 8e17b757f..1a349ae85 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -17,6 +17,7 @@ import numpy as np import polars as pl import pyarrow as pa +from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger @@ -53,6 +54,9 @@ operation_mode_default = "filter_duplicates" """ Default value for operation mode, will filter out all the duplicate documents""" +dataclean_data_factory_key = "dc_data_factory" +dataclean_data_access_key = "dc_data_access" + class DataCleaningTransform(AbstractTableTransform): """ @@ -129,7 +133,9 @@ def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningT super().__init__( name=short_name, transform_class=transform_class, + remove_from_metadata=[dataclean_data_factory_key], ) + self.daf = DataAccessFactory(cli_arg_prefix="dcdata_") self.logger = get_logger(__name__, level="INFO") def add_input_params(self, parser: ArgumentParser) -> None: @@ -157,6 +163,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=operation_mode_default, help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", ) + self.daf.add_input_params(parser=parser) def apply_input_params(self, args: Namespace) -> bool: """ @@ -167,4 +174,5 @@ def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) self.params = self.params | captured self.logger.info(f"{short_name} parameters are : {self.params}") - return True + self.params[dataclean_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index 9c60ecbba..edef8b9c5 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -15,6 +15,8 @@ from data_cleaning_transform import ( DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, duplicate_list_location_default, duplicate_list_location_key, ) @@ -57,15 +59,23 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() + dc_data_access = 
self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) if not duplicate_list_location.startswith("/"): - out_paths = data_access.output_folder.rstrip("/").split("/") + out_paths = dc_data_access.output_folder.rstrip("/").split("/") dupl_list_paths = duplicate_list_location.split("/") paths = out_paths[:-1] + dupl_list_paths duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") - self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) return self.params | {"df": self.duplicate_list} diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index 5ed2cecbe..88171e260 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -17,6 +17,8 @@ from data_cleaning_transform import ( DataCleaningTransform, DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, duplicate_list_location_default, duplicate_list_location_key, ) @@ -90,15 +92,23 @@ def get_transform_config( :return: dictionary of filter init params """ data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) if not duplicate_list_location.startswith("/"): - out_paths = data_access.output_folder.rstrip("/").split("/") + out_paths = dc_data_access.output_folder.rstrip("/").split("/") dupl_list_paths = duplicate_list_location.split("/") paths = out_paths[:-1] + dupl_list_paths duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") - duplicate_list, retries = data_access.get_file(duplicate_list_location) + duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) docs_to_remove_list = ray.put(duplicate_list) return {"df": docs_to_remove_list} | self.params diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 56c10d801..2ff0df8bf 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -15,6 +15,8 @@ from data_cleaning_transform import ( DataCleaningTransformConfiguration, + 
dataclean_data_access_key,
+    dataclean_data_factory_key,
     duplicate_list_location_default,
     duplicate_list_location_key,
 )
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.transform import TransformStatistics
 from data_processing.utils import get_logger
@@ -57,15 +59,23 @@ def get_transform_config(
         :return: dictionary of transform init params
         """
         data_access = data_access_factory.create_data_access()
+        dc_data_access = self.params.get(dataclean_data_access_key, None)
+        if dc_data_access is None:
+            dc_daf = self.params.get(dataclean_data_factory_key, None)
+            if dc_daf is None:
+                raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}")
+            dc_data_access = dc_daf.create_data_access()
+        if dc_data_access.output_folder is None:
+            dc_data_access.output_folder = data_access.output_folder
         duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default)
         if not duplicate_list_location.startswith("/"):
-            out_paths = data_access.output_folder.rstrip("/").split("/")
+            out_paths = dc_data_access.output_folder.rstrip("/").split("/")
             dupl_list_paths = duplicate_list_location.split("/")
             paths = out_paths[:-1] + dupl_list_paths
             duplicate_list_location = "/".join([p.strip("/") for p in paths])
         if duplicate_list_location.startswith("s3://"):
             _, duplicate_list_location = duplicate_list_location.split("://")
-        self.duplicate_list, retries = data_access.get_file(duplicate_list_location)
+        self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location)
         return self.params | {"df": self.duplicate_list}

From 3a3050125ef8987f85ba85b59ca13f928f812584 Mon Sep 17 00:00:00 2001
From: David Wood
Date: Fri, 1 Nov 2024 10:18:34 -0400
Subject: [PATCH 058/105] get fdedup/python test-image to pass, and clean up
 req in ray version

Signed-off-by: David Wood
---
 transforms/universal/fdedup/python/Dockerfile       |  2 +-
 transforms/universal/fdedup/python/requirements.txt | 10 ----------
 transforms/universal/fdedup/ray/Dockerfile          |  3 +--
 transforms/universal/fdedup/ray/pyproject.toml      |  1 +
 4 files changed, 3 insertions(+), 13 deletions(-)
 delete mode 100644 transforms/universal/fdedup/python/requirements.txt

diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile
index a0a557060..f8c41791e 100644
--- a/transforms/universal/fdedup/python/Dockerfile
+++ b/transforms/universal/fdedup/python/Dockerfile
@@ -18,7 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
 COPY --chown=dpk:root src/ src/
 COPY --chown=dpk:root pyproject.toml pyproject.toml
 COPY --chown=dpk:root README.md README.md
-COPY --chown=dpk:root requirements.txt requirements.txt
+#COPY --chown=dpk:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -e .
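For reference, the relative-path resolution repeated in the Python, Ray, and Spark variants of get_transform_config above can be exercised standalone. This is an illustrative sketch, not part of any patch in the series; the helper name and sample paths are hypothetical:

# Illustrative sketch (not part of the patch series) of the duplicate-list
# path resolution used above. Unlike os.path.abspath, the string-based join
# leaves "s3://" URLs intact.
def resolve_duplicate_list_location(output_folder: str, duplicate_list_location: str) -> str:
    if duplicate_list_location.startswith("/"):
        return duplicate_list_location  # absolute local path, used as-is
    out_paths = output_folder.rstrip("/").split("/")
    dupl_list_paths = duplicate_list_location.split("/")
    # resolve relative to the parent of the output folder
    paths = out_paths[:-1] + dupl_list_paths
    return "/".join([p.strip("/") for p in paths])

# hypothetical sample paths:
resolve_duplicate_list_location(
    "/data/fdedup/output", "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
)  # -> "/data/fdedup/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
resolve_duplicate_list_location(
    "s3://bucket/fdedup/output", "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
)  # -> "s3://bucket/fdedup/docs_to_remove_consolidated/..."; the transforms then
#      strip the "s3://" scheme before calling data_access.get_file()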
diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt deleted file mode 100644 index 576c028a8..000000000 --- a/transforms/universal/fdedup/python/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -pyarrow==16.1.0 -pyyaml>=6.0.2 -boto3>=1.34.69 -kubernetes>=30.1.0 -polars==1.9.0 -disjoint-set>=0.8.0 -numpy<1.29.0 -sentencepiece>=0.2.0 -mmh3>=4.1.0 -scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index ec2c56f28..e921c4749 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,5 +1,4 @@ -ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - +ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} USER ray diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 9c533231a..6a871abea 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -10,6 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ + "dpk_fdedup_transform_python==0.2.2.dev1", "data-prep-toolkit-ray==0.2.2.dev1", "mmh3>=4.1.0", "xxhash==3.4.1", From 80ae8df747998feb0f4dba49ec4322ace854d01c Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 8 Nov 2024 16:51:39 -0500 Subject: [PATCH 059/105] Added an option to run either word or char shingle Signed-off-by: nelson --- .../fdedup/python/src/fuzzy_dedup_python.py | 8 ++++++ .../python/src/signature_calc_transform.py | 26 ++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index 7135054d2..bc5f3fded 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -57,6 +57,7 @@ signature_calc_transform.jaccard_similarity_threshold_key, signature_calc_transform.word_shingle_size_key, signature_calc_transform.num_segments_key, + signature_calc_transform.shingle_option_key, ], "cluster": [ cluster_analysis_transform.jaccard_similarity_threshold_key, @@ -240,6 +241,13 @@ def parse_args() -> argparse.Namespace: default=None, help="ast string of options for s3 credentials", ) + parser.add_argument( + "--shingle_option", + type=str, + required=False, + default="word", + help="Option used for shingling", + ) return parser.parse_args() diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 159697d19..2ed3ed258 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -48,6 +48,8 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" +shingle_option_key = "shingle_option" +""" This key holds the option that is used to do shingles calculation for each document""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -68,6 +70,8 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide 
the hashing space for each band"""
+shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}"
+""" The option used to compute shingles for each document: 'word' or 'char'"""

 captured_arg_keys = [
     document_id_column_key,
@@ -100,6 +104,8 @@
 """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)"""
 num_segments_default = 1
 """ Default number of segments across which we divide the hashing space for each band"""
+shingle_option_default = "word"
+""" Default shingling option"""


 sigcalc_data_factory_key = "sc_data_factory"
@@ -162,6 +168,7 @@ def __init__(self, config: dict[str, Any]):
         self.num_segments = config.get(num_segments_key, num_segments_default)
         self.num_bands = config.get(num_bands_key, num_bands_default)
         self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default)
+        self.shingle_option = config.get(shingle_option_key, shingle_option_default)
         # use this dataframe to store the minhashes and size for each document
         self.all_minhashes: pl.DataFrame = None
         # use this dataframe to store the band hashes for each document
@@ -202,7 +209,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
         # generate minhash values
         minhashes = df.map_rows(
             lambda row: mm_min_hash.minhash2_nosalt(
-                *self._generate_word_shingles(row, window_size=self.word_shingle_size)
+                *self._generate_word_shingles(row, self.shingle_option, window_size=self.word_shingle_size)
             )
         )
         # rename columns, cast minhashes to list(uint32)
@@ -353,7 +360,9 @@ def write_band_signatures(self):
         return [], metadata

     # define shingles generation function
-    def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]:
+    def _generate_word_shingles(
+        self, row: tuple, shingling_option: str, window_size: int = 5, delimiter: str = " "
+    ) -> tuple[list, int, int]:
         text = row[0]
         # lower case
         text = text.lower()
@@ -366,7 +375,10 @@ def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: s
         # diacritics/unicode normalization
         text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
         text = text.strip()
-        words = text.split()
+        if shingling_option == "char":
+            words = list(text)
+        else:
+            words = text.split()
         document_id = row[1]
         doc_len = len(row[0])
         word_count = len(words)
@@ -484,6 +496,12 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=num_segments_default,
             help="the number of segments across which we divide the hashing space for each band",
         )
+        parser.add_argument(
+            f"--{shingle_option_cli_param}",
+            type=str,
+            default=shingle_option_default,
+            help="shingling option ('word' or 'char')",
+        )
         self.daf.add_input_params(parser=parser)

     def apply_input_params(self, args: Namespace) -> bool:

From c531809647c29de300052c1d9a698905bc904733 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Sun, 10 Nov 2024 14:26:29 -0500
Subject: [PATCH 060/105] Use captured_arg_keys to list the arguments of each
 transform

Signed-off-by: Constantin M Adam
---
 .../fdedup/python/src/fuzzy_dedup_python.py   | 29 +++----------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
index 7135054d2..f3d0b0fdc 100644
--- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
+++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
@@
-47,31 +47,10 @@ } ARGS_MAP = { - "minhash": [ - signature_calc_transform.contents_column_key, - signature_calc_transform.document_id_column_key, - signature_calc_transform.seed_key, - signature_calc_transform.num_permutations_key, - signature_calc_transform.num_bands_key, - signature_calc_transform.num_minhashes_per_band_key, - signature_calc_transform.jaccard_similarity_threshold_key, - signature_calc_transform.word_shingle_size_key, - signature_calc_transform.num_segments_key, - ], - "cluster": [ - cluster_analysis_transform.jaccard_similarity_threshold_key, - cluster_analysis_transform.num_bands_key, - cluster_analysis_transform.num_segments_key, - ], - "fdlist": [ - get_duplicate_list_transform.subfolder_key, - get_duplicate_list_transform.consolidated_filename_key, - ], - "fdclean": [ - data_cleaning_transform.document_id_column_key, - data_cleaning_transform.duplicate_list_location_key, - data_cleaning_transform.operation_mode_key, - ], + "minhash": signature_calc_transform.captured_arg_keys, + "cluster": cluster_analysis_transform.captured_arg_keys, + "fdlist": get_duplicate_list_transform.captured_arg_keys, + "fdclean": data_cleaning_transform.captured_arg_keys, } From fe431104ca2d171b451be76a0cd7716f268f9d52 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:28:06 -0500 Subject: [PATCH 061/105] Ray implementation for get_duplicate_list_transform Signed-off-by: Constantin M Adam --- .../fdedup/ray/src/fuzzy_dedup_ray.py | 6 +- .../src/get_duplicate_list_transform_ray.py | 69 +++++++++++++++++++ .../test_get_duplicate_list_transform_ray.py | 9 ++- 3 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py index 0d4c2954f..987369714 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py @@ -23,6 +23,10 @@ from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) +from get_duplicate_list_transform_ray import ( + GetDuplicateListRayRuntime, + GetDuplicateListRayTransformConfiguration, +) from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration @@ -56,7 +60,7 @@ def execute_service(self, service_short_name: str, params: list) -> int: elif service_short_name == "cluster": launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration()) elif service_short_name == "fdlist": - launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + launcher = RayTransformLauncher(runtime_config=GetDuplicateListRayTransformConfiguration()) elif service_short_name == "fdclean": launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) status = launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..40081e658 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py @@ -0,0 +1,69 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListRayRuntime(DefaultRayTransformRuntime): + """ + Get duplicate list runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Get Duplicate List + as required by the RayTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py index 4b59e3a7a..55869598c 100644 --- a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py +++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py @@ -12,14 +12,12 @@ import os -from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) +from data_processing_ray.runtime.ray import RayTransformLauncher from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_python import ( - GetDuplicateListPythonTransformConfiguration, -) +from get_duplicate_list_transform_ray import GetDuplicateListRayTransformConfiguration class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): @@ -31,9 +29,10 @@ class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) config = { + "run_locally": True, sort_output_cli_param: True, } - launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) fixtures = [ ( launcher, From 82a1860524e8ebd4c59ae0598356095d69021e3c Mon Sep 17 00:00:00 
2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:30:03 -0500 Subject: [PATCH 062/105] Bug fix: jaccard threshold type must be float Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/signature_calc_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 159697d19..b492eb3ae 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -456,7 +456,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: ) parser.add_argument( f"--{jaccard_similarity_threshold_cli_param}", - type=int, + type=float, default=jaccard_similarity_threshold_default, help="Jaccard similarity threshold above which two documents are duplicates", ) From 61ed40f347612787d32385df779d0d88fc4e3f88 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:31:18 -0500 Subject: [PATCH 063/105] Get fuzzy dedup ray image ready for kfp Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index e921c4749..d4b3ae484 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,8 +1,6 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} -USER ray - RUN pip install --upgrade --no-cache-dir pip # install pytest @@ -24,13 +22,20 @@ COPY --chown=ray:users README.md README.md RUN pip install --no-cache-dir -e . # copy source files needed by test-image -COPY ./src/signature_calc_transform_ray.py fdedup_transform_ray.py -COPY ./src/signature_calc_local_ray.py local/fdedup_local_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py +COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py +COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py +COPY --chown=ray:users ./src/data_cleaning_transform_ray.py data_cleaning_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ COPY test-data/ test-data/ +USER root +RUN chmod a+rwx /home/ray +USER ray # Set environment ENV PYTHONPATH /home/ray From a8ede002fba33a4e01df9421b60f30558b98260e Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 17:37:56 -0500 Subject: [PATCH 064/105] kfp implementation for fuzzy dedup Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 321 +++++++++---- .../src/fdedup_compute_execution_params.py | 437 ++++++++++-------- 2 files changed, 494 insertions(+), 264 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 3156ab6f1..1c3e8e570 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -14,14 +14,24 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from src.fdedup_compute_execution_params import fdedup_compute_execution_params +from src.fdedup_compute_execution_params import ( + 
cluster_analysis_compute_execution_params, + compute_common_params, + data_cleaning_compute_execution_params, + get_duplicate_list_compute_execution_params, + signature_calc_compute_execution_params, +) from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils -task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest" +task_image = os.getenv("FDEDUP_IMAGE_LOCATION", "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest") +image_pull_secret = os.getenv("FDEDUP_IMAGE_PULL_SECRET", "my_secret") # the name of the job script -EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" +SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "signature_calc_transform_ray.py" +CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "cluster_analysis_transform_ray.py" +GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "get_duplicate_list_transform_ray.py" +DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" @@ -40,8 +50,18 @@ # compilation time. import uuid - compute_exec_params_op = dsl.component_decorator.component( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = dsl.component_decorator.component(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = dsl.component_decorator.component( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = dsl.component_decorator.component( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = dsl.component_decorator.component( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = dsl.component_decorator.component( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) print( "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " @@ -49,61 +69,94 @@ ) run_id = uuid.uuid4().hex else: - compute_exec_params_op = comp.create_component_from_func( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = comp.create_component_from_func(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = comp.create_component_from_func( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = comp.create_component_from_func( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = comp.create_component_from_func( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = comp.create_component_from_func( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute signature calculation job +execute_signature_calc_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) +# execute cluster analysis job +execute_cluster_analysis_job_op = comp.load_component_from_file(component_spec_path + 
"executeRayJobComponent.yaml") +# execute get duplicate list job +execute_get_duplicate_list_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute data cleaning job +execute_data_cleaning_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "fdedup" +TASK_NAME: str = "fuzzydedup" @dsl.pipeline( name=TASK_NAME + "-ray-pipeline", - description="Pipeline for fdedup", + description="Pipeline for fuzzy dedup", ) -def fdedup( +def fuzzydedup( + # folders used # Ray cluster - ray_name: str = "fdedup-kfp-ray", # name of Ray cluster + ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}", - data_s3_access_secret: str = "s3-secret", + data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", + data_s3_access_secret: str = "s3-south-secret", + scdata_s3_access_secret: str = "s3-south-secret", + dcdata_s3_access_secret: str = "s3-south-secret", data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # columns used - fdedup_doc_column: str = "contents", - fdedup_id_column: str = "int_id_column", - fdedup_cluster_column: str = "cluster", - # infrastructure - fdedup_bucket_cpu: float = 0.5, - fdedup_doc_cpu: float = 0.5, - fdedup_mhash_cpu: float = 0.5, + fdedup_contents_column: str = "contents", + fdedup_document_id_column: str = "int_id_column", # fuzzy parameters - fdedup_num_permutations: int = 64, - fdedup_threshold: float = 0.8, - fdedup_shingles_size: int = 5, - fdedup_delimiters: str = " ", - # Random delay between reads - fdedup_random_delay_limit: int = 5, - # snapshotting - fdedup_snapshot_delay: int = 1, - fdedup_use_doc_snapshot: bool = False, - fdedup_use_bucket_snapshot: bool = False, + fdedup_num_permutations: int = 112, + fdedup_num_bands: int = 14, + fdedup_num_minhashes_per_band: int = 8, + fdedup_word_shingle_size: int = 5, + fdedup_jaccard_similarity_threshold: float = 0.75, + fdedup_seed: int = 42, + fdedup_docs_to_remove_folder: str = "docs_to_remove", + 
fdedup_duplicate_list_location: str = os.path.join( + "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ), + fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, # additional parameters @@ -136,89 +189,189 @@ def fdedup( wait_print_tmout - time between prints, sec http_retries - http retries for API server calls :param data_s3_access_secret - s3 access secret + :param scdata_s3_access_secret - signature calculation s3 access secret + :param dcdata_s3_access_secret - data cleaning s3 access secret :param data_s3_config - s3 configuration :param data_max_files - max files to process :param data_num_samples - num samples to process - :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location - :param fdedup_doc_column - document column name - :param fdedup_id_column - integer document id column name - :param fdedup_cluster_column - cluster column name - :param fdedup_bucket_cpu - number of CPUs per bucket hash - :param fdedup_doc_cpu - number of CPUs per doc hash - :param fdedup_mhash_cpu - number of CPUs per minhash hash + :param fdedup_contents_column - document column name + :param fdedup_document_id_column - integer document id column name :param fdedup_num_permutations - number of permutations - :param fdedup_threshold - threshold - :param fdedup_shingles_size - number of words in shingle - :param fdedup_delimiters - delimiter for splitting document - :param fdedup_random_delay_limit - delay between reads to reduce S3 load. - A random number between 0 and random_delay_limit is used - :param fdedup_snapshot_delay - delay between restoring individual actors - :param fdedup_use_bucket_snapshot - flag to skip buckets building and start from existing snapshots - :param fdedup_use_doc_snapshot - flag to skip documents building and start from existing snapshots + :param fdedup_num_bands - number of bands + :param fdedup_num_minhashes_per_band - length of a band + :param fdedup_word_shingle_size - length of word shingles + :param fdedup_jaccard_similarity_threshold - similarity threshold + :param fdedup_seed - seed for the random number generator + :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids + :param fdedup_duplicate_list_location - name of the file holding the consolidated list of duplicates + :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, + compute_common_exec_params = compute_common_params_op( + ray_worker_options, + data_s3_config, + fdedup_num_permutations, + fdedup_n_samples, + ) + ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) + fdedup_num_segments = 
compute_common_exec_params.outputs["num_segments"] + runtime_actor_cpus = compute_common_exec_params.outputs["cpus_per_actor"] + runtime_num_actors = compute_common_exec_params.outputs["num_actors"] + + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=run_id, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_common_exec_params) + + # Get the parameters for the signature calculation job + compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - doc_column=fdedup_doc_column, - id_column=fdedup_id_column, - cluster_column=fdedup_cluster_column, - bucket_cpu=fdedup_bucket_cpu, - doc_cpu=fdedup_doc_cpu, - mhash_cpu=fdedup_mhash_cpu, + doc_column=fdedup_contents_column, + id_column=fdedup_document_id_column, num_permutations=fdedup_num_permutations, - threshold=fdedup_threshold, - shingles_size=fdedup_shingles_size, - delimiters=fdedup_delimiters, - random_delay_limit=fdedup_random_delay_limit, - snapshot_delay=fdedup_snapshot_delay, - use_doc_snapshot=fdedup_use_doc_snapshot, - use_bucket_snapshot=fdedup_use_bucket_snapshot, - n_samples=fdedup_n_samples, + num_bands=fdedup_num_bands, + num_minhashes_per_band=fdedup_num_minhashes_per_band, + word_shingle_size=fdedup_word_shingle_size, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + seed=fdedup_seed, ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + ComponentUtils.add_settings_to_component(compute_signature_calc_exec_params, ONE_HOUR_SEC * 2) + compute_signature_calc_exec_params.after(ray_cluster) - # start Ray cluster - ray_cluster = create_ray_op( + # Execute signature calculation job + execute_signature_calc_job = execute_signature_calc_job_op( ray_name=ray_name, run_id=run_id, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, + additional_params=additional_params, + exec_params=compute_signature_calc_exec_params.output, + exec_script_name=SIGNATURE_CALC_EXEC_SCRIPT_NAME, server_url=server_url, + prefix="scdata", + ) + ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) + execute_signature_calc_job.after(compute_signature_calc_exec_params) + + # Get the parameters for the cluster analysis job + compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + num_bands=fdedup_num_bands, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, 
+ ) + ComponentUtils.add_settings_to_component(compute_cluster_analysis_exec_params, ONE_HOUR_SEC * 2) + compute_cluster_analysis_exec_params.after(execute_signature_calc_job) + # Execute job + execute_cluster_analysis_job = execute_cluster_analysis_job_op( + ray_name=ray_name, + run_id=run_id, additional_params=additional_params, + exec_params=compute_cluster_analysis_exec_params.output, + exec_script_name=CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME, + server_url=server_url, ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) + ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) + + compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + duplicate_docids_folder=fdedup_docs_to_remove_folder, + duplicate_list_location=fdedup_duplicate_list_location, + ) + ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) + compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) # Execute job - execute_job = execute_ray_jobs_op( + execute_get_duplicate_list_job = execute_get_duplicate_list_job_op( ray_name=ray_name, run_id=run_id, additional_params=additional_params, - exec_params=compute_exec_params.output, - exec_script_name=EXEC_SCRIPT_NAME, + exec_params=compute_get_duplicate_list_exec_params.output, + exec_script_name=GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME, server_url=server_url, ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) + ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) + + compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + id_column=fdedup_document_id_column, + duplicate_list_location=fdedup_duplicate_list_location, + operation_mode=fdedup_operation_mode, + ) + ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) + compute_data_cleaning_exec_params.after(execute_get_duplicate_list_job) + + # Execute job + execute_data_cleaning_job = execute_data_cleaning_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_data_cleaning_exec_params.output, + exec_script_name=DATA_CLEANING_EXEC_SCRIPT_NAME, + server_url=server_url, + prefix="dcdata", + ) + ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) + 
ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret)
+        ComponentUtils.set_s3_env_vars_to_component(
+            execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata"
+        )
+        execute_data_cleaning_job.after(compute_data_cleaning_exec_params)


 if __name__ == "__main__":
     # Compiling the pipeline
-    compiler.Compiler().compile(fdedup, __file__.replace(".py", ".yaml"))
+    compiler.Compiler().compile(fuzzydedup, __file__.replace(".py", ".yaml"))

diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
index 726200339..c5ff4d52b 100644
--- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
+++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
@@ -10,10 +10,77 @@
 #  limitations under the License.
 ################################################################################

+from typing import Any, Dict, NamedTuple

-def fdedup_compute_execution_params(
+
+def compute_common_params(
     worker_options: dict,  # ray worker configuration
-    actor_options: dict,  # actor's resource requirements
+    data_s3_config: str,  # S3 configuration
+    num_permutations: int,  # number of permutations (minhashes) per document
+    n_samples: int,  # files to sample for number of documents estimation
+) -> NamedTuple("fdedup_params", [("num_segments", int), ("num_actors", int), ("cpus_per_actor", float)]):
+
+    import sys
+
+    from data_processing.data_access import DataAccessS3
+    from data_processing.utils import GB
+    from runtime_utils import KFPUtils
+
+    # get credentials
+    s3_key, s3_secret, s3_endpoint = KFPUtils.credentials()
+    s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint}
+    s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"'))
+    # because S3 is the only viable storage option for the kfp-based implementation, create DataAccessS3 directly here
+    data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1)
+    # sample input data
+    sampling: dict[str, Any]
+    sampling, _ = data_access.sample_input_data(n_samples=n_samples)
+    number_of_docs = int(sampling.get("estimated number of docs"))
+    if number_of_docs == 0:
+        print("Estimated number of documents and document size are zero. Please verify the input path.")
+        sys.exit(1)
+    print(f"Estimated number of docs: {number_of_docs}")
+    # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where:
+    #   8 bytes are taken by the band hash
+    #   (num_permutations * 4) bytes are taken by the min hashes
+    #   20 bytes provide some extra space for storage in a table
+    # The total amount of space needed by a band is number_of_docs * doc_bytes.
+    # To scale the handling of this data, divide each band into segments, where each segment size is below 3GB
+    doc_bytes = 8 + num_permutations * 4 + 20
+    band_bytes = number_of_docs * doc_bytes
+    num_segments = 1 + (band_bytes // (3 * GB))
+    print(f"Number of segments: {num_segments}")
+
+    # To process data efficiently, each actor needs 16GB of memory.
+    # The actor config controls CPU allocation, not memory;
+    # use CPU allocation such that the number of actors on a worker gives each actor access to 16GB of memory.
+ # Also, to keep S3 utilization in check, limit the number of actors to 2000 + num_nodes = worker_options["replicas"] + cpu_per_node = worker_options["cpu"] - 1 + memory_per_node = 0.85 * worker_options["memory"] + + memory_per_actor = 16 # GB + max_num_actors = 2000 + num_actors_per_node: int = int(memory_per_node / memory_per_actor) + if num_actors_per_node == 0: + num_actors_per_node = 1 + num_actors = num_nodes * num_actors_per_node + while num_actors > max_num_actors: + num_actors -= num_nodes + num_actors_per_node -= 1 + print(f"Number of actors per node = {num_actors_per_node}") + cpus_per_actor = cpu_per_node / num_actors_per_node + print(f"CPUs per actor = {cpus_per_actor}") + + from collections import namedtuple + + fdedup_params = namedtuple("fdedup_params", ["num_segments", "num_actors", "cpus_per_actor"]) + return fdedup_params(num_segments, num_actors, cpus_per_actor) + + +def signature_calc_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -22,27 +89,19 @@ def fdedup_compute_execution_params( runtime_code_location: dict, # code location doc_column: str, # document column name id_column: str, # integer document id column name - cluster_column: str, # cluster column name - bucket_cpu: float, # number of CPUs per bucket hash - doc_cpu: float, # number of CPUs per doc hash - mhash_cpu: float, # number of CPUs per minhash hash num_permutations: int, # number of permutations + num_bands: int, # number of bands + num_minhashes_per_band: int, # band length + word_shingle_size: int, # number of words in shingle threshold: float, # threshold, - shingles_size: int, # number of words in shingle - delimiters: str, # delimiter for splitting document - random_delay_limit: int, # delay between reads to reduce S3 load. 
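To make the sizing arithmetic in compute_common_params concrete, here is a worked example as a standalone sketch (not part of the patch; the document count and worker_options values are assumptions chosen for illustration):

```python
# Worked example of the segment and actor sizing above (illustrative only).
GB = 1024 * 1024 * 1024  # mirrors data_processing.utils.GB

number_of_docs = 100_000_000  # assumed sampling estimate
num_permutations = 112        # e.g. 14 bands x 8 minhashes per band
worker_options = {"replicas": 5, "cpu": 16, "memory": 64}  # hypothetical cluster

doc_bytes = 8 + num_permutations * 4 + 20    # 476 bytes per document
band_bytes = number_of_docs * doc_bytes      # ~44.3 GiB per band
num_segments = 1 + (band_bytes // (3 * GB))  # 15 segments, each below 3 GiB

cpu_per_node = worker_options["cpu"] - 1           # 15 CPUs left for actors
memory_per_node = 0.85 * worker_options["memory"]  # 54.4 GB usable per node
num_actors_per_node = int(memory_per_node / 16)    # 3 actors of 16 GB each
num_actors = worker_options["replicas"] * num_actors_per_node  # 15 actors
cpus_per_actor = cpu_per_node / num_actors_per_node            # 5.0 CPUs/actor

print(num_segments, num_actors, cpus_per_actor)  # 15 15 5.0
```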
- # A random number between 0 and random_delay_limit is used - snapshot_delay: int, # delay between restoring individual actors - use_doc_snapshot: bool, # flag to skip documents building and start from existing snapshots - use_bucket_snapshot: bool, # flag to skip buckets building and start from existing snapshots - n_samples: int, # number of samples to use -) -> dict: # NamedTuple( - # "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] + num_segments: int, # number of segments + seed: int, # seed for the random number generator +) -> dict: """ - Compute fuzzy dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements + Compute fuzzy dedup execution parameters for signature calculation + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -51,182 +110,200 @@ def fdedup_compute_execution_params( :param runtime_code_location: code location :param doc_column: document column name :param id_column: integer document id column name - :param cluster_column: cluster column name - :param bucket_cpu: number of CPUs per bucket hash - :param doc_cpu: number of CPUs per doc hash - :param mhash_cpu: number of CPUs per minhash hash :param num_permutations: number of permutations + :param num_bands: number of bands + :param num_minhashes_per_band: band length + :param word_shingle_size: number of words in shingle :param threshold: threshold, - :param shingles_size: number of words in shingle - :param delimiters: delimiter for splitting document - :param random_delay_limit: # delay between reads to reduce S3 load. 
A random number between 0 and random_delay_limit is used - :param snapshot_delay: delay between restoring individual actors - :param use_doc_snapshot: flag to skip documents building and start from existing snapshots - :param use_bucket_snapshot: flag to skip buckets building and start from existing snapshots - :param n_samples: number of samples to use + :param num_segments: number of segments + :param seed: seed for the random number generator :return: a dictionary with a Ray Job execution parameters """ - import math - import sys - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from runtime_utils import KFPUtils - from scipy.integrate import quad as integrate - - EXECUTION_OF_KB_DOC = 0.003 - - def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, - ) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt + # fuzzy parameters for signature calculation + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + print(f"runtime_actor_options = {runtime_actor_options}") + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "minhash_contents_column": doc_column, + "minhash_document_id_column": id_column, + "minhash_num_permutations": num_permutations, + "minhash_num_bands": num_bands, + "minhash_num_minhashes_per_band": num_minhashes_per_band, + "minhash_word_shingle_size": word_shingle_size, + "minhash_jaccard_similarity_threshold": threshold, + "minhash_num_segments": num_segments, + "minhash_seed": seed, + "scdata_s3_config": data_s3_config, + } + + +def cluster_analysis_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + 
data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + num_bands: int, # number of bands + threshold: float, # threshold, + num_segments: int, # number of segments +) -> dict: + + """ + Compute fuzzy dedup execution parameters for cluster analysis + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param num_bands: number of bands + :param threshold: threshold, + :param num_segments: number of segments + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os # fuzzy parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=threshold, - num_perm=num_permutations, - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - print(f"Fuzzy parameters: num buckets {num_buckets}, bucket length {length_bucket}") # Get cluster parameters - cluster_cpu = worker_options["replicas"] * worker_options["cpu"] - cluster_memory = worker_options["replicas"] * worker_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu -= 1 - cluster_memory *= 0.85 - # get actor requirements - actor_cpu = actor_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) - if type(s3_config) is list: - # S3 config is list. take the first element - s3_config = s3_config[0] - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling, _ = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - avg_table_size = sampling.get("average table size MB") / KB - if number_of_docs == 0: - print(f"Estimated number of documents and documents size is zero. Please verify the input path.") - sys.exit(1) - # we are creating more buckets actors, so that we get better parallelization for bucket processing - b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB) - d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) - m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) - # compute cpu requirements - # Define number of preprocessors. 
We are assuming that preprocessors and workers are using the same amount - # of CPUs - n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu - ) - if n_preprocessors <= 0: - print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") - print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") - print("Try to increase the size of the cluster") - sys.exit(1) - # compute the amount of workers - n_workers = int((0.85 * cluster_cpu - d_actors * doc_cpu) / actor_cpu) - # Ensure that we do not overwhelm S3 - if n_workers > 2000: - n_workers = 2000 - print( - f"Number of preprocessors: {n_preprocessors}, Number of workers: {n_workers}, bucket actors {b_actors}, " - f"minhash actors {m_actors}, document actors {d_actors}" - ) - - # Make sure that we have enough memory - r_mem = avg_table_size * 4 * n_preprocessors + 2 * (b_actors + m_actors + d_actors) - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker (current {actor_cpu})") - sys.exit(1) + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") + data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "cluster_num_bands": num_bands, + "cluster_jaccard_similarity_threshold": threshold, + "cluster_num_segments": num_segments, + } - print( - f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * mhash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" - ) - projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 - print(f"Projected execution time {projected_execution} min") +def get_duplicate_list_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + duplicate_docids_folder: str, # folder with the docs IDs to remove + duplicate_list_location: str, # location of the list of duplicate doc ids +) -> dict: + """ + Compute fuzzy dedup execution parameters for get duplicate list step + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + 
:param duplicate_docids_folder: folder with the docs IDs to remove + :param duplicate_list_location: location of the list of duplicate doc ids + :return: a dictionary with a Ray Job execution parameters + """ + import json + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = base_folder + data_s3_config_dict["output_folder"] = base_folder + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "fdlist_docs_to_remove": duplicate_docids_folder, + "fdlist_consolidated_filename": duplicate_list_location, + } + + +def data_cleaning_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + id_column: str, # integer document id column name + duplicate_list_location: str, # location of the list of duplicate doc ids + operation_mode: str, # filter (non-)duplicates or annotate +) -> dict: + """ + Compute fuzzy dedup execution parameters + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param id_column: integer document id column name + :param duplicate_list_location: location of the list of duplicate doc ids + :param operation_mode: filter (non-)duplicates or annotate + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, - "runtime_num_workers": n_workers, - "runtime_worker_options": str(actor_options), + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), - "fdedup_doc_column": doc_column, - 
"fdedup_id_column": id_column, - "fdedup_cluster_column": cluster_column, - "fdedup_bucket_cpu": bucket_cpu, - "fdedup_doc_cpu": doc_cpu, - "fdedup_mhash_cpu": mhash_cpu, - "fdedup_num_doc_actors": d_actors, - "fdedup_num_bucket_actors": b_actors, - "fdedup_num_minhash_actors": m_actors, - "fdedup_num_preprocessors": n_preprocessors, - "fdedup_num_permutations": num_permutations, - "fdedup_threshold": threshold, - "fdedup_shingles_size": shingles_size, - "fdedup_delimiters": delimiters, - "fdedup_random_delay_limit": random_delay_limit, - "fdedup_snapshot_delay": snapshot_delay, - "fdedup_use_doc_snapshot": use_doc_snapshot, - "fdedup_use_bucket_snapshot": use_bucket_snapshot, + "fdclean_document_id_column": id_column, + "fdclean_duplicate_list_location": duplicate_list_location, + "fdclean_operation_mode": operation_mode, } From 96edea4fe2cb976e0e20a7b0299a022ecd378ef0 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 22:08:05 -0500 Subject: [PATCH 065/105] Added params to captured_arg_keys Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/data_cleaning_transform.py | 1 + .../universal/fdedup/python/src/signature_calc_transform.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 1a349ae85..74597068c 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -44,6 +44,7 @@ captured_arg_keys = [ document_id_column_key, duplicate_list_location_key, + operation_mode_key, ] # defaults diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index c63fa3576..6b14e1ba0 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -71,7 +71,7 @@ num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}" -""" This key holds the option that is used to do shingles calculation for each document""" +""" The option (word/char) used to do shingles calculation for each document""" captured_arg_keys = [ document_id_column_key, @@ -83,6 +83,7 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, + shingle_option_key, ] # defaults @@ -375,8 +376,7 @@ def _generate_word_shingles( # diacritics/unicode normalization text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn") text = text.strip() - print(shingling_option) - print("=============") + self.logger.debug(shingling_option) if shingling_option == "char": words = list(text) else: From 1a70530af57f530d5ac98acacafbf94512a977b3 Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Mon, 11 Nov 2024 12:13:53 +0900 Subject: [PATCH 066/105] update readme following template https://github.com/IBM/data-prep-kit/issues/753#issuecomment-2460867526 Signed-off-by: Daiki Tsuzuku --- .../language/doc_quality/python/README.md | 57 +++++++++++++++---- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/transforms/language/doc_quality/python/README.md b/transforms/language/doc_quality/python/README.md index 38421f34f..f3944cdc0 100644 --- 
a/transforms/language/doc_quality/python/README.md +++ b/transforms/language/doc_quality/python/README.md @@ -1,13 +1,21 @@ # Document Quality Transform + Please see the set of [transform project conventions](../../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. -## Summary -This transform will calculate and annotate several metrics related to document, which are usuful to see the quality of document. +## Description +This transform will calculate and annotate several metrics related to document, which are usuful to see the quality of document. +Text is the type of data this transform operates on. + +### Input -In this transform, following metrics will be included: +| input column name | data type | descrition | +|-|-|-| +| the one specified in _doc_content_column_ configuration | string | text whose quality will be calculated by this transform | + +### Output columns annotated by this transform | output column name | data type | description | supported language | |-|-|-|-| @@ -27,7 +35,7 @@ In this transform, following metrics will be included: You can see more detailed backgrounds of some columns in [Deepmind's Gopher paper](https://arxiv.org/pdf/2112.11446.pdf) -## Configuration and command line Options +## Configuration The set of dictionary keys holding [DocQualityTransform](src/doc_quality_transform.py) configuration for values are as follows: @@ -36,13 +44,19 @@ configuration for values are as follows: * _doc_content_column_ - specifies column name that contains document text. By default, "contents" is used. * _bad_word_filepath_ - specifies a path to bad word file: local folder (file or directory) that points to bad word file. You don't have to set this parameter if you don't need to set bad words. -## Running +Example +``` +{ + text_lang_key: "en", + doc_content_column_key: "contents", + bad_word_filepath_key: os.path.join(basedir, "ldnoobw", "en"), +} +``` + +## Usage ### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -the options provided by -the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). +The following command line arguments are available ``` --docq_text_lang DOCQ_TEXT_LANG language used in the text content. By default, "en" is used. --docq_doc_content_column DOCQ_DOC_CONTENT_COLUMN column name that contain document text. By default, "contents" is used. @@ -70,6 +84,9 @@ ls output ``` To see results of the transform. +### Code example + +TBD (link to the notebook will be provided) ### Transforming data using the transform image @@ -77,7 +94,27 @@ To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. 
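Until the notebook referenced above is available, a minimal invocation sketch may help. The launcher import is the standard data-prep-kit pure-python launcher, but the transform module and configuration class names are assumptions based on this transform's source layout, so verify them against `src/`:

```python
# Hypothetical local run of the doc_quality transform (class/module names
# are assumptions; the CLI flags match the options documented above).
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from doc_quality_transform_python import DocQualityPythonTransformConfiguration

sys.argv = [
    "doc_quality",
    "--data_local_config", "{'input_folder': 'test-data/input', 'output_folder': 'output'}",
    "--docq_text_lang", "en",
    "--docq_doc_content_column", "contents",
]
launcher = PythonTransformLauncher(runtime_config=DocQualityPythonTransformConfiguration())
launcher.launch()
```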
+## Testing
+
+We follow [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md).
+
+Currently we have:
+- [Unit test](test/test_doc_quality_python.py)
+- [Integration test](test/test_doc_quality.py)
+
+
+## Further Resources
+
+- For those who want to learn C4 heuristic rules
+  - https://arxiv.org/pdf/1910.10683.pdf
+- For those who want to learn Gopher statistics
+  - https://arxiv.org/pdf/2112.11446.pdf
+- For those who want to see the source of the bad words used by default
+  - https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
+
+
+## Considerations

-## Troubleshooting guide
+### Troubleshooting guide

 For M1 Mac user, if you see following error during make command, `error: command '/usr/bin/clang' failed with exit code 1`, you may better follow [this step](https://freeman.vc/notes/installing-fasttext-on-an-m1-mac)
\ No newline at end of file

From 24163af9d00f7603b9ec17091c785c0fead8eaae Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 11 Nov 2024 09:36:19 -0500
Subject: [PATCH 067/105] Add shingle type option (word or char) to kfp

Signed-off-by: Constantin M Adam

---
 transforms/universal/fdedup/kfp_ray/fdedup_wf.py          | 3 +++
 .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
index 1c3e8e570..139a0f919 100644
--- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
+++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
@@ -150,6 +150,7 @@ def fuzzydedup(
     fdedup_num_bands: int = 14,
     fdedup_num_minhashes_per_band: int = 8,
     fdedup_word_shingle_size: int = 5,
+    fdedup_shingle_option: str = "word",
     fdedup_jaccard_similarity_threshold: float = 0.75,
     fdedup_seed: int = 42,
     fdedup_docs_to_remove_folder: str = "docs_to_remove",
@@ -202,6 +203,7 @@ def fuzzydedup(
     :param fdedup_num_bands - number of bands
     :param fdedup_num_minhashes_per_band - length of a band
     :param fdedup_word_shingle_size - length of word shingles
+    :param fdedup_shingle_option - type of shingle, one of 'word' or 'char'
     :param fdedup_jaccard_similarity_threshold - similarity threshold
     :param fdedup_seed - seed for the random number generator
     :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids
@@ -258,6 +260,7 @@ def fuzzydedup(
             num_bands=fdedup_num_bands,
             num_minhashes_per_band=fdedup_num_minhashes_per_band,
             word_shingle_size=fdedup_word_shingle_size,
+            shingle_option=fdedup_shingle_option,
             threshold=fdedup_jaccard_similarity_threshold,
             num_segments=fdedup_num_segments,
             seed=fdedup_seed,
diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
index c5ff4d52b..65b7ac2f6 100644
--- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
+++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
@@ -93,6 +93,7 @@ def signature_calc_compute_execution_params(
     num_bands: int,  # number of bands
     num_minhashes_per_band: int,  # band length
     word_shingle_size: int,  # number of words in shingle
+    shingle_option: str,  # type of shingle, one of 'word' or 'char'
     threshold: float,  # threshold,
     num_segments: int,  # number of segments
     seed: int,  # seed for the random number generator
@@ -114,6 +115,7 @@
     :param num_bands: number of bands
     :param num_minhashes_per_band: band length
     :param word_shingle_size: number of words in shingle
+    :param shingle_option: type of shingle, one of 'word' or 'char'
     :param threshold: threshold,
     :param num_segments: number of segments
     :param seed: seed for the random number generator
@@ -138,6 +140,7 @@
         "minhash_num_bands": num_bands,
         "minhash_num_minhashes_per_band": num_minhashes_per_band,
         "minhash_word_shingle_size": word_shingle_size,
+        "minhash_shingle_option": shingle_option,
         "minhash_jaccard_similarity_threshold": threshold,
         "minhash_num_segments": num_segments,
         "minhash_seed": seed,

From ecb87b0afd8042d122edc549639880c8b74d6ad5 Mon Sep 17 00:00:00 2001
From: Daiki Tsuzuku
Date: Wed, 13 Nov 2024 10:23:10 +0900
Subject: [PATCH 068/105] fix typo and update description

Signed-off-by: Daiki Tsuzuku

---
 transforms/language/doc_quality/python/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transforms/language/doc_quality/python/README.md b/transforms/language/doc_quality/python/README.md
index f3944cdc0..1e060018d 100644
--- a/transforms/language/doc_quality/python/README.md
+++ b/transforms/language/doc_quality/python/README.md
@@ -6,12 +6,12 @@
 for details on general project conventions, transform configuration, testing
 and IDE set up.

 ## Description
-This transform will calculate and annotate several metrics related to document, which are usuful to see the quality of document.
-Text is the type of data this transform operates on.
+This transform will calculate and annotate several metrics which are useful to assess the quality of the document.
+
The document quality transform operates on text documents only From 3a43c3d4370cdb31949a11190804552716a3adce Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 10:53:09 -0500 Subject: [PATCH 070/105] Utility to calculate number of bands and length of a band Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/utils/Makefile | 16 ++++ .../universal/fdedup/utils/calc_r_and_b.ipynb | 74 +++++++++++++++++++ .../universal/fdedup/utils/requirements.txt | 3 + 3 files changed, 93 insertions(+) create mode 100644 transforms/universal/fdedup/utils/Makefile create mode 100644 transforms/universal/fdedup/utils/calc_r_and_b.ipynb create mode 100644 transforms/universal/fdedup/utils/requirements.txt diff --git a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile new file mode 100644 index 000000000..dae3f30ea --- /dev/null +++ b/transforms/universal/fdedup/utils/Makefile @@ -0,0 +1,16 @@ +PYTHON=python +PIP=pip + +venv: requirements.txt + $(PYTHON) -m venv venv + if [ -e venv/Scripts/activate ]; then \ + echo "For Windows please try the following AS Administrator - no guarantees"; \ + echo " venv\\Scripts\\activate"; \ + echo " pip install --upgrade pip"; \ + echo " pip install -r requirements.txt"; \ + echo " pip install pytest"; \ + else \ + . venv/bin/activate; \ + $(PIP) install --upgrade pip; \ + $(PIP) install -r requirements.txt; \ + fi diff --git a/transforms/universal/fdedup/utils/calc_r_and_b.ipynb b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb new file mode 100644 index 000000000..8398f9efa --- /dev/null +++ b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb @@ -0,0 +1,74 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf5dba9a-d530-4a0a-ae71-2d741f7e705f", + "metadata": {}, + "source": [ + "This notebook allows calculating the values for `b` (the number of bands) and `r` (the number of minhashes in a band) used in the fuzzy dedup algorithm. The default values are `b=14` and `r=8`, as defined in the [FineWeb datasets paper](https://arxiv.org/pdf/2406.17557). The x-axis of the graph represents the Jaccard similarity between a pair of documents, while the y-axis represents the probability that they become duplication candidates. Please refer to http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf for more details on this methodology." 
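As a quick numeric aside (not part of the notebook file), the default values give the following candidate probabilities:

```python
# f(s): probability that a pair of documents with Jaccard similarity s
# becomes a candidate duplicate pair, with r=8 minhashes per band, b=14 bands.
r, b = 8, 14

def f(s: float) -> float:
    return 1 - (1 - s**r) ** b

print(round(f(0.75), 2))  # ~0.77: pairs at the 0.75 threshold are usually caught
print(round(f(0.50), 2))  # ~0.05: dissimilar pairs rarely become candidates
```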
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "800bc113-8b5e-4cec-8717-98fa05753bd0", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Define the parameterized function\n", + "def f(s, r, b):\n", + " return 1 - (1 - s**r)**b\n", + "\n", + "# Set the parameters r and b\n", + "r = 8\n", + "b = 14\n", + "\n", + "# Generate values for s in a range, e.g., from 0 to 1\n", + "s_values = np.linspace(0, 1, 500) # 500 points between 0 and 1\n", + "f_values = f(s_values, r, b)\n", + "\n", + "# Plot the function\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(s_values, f_values, label=fr\"$f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\", color='blue')\n", + "plt.xlabel(\"s\")\n", + "plt.ylabel(\"f(s)\")\n", + "plt.title(f\"Plot of the function $f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\")\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98016b04-b6a0-465d-b65b-6d402978c9f0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/utils/requirements.txt b/transforms/universal/fdedup/utils/requirements.txt new file mode 100644 index 000000000..ce2acfefb --- /dev/null +++ b/transforms/universal/fdedup/utils/requirements.txt @@ -0,0 +1,3 @@ +jupyter +numpy +matplotlib From 2f61be7938d7540a0a1831e85b8a961bef24d35c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 15:37:32 -0500 Subject: [PATCH 071/105] Set correct version for pyproject Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/pyproject.toml | 6 +++--- transforms/universal/fdedup/ray/Makefile | 2 +- transforms/universal/fdedup/ray/pyproject.toml | 2 +- transforms/universal/fdedup/spark/Makefile | 2 +- transforms/universal/fdedup/spark/pyproject.toml | 8 ++++---- transforms/universal/fdedup/utils/Makefile | 2 ++ 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index f46c8e8c4..dd58d41d4 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.2.2.dev1" -requires-python = ">=3.10" +version = "0.2.2.dev2" +requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -10,7 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev1", + "data-prep-toolkit==0.2.2.dev2", "pyarrow==16.1.0", "pyyaml>=6.0.2", "boto3>=1.34.69", diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index f5f06c3c3..ec193b6c3 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ b/transforms/universal/fdedup/ray/Makefile @@ -43,7 +43,7 @@ setup:: .transforms.setup # TRANSFORM_PYTHON_VERSION has no effect since 
requirements do not specify a python transform implementation set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index b24886ad9..037525126 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] dependencies = [ "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_fdedup_transform_python==0.2.2.dev1", + "dpk_fdedup_transform_python==0.2.2.dev2", "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile index 7eb132fbd..ac2735e7d 100644 --- a/transforms/universal/fdedup/spark/Makefile +++ b/transforms/universal/fdedup/spark/Makefile @@ -36,7 +36,7 @@ publish: publish-image publish-image:: .transforms.publish-image-spark set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index 548f350c0..cc66fc044 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.2.2.dev1" -requires-python = ">=3.10" +version = "0.2.2.dev2" +requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -10,8 +10,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk_fdedup_transform_python==0.2.2.dev1", - "data-prep-toolkit-spark==0.2.2.dev1", + "dpk_fdedup_transform_python==0.2.2.dev2", + "data-prep-toolkit-spark==0.2.2.dev2", ] [project.optional-dependencies] diff --git a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile index dae3f30ea..d9dae01d7 100644 --- a/transforms/universal/fdedup/utils/Makefile +++ b/transforms/universal/fdedup/utils/Makefile @@ -14,3 +14,5 @@ venv: requirements.txt $(PIP) install --upgrade pip; \ $(PIP) install -r requirements.txt; \ fi +set-versions: + @: \ No newline at end of file From cd5eb05f82d1145a620a03d0094aac96846d5d55 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 15:45:37 -0500 Subject: [PATCH 072/105] Change the name of the utils Makefile Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/utils/{Makefile => Makefile.local} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename transforms/universal/fdedup/utils/{Makefile => Makefile.local} (100%) diff --git a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile.local similarity index 100% rename from transforms/universal/fdedup/utils/Makefile rename to transforms/universal/fdedup/utils/Makefile.local From 6cc18cd8eaba2fb12a31f49af52aba188a9f6ac4 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 08:36:45 -0500 Subject: [PATCH 073/105] Copy whl file to the context folder Signed-off-by: 
Constantin M Adam
---
 transforms/universal/fdedup/python/Dockerfile |  5 +++--
 transforms/universal/fdedup/spark/Dockerfile  | 19 +++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile
index f8c41791e..a6724e6e7 100644
--- a/transforms/universal/fdedup/python/Dockerfile
+++ b/transforms/universal/fdedup/python/Dockerfile
@@ -4,6 +4,7 @@ RUN pip install --upgrade --no-cache-dir pip

 # install pytest
 RUN pip install --no-cache-dir pytest
+ARG DPK_WHEEL_FILE_NAME

 # Create a user and use it to run the transform
 RUN useradd -ms /bin/bash dpk
 WORKDIR /home/dpk
@@ -12,8 +13,8 @@ WORKDIR /home/dpk

 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
-RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
+COPY --chown=dpk:root data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

 COPY --chown=dpk:root src/ src/
 COPY --chown=dpk:root pyproject.toml pyproject.toml
diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile
index a36a7cef7..772dfef79 100644
--- a/transforms/universal/fdedup/spark/Dockerfile
+++ b/transforms/universal/fdedup/spark/Dockerfile
@@ -1,35 +1,34 @@
 ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0
-
 FROM ${BASE_IMAGE}

-# USER root
 # install pytest
 RUN pip install --no-cache-dir pytest
+ARG DPK_WHEEL_FILE_NAME

 WORKDIR ${SPARK_HOME}/work-dir

 # Copy in the data processing framework source/project and install it
 # This is expected to be placed in the docker context before this is run (see the make image).
-COPY --chown=spark:root data-processing-lib-python/ data-processing-lib-python/
-RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
-COPY --chown=spark:root data-processing-lib-spark/ data-processing-lib-spark/
-RUN cd data-processing-lib-spark && pip install --no-cache-dir -e .
+COPY --chown=spark:root data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]
+
+## Copy the python version of the transform
 COPY --chown=spark:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .

-# Install project source
+# Install spark project source
 COPY --chown=spark:root src/ src/
 COPY --chown=spark:root pyproject.toml pyproject.toml
+COPY --chown=spark:root README.md README.md

 RUN mkdir -p /opt/spark/work-dir/src/templates && \
     mkdir -p /opt/spark/work-dir/config
+COPY --chown=spark:root deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/
+COPY --chown=spark:root deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/

 # install requirements from requirements.txt
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt
-COPY deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/
-COPY deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/

 RUN pip install --no-cache-dir -e .
# copy the main() entry point to the image From 9f336203571b07e8486292793599406b87abf830 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 08:38:49 -0500 Subject: [PATCH 074/105] Use keyword args in compute_common_params Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 139a0f919..0a0a4d9bf 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -221,10 +221,10 @@ def fuzzydedup( with dsl.ExitHandler(clean_up_task): # compute execution params compute_common_exec_params = compute_common_params_op( - ray_worker_options, - data_s3_config, - fdedup_num_permutations, - fdedup_n_samples, + worker_options=ray_worker_options, + data_s3_config=data_s3_config, + num_permutations=fdedup_num_permutations, + n_samples=fdedup_n_samples, ) ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) From 528457c5cc91dad1439c72258be92e8030f45015 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 10:42:20 -0500 Subject: [PATCH 075/105] Use dynamic dependencies Signed-off-by: Constantin M Adam --- data-processing-lib/spark/pyproject.toml | 55 ------------------- transforms/universal/fdedup/python/Dockerfile | 2 +- .../universal/fdedup/python/pyproject.toml | 16 +----- .../universal/fdedup/python/requirements.txt | 10 ++++ transforms/universal/fdedup/ray/Dockerfile | 1 + .../universal/fdedup/ray/pyproject.toml | 11 +--- .../universal/fdedup/ray/requirements.txt | 6 ++ .../universal/fdedup/spark/pyproject.toml | 11 ++-- .../universal/fdedup/spark/requirements.txt | 3 +- 9 files changed, 33 insertions(+), 82 deletions(-) delete mode 100644 data-processing-lib/spark/pyproject.toml create mode 100644 transforms/universal/fdedup/python/requirements.txt create mode 100644 transforms/universal/fdedup/ray/requirements.txt diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml deleted file mode 100644 index 89b4d9bf8..000000000 --- a/data-processing-lib/spark/pyproject.toml +++ /dev/null @@ -1,55 +0,0 @@ -[project] -name = "data_prep_toolkit_spark" -version = "0.2.2.dev2" -keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -requires-python = ">=3.10,<3.13" -description = "Data Preparation Toolkit Library for Spark" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyspark>=3.5.2", - "psutil>=6.0.0", - "PyYAML>=6.0.2" -] - -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", 
- "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/data_processing_spark"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index a6724e6e7..280063863 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -19,7 +19,7 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md -#COPY --chown=dpk:root requirements.txt requirements.txt +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index dd58d41d4..97be33d54 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -9,23 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyarrow==16.1.0", - "pyyaml>=6.0.2", - "boto3>=1.34.69", - "kubernetes>=30.1.0", - "polars==1.9.0", - "disjoint-set>=0.8.0", - "scipy>=1.14.1, <2.0.0", - "numpy<1.29.0", - "sentencepiece>=0.2.0", - "mmh3>=4.1.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..4e69a72e4 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +data-prep-toolkit==0.2.2.dev2 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +scipy>=1.14.1, <2.0.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index af32f0fb3..71287ced7 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -20,6 +20,7 @@ RUN cd python-transform && pip install --no-cache-dir -e . COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md +COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source files needed by test-image diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 037525126..cb8c6306a 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -9,18 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_fdedup_transform_python==0.2.2.dev2", - "mmh3>=4.1.0", - "xxhash==3.4.1", - "tqdm==4.66.3", - "scipy>=1.12.0, <2.0.0" -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt new file mode 100644 index 000000000..6ee40ef7f --- /dev/null +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit[ray]==0.2.2.dev2 +dpk_fdedup_transform_python==0.2.2.dev2 +mmh3>=4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index cc66fc044..f77df2010 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -9,10 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "dpk_fdedup_transform_python==0.2.2.dev2", - "data-prep-toolkit-spark==0.2.2.dev2", -] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index 576c028a8..c373ffbb7 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,4 +1,5 @@ -pyarrow==16.1.0 +dpk_fdedup_transform_python==0.2.2.dev2 +data-prep-toolkit[spark]==0.2.2.dev2 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 From fffb6305e7dbd018c343fde736b396db18a3d3d3 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 12:38:00 -0500 Subject: [PATCH 076/105] Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 0a0a4d9bf..fabc4e084 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -279,10 +279,12 @@ def fuzzydedup( prefix="scdata", ) ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component( - execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" - ) + # FIXME: see 
https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) execute_signature_calc_job.after(compute_signature_calc_exec_params) # Get the parameters for the cluster analysis job @@ -311,7 +313,9 @@ def fuzzydedup( server_url=server_url, ) ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( @@ -338,7 +342,9 @@ def fuzzydedup( server_url=server_url, ) ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( @@ -368,10 +374,12 @@ def fuzzydedup( prefix="dcdata", ) ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component( - execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" - ) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" + ) execute_data_cleaning_job.after(compute_data_cleaning_exec_params) From 5547d7fb574b8ebe2f8a98d6656f16faf9537808 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 13:03:02 -0500 Subject: [PATCH 077/105] Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index fabc4e084..683f93210 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -280,7 +280,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) ComponentUtils.set_s3_env_vars_to_component( execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" @@ -314,7 +314,7 @@ def fuzzydedup( ) 
ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) @@ -343,7 +343,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) @@ -375,7 +375,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) ComponentUtils.set_s3_env_vars_to_component( execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" From 09e56e05dea66de01a023c53978a23497723b698 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 13:06:24 -0500 Subject: [PATCH 078/105] Remove pyproject.toml dependencies Signed-off-by: Constantin M Adam --- data-processing-lib/spark/Makefile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/data-processing-lib/spark/Makefile b/data-processing-lib/spark/Makefile index d4769187b..5fde2bb07 100644 --- a/data-processing-lib/spark/Makefile +++ b/data-processing-lib/spark/Makefile @@ -11,9 +11,14 @@ setup:: set-versions: .check-env $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml - sed -e 's/"pyspark...*",/"pyspark>=${SPARK_VERSION}",/' \ - pyproject.toml > tt.toml - mv tt.toml pyproject.toml + if [ -e pyproject.toml ]; then \ + cat pyproject.toml | sed -e 's/"spark[default]==.*",/"spark[default]==$(SPARK_VERSION)",/' > tt.toml; \ + mv tt.toml pyproject.toml; \ + fi + if [ -e requirements.txt ]; then \ + cat requirements.txt | sed -e 's/ray[default]==.*/ray[default]==$(SPARK_VERSION)/' > tt.txt; \ + mv tt.txt requirements.txt; \ + fi build:: build-dist @@ -26,7 +31,7 @@ publish-dist :: .check-env .defaults.publish-dist publish-image:: .defaults.publish-image -venv:: pyproject.toml +venv:: $(MAKE) .defaults.spark-lib-src-venv pip install pytest pytest-cov From d3eac50704aa8bf032f212a0604430a3f0764cc2 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:24:30 -0500 Subject: [PATCH 079/105] Fix bug in number of actors calculation Signed-off-by: Constantin M Adam --- .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 65b7ac2f6..cd3a58b99 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -57,16 +57,18 @@ def compute_common_params( # Also, to keep S3 utilization in check, limit the number of actors to 2000 num_nodes = worker_options["replicas"] cpu_per_node = worker_options["cpu"] - 1 - 
memory_per_node = 0.85 * worker_options["memory"] + memory_per_node = worker_options["memory"] memory_per_actor = 16 # GB max_num_actors = 2000 num_actors_per_node: int = int(memory_per_node / memory_per_actor) if num_actors_per_node == 0: num_actors_per_node = 1 - num_actors = num_nodes * num_actors_per_node + # never run actors on the head node, so (n - 1) nodes to run actors + num_actors = (num_nodes - 1) * num_actors_per_node + while num_actors > max_num_actors: - num_actors -= num_nodes + num_actors -= num_nodes - 1 num_actors_per_node -= 1 print(f"Number of actors per node = {num_actors_per_node}") cpus_per_actor = cpu_per_node / num_actors_per_node From fa5959b5f90ce90e97a52288be6aee18c06b9068 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:28:39 -0500 Subject: [PATCH 080/105] Cleanup main entry point and local implementation of python transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 4 ++-- .../python/src/cluster_analysis_local_python.py | 5 +++-- .../python/src/cluster_analysis_transform.py | 10 +++++----- .../python/src/data_cleaning_local_python.py | 12 ++++++++---- ...dup_python.py => fdedup_transform_python.py} | 0 ...get_duplicate_list_transform_local_python.py | 6 ++++-- .../python/src/signature_calc_local_python.py | 17 +---------------- 7 files changed, 23 insertions(+), 31 deletions(-) rename transforms/universal/fdedup/python/src/{fuzzy_dedup_python.py => fdedup_transform_python.py} (100%) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index 280063863..071478870 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -27,8 +27,8 @@ RUN pip install --no-cache-dir -e . 
COPY src/ src/ # copy source data -COPY ./src/signature_calc_transform_python.py fdedup_transform_python.py -COPY ./src/signature_calc_local_python.py local/ +COPY ./src/fdedup_transform_python.py fdedup_transform_python.py +COPY ./src/fdedup_transform_python.py local/ # copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py index 915cdcd1e..bb785021c 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -21,7 +21,9 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, @@ -42,7 +44,6 @@ if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) # Launch python to process the input diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 412fc1fa8..a9822babe 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -140,7 +140,7 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str # Generate the docs_to_remove dataframe docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) - self.logger.info(f"{len(docs_to_remove_dataframe)} documents marked to remove") + self.logger.debug(f"{len(docs_to_remove_dataframe)} documents marked to remove") metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} return [(output_data, output_path)], metadata @@ -187,8 +187,8 @@ def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFra max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") - self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + self.logger.debug(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") cluster_stats = { "groupby_clusters": num_clusters, "cluster_duplicate_docs": sum_cdocs, @@ -226,8 +226,8 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") - self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + self.logger.debug(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") jaccard_stats = { "jaccard_clusters": num_clusters, "jaccard_duplicate_docs": sum_cdocs, diff --git 
a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py index 4295e4e82..aa4aabb90 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -23,15 +23,20 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } duplicate_location = os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", ) ) code_location = {"github": "github", "commit_hash": "12345", "path": "path"} @@ -49,7 +54,6 @@ if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) # Launch the ray actor(s) to process the input diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py similarity index 100% rename from transforms/universal/fdedup/python/src/fuzzy_dedup_python.py rename to transforms/universal/fdedup/python/src/fdedup_transform_python.py diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py index be90b3073..34b18ab04 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -21,8 +21,10 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected/cluster_analysis")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "cluster_analysis") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py index 2800c70cd..be395ed4d 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -23,18 +23,9 @@ # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "test_scdata")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = {"input_folder": input_folder, "output_folder": output_folder} 
code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), -} -s3_config = { - "input_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_data/", - "output_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_output_data/s3_test_3/", -} params = { # Data access. Only required parameters are specified @@ -47,18 +38,12 @@ "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, - # "scdata_s3_cred": ParamsUtils.convert_to_ast(s3_creds), - # "scdata_s3_config": ParamsUtils.convert_to_ast(s3_config), } if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) - - sys.argv.append("--data_s3_cred") - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) # create launcher launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) From c4f889b37e165e9c0f6243e7cf47d19b1185c521 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:30:40 -0500 Subject: [PATCH 081/105] Cleanup main entry point and local implementation of ray transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 2 +- .../fdedup/ray/src/cluster_analysis_local_ray.py | 4 +++- .../fdedup/ray/src/data_cleaning_local_ray.py | 11 ++++++++--- .../{fuzzy_dedup_ray.py => fdedup_transform_ray.py} | 2 +- .../fdedup/ray/src/signature_calc_local_ray.py | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) rename transforms/universal/fdedup/ray/src/{fuzzy_dedup_ray.py => fdedup_transform_ray.py} (97%) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 71287ced7..4bfe32a9e 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -24,7 +24,7 @@ COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source files needed by test-image -COPY --chown=ray:users ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/fdedup_transform_ray.py fdedup_transform_ray.py COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py index c078746ce..c54ba85c2 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -19,7 +19,9 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index 54fa2ccac..b951e2fc8 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -23,15 +23,20 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } duplicate_location = os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", ) ) worker_options = {"num_cpus": 0.8} diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py similarity index 97% rename from transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py rename to transforms/universal/fdedup/ray/src/fdedup_transform_ray.py index 987369714..be1bf5fcb 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from fdedup_transform_python import ServiceOrchestrator, parse_args from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py index 64f492584..cb87b56af 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py +++ 
b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -19,7 +19,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, From f3c5be0c276c228710d753b377d539aba634f95c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:32:18 -0500 Subject: [PATCH 082/105] Cleanup main entry point and local implementation of spark transforms Signed-off-by: Constantin M Adam --- ...ark.py => cluster_analysis_local_spark.py} | 30 +++++++++++---- ..._spark.py => data_cleaning_local_spark.py} | 38 +++++++++++++++---- ...dup_spark.py => fdedup_transform_spark.py} | 2 +- ...spark.py => signature_calc_local_spark.py} | 29 ++++++++++---- 4 files changed, 77 insertions(+), 22 deletions(-) rename transforms/universal/fdedup/spark/src/{cluster_analysis_spark.py => cluster_analysis_local_spark.py} (54%) rename transforms/universal/fdedup/spark/src/{data_cleaning_spark.py => data_cleaning_local_spark.py} (50%) rename transforms/universal/fdedup/spark/src/{fuzzy_dedup_spark.py => fdedup_transform_spark.py} (97%) rename transforms/universal/fdedup/spark/src/{signature_calc_spark.py => signature_calc_local_spark.py} (56%) diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py similarity index 54% rename from transforms/universal/fdedup/spark/src/cluster_analysis_spark.py rename to transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py index 83498f59e..c9950657c 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py @@ -19,14 +19,30 @@ from data_processing_spark.runtime.spark import SparkTransformLauncher +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) # Launch the spark worker(s) to process the input diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py similarity index 50% rename from transforms/universal/fdedup/spark/src/data_cleaning_spark.py rename to transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py index 7b6bd626d..9c14c67d8 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -19,14 +19,38 @@ from data_processing_spark.runtime.spark import SparkTransformLauncher +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) # Launch the spark worker(s) to process the input diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py similarity index 97% rename from transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py rename to transforms/universal/fdedup/spark/src/fdedup_transform_spark.py index 58688de42..82767f849 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py @@ -18,7 +18,7 @@ from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing_spark.runtime.spark import SparkTransformLauncher -from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from fdedup_transform_python import ServiceOrchestrator, parse_args from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/spark/src/signature_calc_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py similarity index 56% rename from transforms/universal/fdedup/spark/src/signature_calc_spark.py rename to transforms/universal/fdedup/spark/src/signature_calc_local_spark.py index 0e7046549..2db884346 100644 --- a/transforms/universal/fdedup/spark/src/signature_calc_spark.py +++ b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py @@ -21,14 +21,29 @@ ) +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) # Launch the spark worker(s) to process the input From 4941d5bab37a0bdc1e5873ce8e7288483703751f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:46:43 -0500 Subject: [PATCH 083/105] Cleanup main entry point and local implementation of spark transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Dockerfile | 6 +----- .../universal/fdedup/spark/src/data_cleaning_local_spark.py | 4 ++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile index 772dfef79..b04994d46 100644 --- a/transforms/universal/fdedup/spark/Dockerfile +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -32,11 +32,7 @@ RUN pip3 install -r requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/signature_calc_spark.py . - -# copy some of the samples in -COPY src/signature_calc_transform_spark.py fdedup_transform_spark.py -COPY src/signature_calc_spark.py local/fdedup_local_spark.py +COPY ./src/fdedup_transform_spark.py . 
# copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py index 9c14c67d8..eb1e61845 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -14,6 +14,10 @@ import sys import polars as pl +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher From 9c82fe0fb9734fb317ad8f18bfd940fe8fe361cb Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 18 Nov 2024 13:58:38 -0500 Subject: [PATCH 084/105] Added documentation for python, ray, spark and kfp Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/README.md | 19 +- transforms/universal/fdedup/kfp_ray/README.md | 14 +- transforms/universal/fdedup/python/README.md | 239 +++++++++++++++++- transforms/universal/fdedup/ray/README.md | 211 ++++------------ transforms/universal/fdedup/spark/README.md | 150 ++++------- 5 files changed, 348 insertions(+), 285 deletions(-) diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index e128566d2..fed3c1370 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -1,10 +1,11 @@ -# Fuzzy Deduplification Transform -The fdedup transforms removes documents that are very similar to each other within a set of parquet files, -per the set of -[transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: +# Fuzzy Deduplication Transform +The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a +set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. +Per the set of [transform project conventions](../../README.md#transform-project-conventions) the following runtimes are available: -* [ray](ray/README.md) - enables the running of the base python transformation -in a Ray runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image -in a kubernetes cluster using a generated `yaml` file. +* [python](python/README.md) - enables running the base transform in a pure python environment +* [ray](ray/README.md) - enables running the base python transform in a Ray runtime +* [spark](spark/README.md) - enables running the base python transform in a spark runtime +* [kfp](kfp_ray/README.md) - enables running the ray docker image in a kubernetes cluster using a generated `yaml` file. + +Please check [here](python/README.md) for a more detailed description of this transform. 
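To make the "highly similar, but not necessarily identical" criterion above concrete, here is a minimal illustrative sketch (not part of the patch) of the word-shingle/Jaccard comparison that fuzzy dedup approximates at scale with minhashes; the shingle size and the sample documents are invented for illustration:

```python
# Illustrative only: fuzzy dedup approximates this pairwise comparison at
# scale using minhashes and band signatures instead of exact shingle sets.

def word_shingles(text: str, k: int = 3) -> set[str]:
    # Build the set of k-word shingles of a document
    words = text.split()
    return {" ".join(words[i : i + k]) for i in range(len(words) - k + 1)}

def jaccard(a: set[str], b: set[str]) -> float:
    # Jaccard similarity: |A & B| / |A | B|
    return len(a & b) / len(a | b) if (a | b) else 1.0

doc1 = "the quick brown fox jumps over the lazy dog"
doc2 = "the quick brown fox leaps over the lazy dog"  # near-duplicate: one word differs
print(jaccard(word_shingles(doc1), word_shingles(doc2)))  # 0.4; identical docs yield 1.0
```

Exact pairwise comparison like this is quadratic in the number of documents, which is why the transform works with fixed-size minhash signatures and band hashes instead.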
diff --git a/transforms/universal/fdedup/kfp_ray/README.md b/transforms/universal/fdedup/kfp_ray/README.md
index 97fd45a69..75eb77a08 100644
--- a/transforms/universal/fdedup/kfp_ray/README.md
+++ b/transforms/universal/fdedup/kfp_ray/README.md
@@ -1,8 +1,8 @@
-# Fuzzy Deduplication Ray-base KubeFlow Pipeline Transformation
+# Fuzzy Deduplication Ray-based KubeFlow Pipeline Transformation

## Summary

-This project allows execution of the [noop Ray transform](../ray) as a
+This project allows execution of the [fuzzy dedup Ray transform](../ray) as a
[KubeFlow Pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/)

The detailed pipeline is presented in the
[Simplest Transform pipeline tutorial](../../../../kfp/doc/simple_transform_pipeline.md)

@@ -16,13 +16,9 @@ make workflow-build
from the directory. It creates a virtual environment (make workflow-venv) and after that compiles the pipeline
definitions in the folder. The virtual environment is created once for all transformers.

-Note: the pipelines definitions can be compiled and executed on KFPv1 and KFPv2. Meantime, KFPv1 is our default. If you
-prefer KFPv2, please do the following:
-```shell
-make clean
-export KFPv2=1
-make workflow-build
-```
+## Considerations
+Currently, fuzzy dedup KFP pipeline definitions can be compiled and executed only on KFPv1. KFPv2 is not
+yet supported, because of this issue: https://github.com/kubeflow/pipelines/issues/10914

The next steps are described in [Deploying a pipeline](../../../../kfp/doc/simple_transform_pipeline.md#deploying-a-pipeline-)
and [Executing pipeline and watching execution results](../../../../kfp/doc/simple_transform_pipeline.md#executing-pipeline-and-watching-execution-results-)
\ No newline at end of file
diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md
index 34f18c73b..d2d940344 100644
--- a/transforms/universal/fdedup/python/README.md
+++ b/transforms/universal/fdedup/python/README.md
@@ -5,7 +5,240 @@ Please see the set of
for details on general project conventions, transform configuration,
testing and IDE set up.

-## Summary
+## Contributors
+- Nelson Bore (kibnelson@gmail.com)
+- Constantin Adam (cmadam@us.ibm.com)

-The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see
-[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details.
\ No newline at end of file
+## Description
+The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a
+set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries.
+
+Fuzzy dedup is a complex process made up of a pipeline that performs four main steps:
+
+1. **Signature Calculation**: creates a set of minhashes for each document, and uses them to create band signatures for
+the document.
+2. **Cluster Analysis**: groups documents into clusters based on matching band signatures. Within each cluster, it
+retains only the documents that have a Jaccard similarity above a specified threshold, and it identifies which documents
+to keep as unique and which ones to mark as duplicates.
+3. **Duplicate List Generation**: combines the similarity clusters identified in each band to create a single, unified
+list of duplicate documents.
+4. **Data Cleaning**: processes the documents by either filtering out duplicates or adding annotations to distinguish
+duplicates from non-duplicates.
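The banding scheme in steps 1 and 2 trades off precision and recall through the number of bands and the number of minhashes per band; the Signature Calculation section below describes the probability function that a notebook in the `utils` folder plots. As a minimal sketch of that standard LSH estimate (see the Mining of Massive Datasets reference cited in this README), the values b = 14 and r = 8 below match the 112-permutation sample configurations used elsewhere in this patch and are only examples:

```python
# LSH banding estimate: probability that two documents with Jaccard
# similarity s share at least one of b band signatures, where each band
# consists of r minhashes. One band matches only if all r minhashes agree.

def candidate_probability(s: float, b: int = 14, r: int = 8) -> float:
    return 1.0 - (1.0 - s**r) ** b  # 1 - P(every band fails to match)

for s in (0.5, 0.7, 0.8, 0.9):
    print(f"similarity {s}: candidate probability {candidate_probability(s):.3f}")
```

Pairs that share at least one band signature only become candidates; the Cluster Analysis step still checks the actual Jaccard similarity against the configured threshold before marking anything as a duplicate.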
+ +Each one of these steps is described in more detail below. + +### Signature Calculation + +This transform computes `num_permutations` minhashes and `num_bands` signatures for each document in the dataset, by +following these processing steps: +1. **Shingle Generation**: create a set of character or word shingles, using a specified window length. Character +shingles are more effective at detecting similar documents, but require more computational resources compared to word +shingles. +2. **Minhash Calculation**: using the shingles as input, compute `num_permutations` minhashes for each document. +3. **Band Signature Calculation**: divide the minhashes into `num_bands`, where each band contains +`num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. + +The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard +similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils) folder generates a graph of this +probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact +the deduplication process. + +To help distribute the workload and speed up processing of the next steps, the hash space of each band is divided into +`num_segments` segments. The band signatures, the minhashes, the document ids, and lengths are stored in an organized +output folder structure `bands/band=b/segment=s`, where `b` is the band number and `s` is the segment number. + +### Cluster Analysis + +This transform leverages segmented processing to analyze the data generated by the **Signature Calculation** step +efficiently and in parallel. Each worker processes a specific segment `s` of a band `b` by loading and analyzing all +Parquet files from the folder `bands/band=b/segment=s`. Each row in the Parquet files contains, for a document: +* `band_hash`, the document's band signature, and +* `data`, a structure with three fields: the unique `document_id`, document's `minhashes`, and `document_size`. + +The transform runs the following processing steps: +1. **Data Loading**: combine into a single dataframe all Parquet files in `bands/band=b/segment=s`. +2. **Clustering**: run a `group_by` operation on the `band_hash` column that will group documents with the same band +signature into clusters. +3. **Similarity Analysis**: for each cluster, calculate Jaccard similarity between pairs of documents using their +minhashes, and move documents below the specified Jaccard similarity threshold into new clusters. +4. **Duplicate Identification**: in clusters with more than one document remaining, retain the largest document with the +smallest document id, and mark as duplicates all other documents in the cluster. +5. **Persist Results**: save the duplicate clusters in a file. + +### Duplicate List Generation + +The **Cluster Analysis** step identifies duplicates across multiple bands, meaning a document can be marked as a +duplicate in one or more bands (e.g., if two documents are identical, one will be marked as a duplicate in all bands). +This transform consolidates all duplicate information from each band segment into a single file, providing a unified +record of duplicates detected across the dataset. + +### Data Cleaning + +This transform processes the original dataset using the list of duplicate documents generated by the **Duplicate List +Generation** step. It imports each file in the original dataset into a table and produces a new dataset. 
The directory +structure of the input dataset is preserved, but the contents of the output files depend on the selected operating mode: +1. **Annotate** - add a new `duplicate` column to the dataset, that contains a `d` for documents marked as duplicates, +and is empty for non-duplicates +2. **Filter duplicates** - removes all documents identified as duplicates from the dataset. +3. **Filter non-duplicates** - removes from the dataset all documents that were not marked as duplicates, leaving only +the duplicates. + +The output dataset reflects the selected mode, providing flexibility for downstream processing. + +## Input Columns Used by This Transform + +| Input Column Name | Data Type | Description | +|---------------------------------------------------------------------|-----------|----------------------------------| +| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | +| Column specified by the _document_id_column_ configuration argument | int64 | Column that stores document ID | + +## Output Columns Annotated by This Transform +| Output Column Name | Data Type | Description | +|------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| duplicate | str | Column added if fuzzy dedup runs in 'annotate' mode. Value is 'd' for duplicate documents, empty for non-duplicates | + +## Configuration and Usage +### Fuzzy Deduplication Transform +The set of dictionary keys holding [Fuzzy Dedup](src/fdedup_transform_python.py) configuration for values are as +follows: +```text +--input_folder INPUT_FOLDER + Input folder path +--output_folder OUTPUT_FOLDER + Output folder path +--operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents +--contents_column CONTENTS_COLUMN + name of the column that stores document text +--document_id_column DOCUMENT_ID_COLUMN + name of the column that stores document ID +--seed SEED seed of the random number generator +--num_permutations NUM_PERMUTATIONS + number of permutations to use for minhash calculation +--num_bands NUM_BANDS + number of bands to use for band hash calculation +--num_minhashes_per_band NUM_MINHASHES_PER_BAND + number of minhashes to use in each band +--word_shingle_size WORD_SHINGLE_SIZE + number of words included in one shingle +--jaccard_similarity_threshold JACCARD_SIMILARITY_THRESHOLD + jaccard similarity threshold above which two documents are similar +--num_segments NUM_SEGMENTS + the number of segments dividing the hashing space for each band (for scalability) +--duplicate_list_location DUPLICATE_LIST_LOCATION + path to the file with all the duplicate document ids +--services SERVICES Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning) +--use_s3 USE_S3 use s3 +--s3_cred S3_CRED ast string of options for s3 credentials +--shingle_option SHINGLE_OPTION + Option used for shingling + +``` + +### Signature Calculation Transform +The set of dictionary keys holding [SignatureCalcTransform](src/signature_calc_transform.py) configuration for values +are as follows: +```text +--minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document +--minhash_contents_column MINHASH_CONTENTS_COLUMN + name of the column storing the contents of each document +--minhash_seed 
MINHASH_SEED + the seed used to instantiate the random number generator +--minhash_num_permutations MINHASH_NUM_PERMUTATIONS + number of permutations (minhashes) calculated for each document +--minhash_word_shingle_size MINHASH_WORD_SHINGLE_SIZE + the size of the word shingles calculated for each document +--minhash_num_bands MINHASH_NUM_BANDS + the number of bands to use in the banding technique +--minhash_num_minhashes_per_band MINHASH_NUM_MINHASHES_PER_BAND + the number of minhashes to use in each band +--minhash_num_segments MINHASH_NUM_SEGMENTS + the number of segments across which we divide the hashing space for each band +--minhash_shingle_option MINHASH_SHINGLE_OPTION + Shingling option ('word' or 'char') +``` + +### Cluster Analysis Transform +The set of dictionary keys holding [ClusterAnalysisTransform](src/cluster_analysis_transform.py) configuration for values +are as follows: +```text +--cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD + Jaccard similarity threshold above which two documents are duplicates +--cluster_num_bands CLUSTER_NUM_BANDS + The number of bands used in the banding technique +--cluster_num_segments CLUSTER_NUM_SEGMENTS + The number of segments dividing the hashing space for each band +``` + +### Get Duplicates List Transform +This transform currently has no configuration parameters. + +### Data Cleaning Transform +The set of dictionary keys holding [DataCleaningTransform](src/data_cleaning_transform.py) configuration for values +are as follows: +```text + --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document + --fdclean_operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents +``` + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_python.py +``` +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_python.py +python cluster_analysis_local_python.py +python get_duplicate_list_local_python.py +python data_cleaning_local_python.py +``` +After running the transforms, execute: +```shell +ls output +``` +To see results of the transform. + +### Code example + +TBD (link to the notebook will be provided) + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a pure python runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + +## Further Resources +The following is a list of references to research articles and github repositories that inspired the module's design: + +1. 
[Jure Leskovec, Anand Rajaraman, Jeff Ullman, Mining of Massive Datasets, Chapter 3: Finding Similar Items](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) +2. [G Penedo et al., The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/pdf/2406.17557) +3. [Datatrove github repo](https://github.com/huggingface/datatrove) diff --git a/transforms/universal/fdedup/ray/README.md b/transforms/universal/fdedup/ray/README.md index 41be44301..d93be3a4a 100644 --- a/transforms/universal/fdedup/ray/README.md +++ b/transforms/universal/fdedup/ray/README.md @@ -1,185 +1,45 @@ # Fuzzy Dedup -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. +Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +configuration, testing and IDE set up. ## Summary -The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see -[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details. The architecture of the implementation is presented here: +This project wraps the [Fuzzy Dedup transform](../python) with a Ray runtime. -![](images/fuzzy.png) +## Configuration and command line Options -The main components of implementation are driver, processors (implemented as actor pools) - table processor, table -filter and bucket hash processor, and hash actors - minhash, buckets and docs. - -The complication of mapping this model to transform model is the fact that in this model assumes a two pass processing, -while a transform model is a single pass. The solution to this mismatch is to use transform runtime to implement the -first path and use the native transform pipeline to implement filtering. - -## Transform runtime -The [transform runtime](src/fdedup_transform_ray.py) is implementing complete first path of the fuzzy deduping: -* creates bucket and minhash collectors -* implements initial file processing to populate bucket and minhash caches -* creates doc collectors -* implement bucket processing -* Clean up everything except for doc collectors in preparation to filter, that is implemented by the framework proper -The main components of runtime are described below - -### TableProcessor Actor - -[Table processing actor](src/fdedup_transform_ray.py) is implemented following framework itself is implemented as a pair - -`FdedupTransform` implementing the actual transformation and and -[transform table processor](../../../../data-processing-lib/src/data_processing/runtime/ray/transform_table_processor.py) -(from the framework itself). - -### DocsMinHash Actor - -This [actor](src/fdedup_support.py) stores MInHashes - -### BucketsHash Actor - -This actor [actor](src/fdedup_support.py) - -### BucketHashProcessor - -BucketHash [actor](src/fdedup_support.py) implement the actual buckets processing, removing duplicates. -Implementation of this actor allows to better manage this "expensive" process, by using Actor pool load balancing -thus minimizing overall time for this operation. Instead of pre partitioning buckets, it is using dynamic load -partitioning. We also are processing "longest" buckets first thus further improving performance. 
To further improve -the overall performance we can in future implement bucket splitting - its faster to process more smaller buckets -then the long ones - -### BucketHashProcessor - -This [actor](src/fdedup_support.py) is queueing up requests to the `BucketHashProcessor` actor pool, which load -balances their execution - -### DocCollector Actor - -This [actor](src/fdedup_support.py) is a collector for unique documents - -## Transformer - -In the fuzzy dedup implementation, the [transformer](src/fdedup_transform_ray.py) only implements filtering. For every -table, it checks document ids with the `DocumentsCollector` cache and removes all of the rows which do not have ids in -the hash - -## Snapshotting - -Fuzzy dedup often runs on very large data sets and implements three very distinct phases: -* Building buckets -* Processing buckets -* Filtering data -To improve recoverability of fuzzy dedup, current implementation includes snapshotting - at the end of the first two -phases we snapshot the current state of execution - bucket and minhash actors after the first phase and document actors -after the second. This snapshotting provide code with the ability to restart from the existing snapshot. You can use one -of two configuration flags (assuming snapshots exist): -* `use_bucket_snapshot` to start from the second phase -* `use_doc_snapshot` to start from the third phase - -## Building - -A [docker file](Dockerfile) that can be used for building docker image. You can use - -```shell -make build to build it -``` - -### Configuration and command line Options - -The set of dictionary keys holding [BlockListTransform](src/blocklist_transform.py) -configuration for values are as follows: - -* _bucket_cpu_ - specifies number of CPUs for bucket actor -* _doc_cpu_ - specifies number of CPUs for doc actor -* _mhash_cpu_ - specifies number of CPUs for minhash actor -* _num_doc_actors_ - specifies number of doc actors -* _num_bucket_actors_ - specifies number of bucket actors -* _num_minhash_actors_ - specifies number of minhash actors -* _num_preprocessors_ - specifies number of preprocessors -* _num_permutations_ - specifies number of permutations -* _threshold_ - specifies threshold -* _shingles_size_ - specifies shingles size -* _japanese_data_ - specifies whether to use japanese specific document splitting -* _delimiters_ - specifies delimiter for non japanese document splitting -* _snapshot_delay_ - delay between different actors reading/writing snapshot not to overwhelm storage -* -use_bucket_snapshot_ - run from the existing buckets snapshot (bypass building buckets) -* -use_doc_snapshot_ - run from the existing docs snapshot (bypass building and processing buckets) - -Above you see both parameters and their values for small runs (tens of files). We also provide an -[estimate](src/cluster_estimator.py) to roughly determine cluster size for running transformer. +Fuzzy Dedup configuration and command line options are the same as for the base python transform. ## Running - - -### Launched Command Line Options +### Launched Command Line Options When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). 
- -```shell - --fdedup_doc_column FDEDUP_DOC_COLUMN - document column name - --fdedup_id_column FDEDUP_ID_COLUMN - integer document id column name - --fdedup_cluster_column FDEDUP_CLUSTER_COLUMN - cluster column name - --fdedup_bucket_cpu FDEDUP_BUCKET_CPU - number of CPUs per bucket hash - --fdedup_mhash_cpu FDEDUP_MHASH_CPU - number of CPUs per minhash hash - --fdedup_doc_cpu FDEDUP_DOC_CPU - number of CPUs per doc hash - --fdedup_num_doc_actors FDEDUP_NUM_DOC_ACTORS - number of doc actors to use - --fdedup_num_minhash_actors FDEDUP_NUM_MINHASH_ACTORS - number of minhash actors to use - --fdedup_num_bucket_actors FDEDUP_NUM_BUCKET_ACTORS - number of bucket actors to use - --fdedup_num_preprocessors FDEDUP_NUM_PREPROCESSORS - number of preprocessors to use - --fdedup_num_permutations FDEDUP_NUM_PERMUTATIONS - number of permutations - --fdedup_threshold FDEDUP_THRESHOLD - threshold - --fdedup_shingles_size FDEDUP_SHINGLES_SIZE - number of words in shingle - --fdedup_delimiters FDEDUP_DELIMITERS - delimiter for splitting document - --fdedup_snapshot_delay FDEDUP_SNAPSHOT_DELAY - snapshot delay time - --fdedup_use_bucket_snapshot FDEDUP_USE_BUCKET_SNAPSHOT - flag to continue with bucket snapshot - --fdedup_use_doc_snapshot FDEDUP_USE_DOC_SNAPSHOT - flag to continue with doc snapshot - --fdedup_random_delay_limit FDEDUP_RANDOM_DELAY_LIMIT - maximum delay between read -``` - -These correspond to the configuration keys described above. +In addition to those available to the transform as defined in [here](../python/README.md), +the set of +[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. ### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/fdedup_transform_ray.py using command line args -* `run-local-sample` - runs src/fdedup_local_ray.py -* `run-s3-sample` - runs src/fdedup_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. +To run the samples, use the following `make` target to create a virtual environment: -For example, -```shell -make run-cli-sample -... +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_ray.py ``` -Then +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_ray.py +python cluster_analysis_local_ray.py +python get_duplicate_list_local_ray.py +python data_cleaning_local_ray.py +``` +After running the transforms, execute: ```shell ls output ``` @@ -190,3 +50,18 @@ To see results of the transform. To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a ray runtime, use the following `make` targets. 
To launch integration tests
+for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data
+cleaning) use:
+```commandline
+make test-src
+```
+
+To test the creation of the Docker image for the fuzzy dedup transform and the capability to run a local program inside that
+image, use:
+```commandline
+make test-image
+```
\ No newline at end of file
diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md
index 3bf9b3245..dd0294aed 100644
--- a/transforms/universal/fdedup/spark/README.md
+++ b/transforms/universal/fdedup/spark/README.md
@@ -1,109 +1,67 @@
-# Spark-GUF
+# Fuzzy Dedup

-This is an implementation of Spark data processing modules. At a high level, every Spark application consists of a driver program that runs the user’s main function and executes various parallel operations on a cluster.
+Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform
+configuration, testing and IDE set up.

-The modules can run locally or remotely in a Kubernetes cluster.
+## Summary

-## Running Transforms locally
+This project wraps the [Fuzzy Dedup transform](../python) with a Spark runtime.

-Start in the `spark-guf` directory. To run the modules locally, follow these steps:
-1. Create a virtual environment using this command
-   ```
-   make venv
-   ```
-2. Activate the virtual environment:
-   ```
-   source venv/bin/activate
-   ```
+## Configuration and command line Options

-3. Set the `PYTHONPATH` environment variable to include the `src` directory:
-   ```
-   export PYTHONPATH=${PYTHONPATH}:${PWD}/src
-   ```
-4. Invoke one of the transforms:
-   ```
-   python src/transforms/spark_pi/spark_transformer_pi.py
-   ```
-5. To find out which arguments a transform takes, run that transform with a `--help` flag:
-   ```
-   python src/transforms/spark_filter/spark_filter_transform.py --help
-   usage: spark_filter_transform.py [-h] --input_folder INPUT_FOLDER --output_folder OUTPUT_FOLDER [--data_type DATA_TYPE]
-                                    --filter_criteria_list FILTER_CRITERIA_LIST [--filter_columns_to_drop FILTER_COLUMNS_TO_DROP]
-                                    [--filter_logical_operator {AND,OR}]
+Fuzzy Dedup configuration and command line options are the same as for the base python transform.

-   optional arguments:
-     -h, --help            show this help message and exit
-     --input_folder INPUT_FOLDER
-                           path to read the input files (local fs or s3)
-     --output_folder OUTPUT_FOLDER
-                           path to write the output files (local fs or s3)
-     --data_type DATA_TYPE
-                           Type of files to filter (parquet, orc, csv, json, txt)
-     --filter_criteria_list FILTER_CRITERIA_LIST
-                           list of filter criteria (in SQL WHERE clause format), for example: [ "docq_total_words > 100 AND docq_total_words < 200", "docq_perplex_score < 230", "date_acquired BETWEEN '2023-07-04'
-                           AND '2023-07-08'", "title LIKE 'https://%'", "document_id IN ('doc-id-1', 'doc-id-2', 'doc-id-3')" ]
-     --filter_columns_to_drop FILTER_COLUMNS_TO_DROP
-                           list of columns to drop after filtering, for example: ["column1", "column2"]
-     --filter_logical_operator {AND,OR}
-                           logical operator (AND or OR) that joins filter criteria
-   ```
+## Running
+### Launched Command Line Options
+When running the transform with the Spark launcher (i.e. TransformLauncher),
+the command line options available are those defined [here](../python/README.md) for the transform,
+together with the set of
+[Spark launcher options](../../../../data-processing-lib/doc/spark-launcher-options.md).
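+
+For example, once the virtual environment described below has been created and activated, an invocation might
+look like this (an illustrative sketch only: the paths are placeholders, and the top-level arguments are assumed
+to mirror those of the base python orchestrator, such as `--num_permutations`):
+```commandline
+cd src
+python fdedup_transform_spark.py \
+    --input_folder ../test-data/input \
+    --output_folder ../output \
+    --contents_column contents \
+    --num_permutations 112
+```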
-## Running Transforms in Kubernetes/OpenShift
+### Running the samples
+To run the samples, use the following `make` target to create a virtual environment:

-Start in the `spark-guf` directory. To run the transforms in a Kubernetes or OpenShift cluster, follow these steps:
+```commandline
+make venv
+```
+Subsequently, the main orchestration program can be run with:
+```commandline
+source venv/bin/activate
+cd src
+python fdedup_transform_spark.py
+```
+Alternatively, the transforms included in fuzzy dedup can be launched independently:
+```commandline
+source venv/bin/activate
+cd src
+python signature_calc_local_spark.py
+python cluster_analysis_local_spark.py
+python get_duplicate_list_local_spark.py
+python data_cleaning_local_spark.py
+```
+After running the transforms, execute:
+```shell
+ls output
+```
+to see the results of the transform.

-1. Build and push a pyspark base docker image (this example assumes that images are pushed to the Docker hub, but same approach can be used to push images to icr.io, or quai.io:
-   ```
-   docker build -t my-docker-username/my-pyspark:3.5.1 .
-   docker push my-docker-username/my-pyspark:3.5.1
-   ```
-2. Build and push a specific transform image (this will use the pyspark built in the previous point as the base image):
-   ```
-   docker build -t my-docker-username/my-pyspark-filter:3.5.1 -f src/transforms/spark_filter/Dockerfile --build-arg BASE_IMAGE=my-docker-username/my-pyspark:3.5.1 .
-   docker push my-docker-username/my-pyspark-filter:3.5.1
-   ```
+### Transforming data using the transform image

-3. Configure the `spark` service account (note that you can use any other service account name, but you will need then to replace `spark` with `your-service-account-name` in all the yaml files listed below). This is a one-time process to perform for each namespace where you want to run spark apps:
-   ```
-   # create 'spark' service account
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-serviceaccount.yaml --namespace=my-namespace
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.

-   # create 'spark' role
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role.yaml --namespace=my-namespace
+## Testing

-   # bind the 'spark' service account to the 'spark' role
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role-binding.yaml --namespace=my-namespace
+For testing fuzzy deduplication in a Spark runtime, use the following `make` targets. To launch integration tests
+for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data
+cleaning) use:
+```commandline
+make test-src
+```

-   # bind the 'spark' service account to the cluster roles
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-edit-role-binding.yaml --namespace=my-namespace
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-cluster-role-binding.yaml --namespace=my-namespace
-   ```
-
- 4. Create any secrets that are needed to access S3 folders used for input or output of the transforms. Follow [this link](https://github.com/aws-samples/machine-learning-using-k8s/blob/master/docs/aws-creds-secret.md) for more information on how to build the S3 secrets.
-
- 5. Edit a pod yaml file from the `deployment/kubernetes/pods` directory.
-    The steps below refer to the [yaml file used to build the filter pod](deployment/kubernetes/pods/spark-driver-pod-filter.yaml):
-    1. Give a name to the pod (`metadata/name`), the container launched inside the pod (`spec/containers/name`), and the Spark application (the `APP_NAME` variable in `spec/containers/env`).
-    2. Specify the namespace where the pod will be created (`metadata/namespace`). Use the same namespace for the `EXECUTOR_NAMESPACE` variable in `spec/containers/env`)
-    3. Specify the command to launch the Spark application (in `spec/containers/args`)
-    4. Specify the image used by the driver (`spec/containers/image` - usually this is the transform image built under point 2).
-    5. Specify the image used by the executors (`EXECUTOR_DOCKER_IMAGE` variable in `spec/containers/env`)
-    6. Specify the service account to use by the driver (`spec/containers/serviceAccount`) and by the executors(the `SERVICE_ACCOUNT` variable in `spec/containers/env`)
-    7. Configure S3:
-        1. Specify the input (`AWS_ENDPOINT_URL_IN`) and output (`AWS_ENDPOINT_URL_OUT`) endpoint URLs.
-        2. Specify the input and out access key ids and secret access keys.
-
-6. Launch the Spark application by creating the driver pod:
-   ```
-   kubectl apply -f deployment/kubernetes/pod/spark-driver-pod-filter.yaml
-   ```
-
-7. Monitor the creation of the executor pods:
-   ```
-   kubectl get pods -w
-   ```
-
-8. Monitor the driver logs:
-   ```
-   kubectl logs spark-driver-pod-filter -f
-   ```
-   ```
+To test the creation of the Docker image for the fuzzy dedup transform and the capability to run a local program inside that
+image, use:
+```commandline
+make test-image
+```
\ No newline at end of file
From ed4e9c1f8cfb77084d095d99200b68355cc059f4 Mon Sep 17 00:00:00 2001
From: Shahrokh Daijavad
Date: Mon, 18 Nov 2024 16:40:37 -0800
Subject: [PATCH 085/105] Update README.md

utils folder is one level up from the python folder
---
 transforms/universal/fdedup/python/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md
index d2d940344..295862221 100644
--- a/transforms/universal/fdedup/python/README.md
+++ b/transforms/universal/fdedup/python/README.md
@@ -39,7 +39,7 @@ shingles.
 `num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. The values for
 `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard
-similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils) folder generates a graph of this
+similarity will be marked as duplicates. A Jupyter notebook in the [utils](../utils) folder generates a graph of this
 probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact
 the deduplication process.
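+
+As a quick illustration (a minimal sketch of that probability function, not code from this repository), the
+probability that two documents with Jaccard similarity `s` share at least one band signature is
+`1 - (1 - s**r)**b`, where `b` is `num_bands` and `r` is `num_minhashes_per_band`:
+
+```python
+def duplicate_probability(s: float, b: int, r: int) -> float:
+    # A band matches only when all r of its minhashes agree, which happens with probability s**r;
+    # the pair becomes a duplicate candidate when at least one of the b bands matches.
+    return 1.0 - (1.0 - s**r) ** b
+
+# For example, with b=14 bands of r=8 minhashes each (112 permutations in total):
+print(duplicate_probability(0.8, 14, 8))  # ~0.92: near-duplicates are very likely flagged
+print(duplicate_probability(0.5, 14, 8))  # ~0.05: dissimilar documents rarely collide
+```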
From fb5601a7eefa66236b9d2b42edbebc476b509606 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Tue, 19 Nov 2024 14:28:42 -0500 Subject: [PATCH 086/105] Code cleanup and bug fixes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 3 -- .../python/src/cluster_analysis_transform.py | 43 ++++++++--------- .../python/src/data_cleaning_transform.py | 12 ++--- .../python/src/fdedup_transform_python.py | 29 +++++++++-- .../src/get_duplicate_list_transform.py | 23 +++------ .../python/src/signature_calc_transform.py | 48 +++++++++---------- .../src/signature_calc_transform_python.py | 2 +- 7 files changed, 82 insertions(+), 78 deletions(-) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index 071478870..79c85e4ac 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -23,9 +23,6 @@ COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . -# copy source data -COPY src/ src/ - # copy source data COPY ./src/fdedup_transform_python.py fdedup_transform_python.py COPY ./src/fdedup_transform_python.py local/ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index a9822babe..16febc0dc 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -13,13 +13,17 @@ import os import re from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple +from typing import Any, List import numpy as np import polars as pl -import pyarrow as pa from data_processing.transform import AbstractFolderTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger +from data_processing.utils import ( + CLIArgumentProvider, + TransformUtils, + UnrecoverableException, + get_logger, +) from Murmur_MH import Murmur_MH @@ -86,7 +90,7 @@ class ClusterAnalysisTransform(AbstractFolderTransform): to keep (the largest size document), and mark the other documents as duplicates. The resulting clusters are saved in a file for further analysis. 
- Args: + The following internal variables are initialized from the config parameter: num_bands: number of bands used in the banding technique jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates num_segments: the number of segments dividing the hashing space for each band @@ -106,12 +110,14 @@ def __init__(self, config: dict[str, Any]): ) self.sort_output = config.get(sort_output_key, sort_output_default) self.data_access = config.get("data_access") + if self.data_access is None: + raise UnrecoverableException("Could not get a pointer to the data access object inside the transform.") self.logger = get_logger(__name__) def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: self.logger.info(f"Cluster analysis for folder {folder_name}") metadata = {} - input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( path=input_folder, extensions=[".parquet"], @@ -125,17 +131,17 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str segment = int(match.group(2)) else: raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") - output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_folder = TransformUtils.clean_path(self.data_access.output_folder) output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") # consolidate into a single data frame band hashes computed by workers - band_segment_dataframe, consolidation_stats = self.consolidate_band_segment_files(files) + band_segment_dataframe, consolidation_stats = self._consolidate_band_segment_files(files) metadata |= consolidation_stats # cluster grouping by band hashes - cluster_dataframe, cluster_stats = self.get_clusters(band_segment_dataframe) + cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe) metadata |= cluster_stats # cluster analysis using jaccard similarity - jaccard_cluster_dataframe, jaccard_stats = self.analyze_clusters(cluster_dataframe) + jaccard_cluster_dataframe, jaccard_stats = self._analyze_clusters(cluster_dataframe) metadata |= jaccard_stats # Generate the docs_to_remove dataframe docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") @@ -144,14 +150,7 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} return [(output_data, output_path)], metadata - def sanitize_folder_name(self, folder_name: str) -> str: - if "://" in folder_name: - _, folder_name = folder_name.split("://") - if folder_name[-1] != "/": - folder_name = f"{folder_name}/" - return folder_name - - def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + def _consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: band_segment_dataframe = pl.DataFrame() total_input_rows = 0 for fname, contents in files.items(): @@ -170,7 +169,7 @@ def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.Da } return band_segment_dataframe, consolidation_stats - def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + def _get_clusters(self, band_segment_dataframe: pl.DataFrame) -> 
tuple[pl.DataFrame, dict[str, Any]]: groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( pl.col("cluster_length") > 1 @@ -195,14 +194,14 @@ def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFra } return cluster_dataframe, cluster_stats - def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + def _analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: # Define the schema with specific data types schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} doc_ids_lists = [] docs_to_remove_lists = [] len_of_docs2remove_lists = [] for row in df.iter_rows(named=True): - doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self.jaccard_distance_calculation(row) + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self._jaccard_distance_calculation(row) doc_ids_lists += doc_ids_list docs_to_remove_lists += docs_to_remove_list len_of_docs2remove_lists += len_of_docs2remove_list @@ -236,7 +235,7 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc") return filtered_jaccard_dataframe, jaccard_stats - def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + def _jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: # Process row and return a new list of Series or a new row threshold = self.jaccard_similarity_threshold doc_ids_list = [] @@ -321,7 +320,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: f"--{sort_output_cli_param}", type=bool, default=sort_output_default, - help="Sort", + help="Sort the similarity clusters by the document ID of the kept doc (used primarily for testing)", ) def apply_input_params(self, args: Namespace) -> bool: diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 74597068c..3403bfc42 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -12,14 +12,13 @@ import io import os from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple +from typing import Any -import numpy as np import polars as pl import pyarrow as pa from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger +from data_processing.utils import CLIArgumentProvider, get_logger short_name = "fdclean" @@ -69,8 +68,9 @@ class DataCleaningTransform(AbstractTableTransform): keeps the directory structure of the input dataset, but has all the fuzzy duplicates removed. 
- Args: - duplicate_location: location (local or s3) of the duplicate document list + The following internal variables are initialized from the config dictionary: + duplicate_list_location: location (local or s3) of the duplicate document list + operation_mode: one of annotate, filter_duplicates, or filter_non_duplicates """ def __init__(self, config: dict[str, Any]): @@ -90,7 +90,7 @@ def __init__(self, config: dict[str, Any]): self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug(f"Transforming table with {table.num_rows} rows from file {file_name}") input_df = pl.from_arrow(table) # handle the case when the doc_id columns in the input dataframe and the # docs_to_remove_df have different types, i.e. one is int32 and the diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index b77f44401..166e48e26 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -115,17 +115,38 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) sys_argv.append("--data_s3_cred") sys_argv.append(s3_cred_ast) + if service_name == "minhash": + sys_argv.append("--scdata_s3_cred") + sys_argv.append(s3_cred_ast) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_cred") + sys_argv.append(s3_cred_ast) elif ( s3_creds.get("access_key") is not None and s3_creds.get("secret_key") is not None and s3_creds.get("url") is not None ): + ast_s3_cred = ParamsUtils.convert_to_ast(s3_creds) sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + sys_argv.append(ast_s3_cred) + if service_name == "minhash": + sys_argv.append("--scdata_s3_cred") + sys_argv.append(ast_s3_cred) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_cred") + sys_argv.append(ast_s3_cred) sys_argv.append("--data_s3_config") else: sys_argv.append("--data_local_config") - sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + ast_data_io = ParamsUtils.convert_to_ast(data_io) + sys_argv.append(ast_data_io) + if in_args.use_s3: + if service_name == "minhash": + sys_argv.append("--scdata_s3_config") + sys_argv.append(ast_data_io) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_config") + sys_argv.append(ast_data_io) return sys_argv def execute_service(self, service_short_name: str, params: list) -> int: @@ -163,9 +184,9 @@ def parse_args() -> argparse.Namespace: "--contents_column", type=str, required=False, help="name of the column that stores document text" ) parser.add_argument( - "--document_id_column", type=str, required=False, help="name of the column that stores document text" + "--document_id_column", type=str, required=False, help="name of the column that stores document ID" ) - parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument("--seed", type=int, required=False, help="seed of the random number generator") parser.add_argument( "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation" ) diff --git 
a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py index c49124cf1..c14c4bdce 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py @@ -11,16 +11,12 @@ ################################################################################ import io import os -import re from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple +from typing import Any -import numpy as np import polars as pl -import pyarrow as pa from data_processing.transform import AbstractFolderTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger -from Murmur_MH import Murmur_MH short_name = "fdlist" @@ -61,7 +57,7 @@ class GetDuplicateListTransform(AbstractFolderTransform): This is an intermediate step of the fuzzy dedup pipeline. It runs in a single location and consolidates in a single file all the duplicates found for each band segment. - Args: + These internal variables are initialized from the config dictionary: subfolder: name of the subfolder with the duplicate records consolidated_filename: name of the file with the consolidated list of duplicates """ @@ -82,7 +78,7 @@ def __init__(self, config: dict[str, Any]): def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: self.logger.info(f"Get Duplicate List for folder {folder_name}") metadata = {} - input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( path=input_folder, extensions=[".parquet"], @@ -90,24 +86,17 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str ) if retries > 0: metadata |= {"data_access_retries": retries} - output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_folder = TransformUtils.clean_path(self.data_access.output_folder) output_path = os.path.join(output_folder, self.consolidated_filename) # consolidate into a single data frame band hashes computed by workers - consolidated_dataframe, consolidation_stats = self.consolidate_docs_to_remove_files(files) + consolidated_dataframe, consolidation_stats = self._consolidate_docs_to_remove_files(files) self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") metadata |= consolidation_stats output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) return [(output_data, output_path)], metadata - def sanitize_folder_name(self, folder_name: str) -> str: - if "://" in folder_name: - _, folder_name = folder_name.split("://") - if folder_name[-1] != "/": - folder_name = f"{folder_name}/" - return folder_name - - def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + def _consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: consolidated_dataframe = pl.DataFrame() total_input_rows = 0 for fname, contents in files.items(): diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 6b14e1ba0..4e64bcb5a 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ 
b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -14,7 +14,7 @@ import unicodedata from argparse import ArgumentParser, Namespace from pathlib import Path -from typing import Any, List +from typing import Any import mmh3 import numpy as np @@ -22,7 +22,7 @@ import pyarrow as pa from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider +from data_processing.utils import CLIArgumentProvider, UnrecoverableException from Murmur_MH import Murmur_MH @@ -129,16 +129,13 @@ class SignatureCalculationTransform(AbstractTableTransform): """ This is the first transform of the fuzzy dedup pipeline. First, it calculates, for each document in a dataset, `num_permutations` minhashes. It accepts as - input the number of bands and the length of each band. If those two parameters - are not specified, then, based on the values of `jaccard_similarity_threshold` - and `num_permutations`, it determines the optimal number of bands, and the - length of each band (how many minhashes will be used to get the signature for - each band). The band signatures, the minhashes and the document lengths are + input the number of bands and the length (number of minhashes used for) each + band. The band signatures, the minhashes and the document lengths are then saved in the output folder, under a folder structure `bands/band=b/segment=s`. To improve scalability of the next step of fuzzy dedup, the hash space of each band is divided into `num_segments` segments. - Args: + The following internal variables are retrieved from the config parameter: document_id_column: name of the column storing the unique ID assigned to each document contents_column_cli_param: name of the column storing the contents of each document seed: the seed used to instantiate the random number generator @@ -171,21 +168,22 @@ def __init__(self, config: dict[str, Any]): self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) self.shingle_option = config.get(shingle_option_key, shingle_option_default) # use this dataframe to store the minhashes and size for each document - self.all_minhashes: pl.DataFrame = None + self.all_minhashes = None # use this dataframe to store the band hashes for each document - self.all_band_hashes: pl.DataFrame = None + self.all_band_hashes = None # this variable keeps track of how many files were processed since last # data write to properly update metadata self.files_processed = 0 self.bytes_processed = 0 self.data_access = config.get("data_access") + if self.data_access is None: + raise UnrecoverableException("Could not get a pointer to the data access object inside the transform.") self.last_file_name = None + self.sc_data_access = config.get(sigcalc_data_access_key, None) - if self.sc_data_access is None: - self.sc_daf = config.get(sigcalc_data_factory_key, None) - if self.sc_daf is None: - raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") - self.sc_data_access = self.sc_daf.create_data_access() + self.sc_daf = config.get(sigcalc_data_factory_key, None) + if self.sc_daf is None: + raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -194,7 +192,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab This implementation makes no 
modifications so effectively implements a copy of the input parquet to the output folder, without modification. """ - self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug(f"Transforming table with {table.num_rows} rows from file {file_name}") self.logger.debug("----minhash---") self.last_file_name = file_name self.files_processed += 1 @@ -226,7 +224,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab self.all_minhashes = self.all_minhashes.vstack(minhashes) # Calculate band hashes - band_hashes_list = self.process_rows_into_bands( + band_hashes_list = self._process_rows_into_bands( minhashes, self.num_bands, self.num_rows, @@ -247,7 +245,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab self.all_band_hashes = self.all_band_hashes.vstack(band_hashes) if len(self.all_minhashes) > 750000: - tables, metadata = self.write_band_signatures() + tables, metadata = self._write_band_signatures() else: tables = [] metadata = {} @@ -266,14 +264,16 @@ def flush(self) -> tuple[list[pa.Table], dict[str, Any]]: """ self.logger.info(f"Starting flush()") if self.all_band_hashes is not None and self.all_minhashes is not None: - tables, metadata = self.write_band_signatures() + tables, metadata = self._write_band_signatures() else: tables = [] metadata = {} return tables, metadata - def write_band_signatures(self): + def _write_band_signatures(self): # define the upper and lower bounds of each band segment + if self.sc_data_access is None: + self.sc_data_access = self.sc_daf.create_data_access() segment_bounds_list = [] upper_bound = np.uint64(np.iinfo(np.uint64).max) segment_len = np.uint64(upper_bound // self.num_segments) @@ -325,7 +325,6 @@ def write_band_signatures(self): self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure") # append the table to the result list, and the path to metadata - common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) last_file_name_path = Path(self.last_file_name) suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) if self.sc_data_access.output_folder is None: @@ -389,7 +388,7 @@ def _generate_word_shingles( k_shingles.append(delimiter.join(words[i : i + window_size])) return k_shingles, doc_len, document_id - def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b: int, r: int, seed: int = 42): + def _emit_bands(self, int_id_column: str, minhashes: np.array, b: int, r: int, seed: int = 42): num_minhashes = len(minhashes) assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" results = [] @@ -403,13 +402,12 @@ def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b return results # Apply the function - def process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + def _process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): result = [] for row in df.iter_rows(): - bands = self.emit_bands( + bands = self._emit_bands( row[0], # document id np.array(row[1], dtype=np.uint32), # minhashes - row[2], # document length minhashlsh_num_bands, minhashlsh_length_band, ) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py index 5ddc102eb..40e0e97e3 100644 --- 
a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py @@ -40,5 +40,5 @@ def __init__(self): if __name__ == "__main__": launcher = PythonTransformLauncher(SignatureCalculationTransformConfiguration()) - logger.info("Launching noop transform") + logger.info("Launching fuzzy dedup signature calculation transform") launcher.launch() From 0636d5f998c61d9169fbd5afb3d124aa6b1bad4f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 21 Nov 2024 00:44:28 -0500 Subject: [PATCH 087/105] Reduce the amount of logging Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/cluster_analysis_transform.py | 2 +- .../universal/fdedup/python/src/data_cleaning_transform.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 16febc0dc..fa3ce6d28 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -115,7 +115,7 @@ def __init__(self, config: dict[str, Any]): self.logger = get_logger(__name__) def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: - self.logger.info(f"Cluster analysis for folder {folder_name}") + self.logger.debug(f"Cluster analysis for folder {folder_name}") metadata = {} input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 3403bfc42..cb07923ae 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -86,7 +86,7 @@ def __init__(self, config: dict[str, Any]): self.operation_mode = config.get(operation_mode_key, operation_mode_default) contents = config.get("df") self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) - self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.logger.debug(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: From d58518bfe9d52eacd0063909267cabafb1f546dc Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 21 Nov 2024 00:45:39 -0500 Subject: [PATCH 088/105] Cleanup KFP pipeline code Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 40 +++--- .../src/fdedup_compute_execution_params.py | 134 ++++++++++-------- 2 files changed, 92 insertions(+), 82 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 683f93210..ffc6f79bc 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -115,22 +115,23 @@ def fuzzydedup( ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = { - "cpu": 1, - "memory": 4, + "cpu": 8, + "memory": 64, "image": task_image, "image_pull_secret": image_pull_secret, 
"imagePullPolicy": "Always", }, ray_worker_options: dict = { - "replicas": 2, - "max_replicas": 2, - "min_replicas": 2, - "cpu": 2, - "memory": 4, + "replicas": 10, + "max_replicas": 10, + "min_replicas": 10, + "cpu": 16, + "memory": 128, "image": task_image, "image_pull_secret": image_pull_secret, "imagePullPolicy": "Always", }, + runtime_actor_options: dict = {"num_cpus": 0.8, "memory": 16}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. checkpointing is not supported by dedup data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", @@ -153,10 +154,6 @@ def fuzzydedup( fdedup_shingle_option: str = "word", fdedup_jaccard_similarity_threshold: float = 0.75, fdedup_seed: int = 42, - fdedup_docs_to_remove_folder: str = "docs_to_remove", - fdedup_duplicate_list_location: str = os.path.join( - "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" - ), fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, @@ -206,8 +203,6 @@ def fuzzydedup( :param fdedup_shingle_option - type of shingle, one of 'word', or 'char' :param fdedup_jaccard_similarity_threshold - similarity threshold :param fdedup_seed - seed for the random number generator - :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids - :param fdedup_duplicate_list_location - name of the file holding the consolidated list of duplicates :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None @@ -222,6 +217,7 @@ def fuzzydedup( # compute execution params compute_common_exec_params = compute_common_params_op( worker_options=ray_worker_options, + actor_options=runtime_actor_options, data_s3_config=data_s3_config, num_permutations=fdedup_num_permutations, n_samples=fdedup_n_samples, @@ -229,8 +225,9 @@ def fuzzydedup( ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) fdedup_num_segments = compute_common_exec_params.outputs["num_segments"] - runtime_actor_cpus = compute_common_exec_params.outputs["cpus_per_actor"] runtime_num_actors = compute_common_exec_params.outputs["num_actors"] + runtime_actor_cpus = compute_common_exec_params.outputs["actor_cpu"] + runtime_actor_memory = compute_common_exec_params.outputs["actor_memory"] # start Ray cluster ray_cluster = create_ray_op( @@ -246,8 +243,9 @@ def fuzzydedup( # Get the parameters for the signature calculation job compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, @@ -289,8 +287,9 @@ def fuzzydedup( # Get the parameters for the cluster analysis job compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, 
data_num_samples=data_num_samples, @@ -319,16 +318,15 @@ def fuzzydedup( execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - duplicate_docids_folder=fdedup_docs_to_remove_folder, - duplicate_list_location=fdedup_duplicate_list_location, ) ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) @@ -348,8 +346,9 @@ def fuzzydedup( execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, @@ -357,7 +356,6 @@ def fuzzydedup( runtime_job_id=run_id, runtime_code_location=runtime_code_location, id_column=fdedup_document_id_column, - duplicate_list_location=fdedup_duplicate_list_location, operation_mode=fdedup_operation_mode, ) ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index cd3a58b99..15722c164 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -10,15 +10,27 @@ # limitations under the License. ################################################################################ -from typing import Any, Dict, NamedTuple +from typing import Any, NamedTuple def compute_common_params( worker_options: dict, # ray worker configuration + actor_options: dict, # actor desired configuration data_s3_config: str, # S3 configuration num_permutations: int, # number of permutations (minhashes) per document n_samples: int, # files to sample for number of documents estimation -) -> NamedTuple("fdedup_params", [("num_segments", int), ("num_actors", int), ("cpus_per_actor", float)]): +) -> NamedTuple( + "fdedup_params", [("num_segments", int), ("num_actors", str), ("actor_cpu", float), ("actor_memory", int)] +): + """ + Compute fuzzy dedup execution parameters common to all the transforms + :param worker_options: worker group configuration + :param actor_options: desired actor configuration + :param data_s3_config: s3 configuration + :param num_permutations: number of permutations + :param n_samples: number of samples used to estimate the total number of documents in the dataset + :return: fdedup_params NamedTuple: num_segments - int, num_actors - str, cpus (float) and memory (int) per actor + """ import sys @@ -40,49 +52,45 @@ def compute_common_params( print(f"Estimated number of documents and documents size is zero. 
Please verify the input path.") sys.exit(1) print(f"Estimated number of docs: {number_of_docs}") + actor_cpu: float = actor_options.get("num_cpus", 1) # if num_cpus not specified, request 1 CPU per actor + actor_memory: int = int(actor_options.get("memory", 16)) * GB # if memory not specified, request 16 GB per actor + # Calculate the number of segments # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where: # 8 bytes are taken by the band hash # (num_permutations * 4) bytes are taken by the min hashes # 20 bytes to provide some extra space for storage in a table # The total amount of space needed by a band is number_of_docs * doc_bytes. - # To scale the handling of this data, divide each band into segments, where each segment size is below 3GB + # To scale band handling, divide each band into segments, each smaller than 1/6 of an actor's allocated memory doc_bytes = 8 + num_permutations * 4 + 20 band_bytes = number_of_docs * doc_bytes - num_segments = 1 + (band_bytes // (3 * GB)) + num_segments = 1 + (band_bytes // (actor_memory // 6)) print(f"Number of segments: {num_segments}") - # To process data efficiently, each actor needs 16GB of memory. - # The actor config controls CPU allocation, not memory; - # use CPU allocation s.t. the number of actors on a worker provides access to 16GB of memory for each actor. - # Also, to keep S3 utilization in check, limit the number of actors to 2000 - num_nodes = worker_options["replicas"] - cpu_per_node = worker_options["cpu"] - 1 - memory_per_node = worker_options["memory"] - - memory_per_actor = 16 # GB - max_num_actors = 2000 - num_actors_per_node: int = int(memory_per_node / memory_per_actor) - if num_actors_per_node == 0: - num_actors_per_node = 1 - # never run actors on the head node, so (n - 1) nodes to run actors - num_actors = (num_nodes - 1) * num_actors_per_node - - while num_actors > max_num_actors: - num_actors -= num_nodes - 1 - num_actors_per_node -= 1 - print(f"Number of actors per node = {num_actors_per_node}") - cpus_per_actor = cpu_per_node / num_actors_per_node - print(f"CPUs per actor = {cpus_per_actor}") + # Calculate number of actors, using KFPUtils.default_compute_execution_params() + # Create new dict with memory expressed in bytes, as expected by KFPUtils.default_compute_execution_params() + actor_config = { + "num_cpus": actor_cpu, + "memory": actor_memory, + } + num_actors = KFPUtils.default_compute_execution_params(str(worker_options), str(actor_config)) + print(f"num_actors = {num_actors}") from collections import namedtuple - fdedup_params = namedtuple("fdedup_params", ["num_segments", "num_actors", "cpus_per_actor"]) - return fdedup_params(num_segments, num_actors, cpus_per_actor) + fdedup_params = namedtuple( + typename="fdedup_params", + field_names=["num_segments", "num_actors", "actor_cpu", "actor_memory"], + ) + print( + f"num_segments = {num_segments}, num_actors = {num_actors}, actor_cpu = {actor_cpu}, actor_memory = {actor_memory}" + ) + return fdedup_params(num_segments, num_actors, actor_cpu, actor_memory) def signature_calc_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max 
files to process data_num_samples: int, # num samples to process @@ -103,8 +111,9 @@ def signature_calc_compute_execution_params( """ Compute fuzzy dedup execution parameters for signature calculation - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -116,23 +125,22 @@ def signature_calc_compute_execution_params( :param num_permutations: number of permutations :param num_bands: number of bands :param num_minhashes_per_band: band length - :param word_shingle_size: number of words in shingle + :param word_shingle_size: number of words/chars in shingle :param shingle_option: str: type of shingle, one of 'word' or 'char' :param threshold: threshold, :param num_segments: number of segments :param seed: seed for the random number generator - :return: a dictionary with a Ray Job execution parameters + :return: dictionary with Ray Job execution parameters """ # fuzzy parameters for signature calculation - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} - print(f"runtime_actor_options = {runtime_actor_options}") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -151,8 +159,9 @@ def signature_calc_compute_execution_params( def cluster_analysis_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -166,8 +175,9 @@ def cluster_analysis_compute_execution_params( """ Compute fuzzy dedup execution parameters for cluster analysis - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -189,13 +199,13 @@ def cluster_analysis_compute_execution_params( data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + actor_options = 
{"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -206,47 +216,48 @@ def cluster_analysis_compute_execution_params( def get_duplicate_list_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process runtime_pipeline_id: str, # pipeline id runtime_job_id: str, # job id runtime_code_location: dict, # code location - duplicate_docids_folder: str, # folder with the docs IDs to remove - duplicate_list_location: str, # location of the list of duplicate doc ids ) -> dict: """ Compute fuzzy dedup execution parameters for get duplicate list step - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process :param runtime_pipeline_id: pipeline id :param runtime_job_id: job id :param runtime_code_location: code location - :param duplicate_docids_folder: folder with the docs IDs to remove - :param duplicate_list_location: location of the list of duplicate doc ids :return: a dictionary with a Ray Job execution parameters """ import json + import os # fuzzy parameters + duplicate_docids_folder: str = "docs_to_remove" + duplicate_list_location: str = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") # Get cluster parameters data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) base_folder = data_s3_config_dict.get("output_folder") data_s3_config_dict["input_folder"] = base_folder data_s3_config_dict["output_folder"] = base_folder data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -256,8 +267,9 @@ def get_duplicate_list_compute_execution_params( def data_cleaning_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: 
str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -265,13 +277,13 @@ def data_cleaning_compute_execution_params( runtime_job_id: str, # job id runtime_code_location: dict, # code location id_column: str, # integer document id column name - duplicate_list_location: str, # location of the list of duplicate doc ids operation_mode: str, # filter (non-)duplicates or annotate ) -> dict: """ Compute fuzzy dedup execution parameters - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -279,7 +291,6 @@ def data_cleaning_compute_execution_params( :param runtime_job_id: job id :param runtime_code_location: code location :param id_column: integer document id column name - :param duplicate_list_location: location of the list of duplicate doc ids :param operation_mode: filter (non-)duplicates or annotate :return: a dictionary with a Ray Job execution parameters """ @@ -298,13 +309,14 @@ def data_cleaning_compute_execution_params( output_subfolder = "annotated" data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + duplicate_list_location: str = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), From 170af4bb5c0e95ac31ea6971791b29d1f818cbb4 Mon Sep 17 00:00:00 2001 From: SHAHROKH DAIJAVAD Date: Thu, 21 Nov 2024 12:22:38 -0800 Subject: [PATCH 089/105] first version of a notebook Signed-off-by: SHAHROKH DAIJAVAD --- .../language/doc_quality/doc_quality.ipynb | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 transforms/language/doc_quality/doc_quality.ipynb diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb new file mode 100644 index 000000000..99bab8ff3 --- /dev/null +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The set of dictionary keys holding the DocQualityTransform configuration values is as follows: \n", + "* text_lang - specifies the language used in the text content. By default, \"en\" is used.\n", + "* doc_content_column - specifies the name of the column that contains the document text. By default, \"contents\" is used.\n", + "* bad_word_filepath - specifies a path to the bad word file: a local folder (file or directory) that points to the bad word file. You do not have to set this parameter if you do not need bad word filtering.\n", + "#####" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from doc_quality_transform import (bad_word_filepath_cli_param, doc_content_column_cli_param, text_lang_cli_param,)\n", + "from doc_quality_transform_python import DocQualityPythonTransformConfiguration" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# create parameters\n", + "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(\"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # execution info\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + " \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n", + " # doc_quality params\n", + " text_lang_cli_param: \"en\",\n", + " doc_content_column_cli_param: \"contents\",\n", + " bad_word_filepath_cli_param: os.path.join(\"python\", \"ldnoobw\", \"en\"),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=DocQualityPythonTransformConfiguration())\n", + "launcher.launch()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 10851f6643fcf96c2417b332118842660d225d3d Mon Sep 17 00:00:00 2001 From: SHAHROKH DAIJAVAD Date: Thu, 21 Nov 2024 13:34:47 -0800 Subject: [PATCH 090/105] fixed code_location Signed-off-by: SHAHROKH DAIJAVAD --- .../language/doc_quality/doc_quality.ipynb | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index 99bab8ff3..c6617b2bc 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -90,6 +90,7 @@ " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", "}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", "params = {\n", " # Data access. 
Only required parameters are specified\n", " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", @@ -114,10 +115,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:32:09 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "13:32:09 INFO - pipeline id pipeline_id\n", + "13:32:09 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "13:32:09 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "13:32:09 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:09 INFO - orchestrator docq started at 2024-11-21 13:32:09\n", + "13:32:09 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", + "13:32:09 INFO - Load badwords found locally from python/ldnoobw/en\n", + "13:32:11 INFO - Completed 1 files (100.0%) in 0.025 min\n", + "13:32:11 INFO - Done processing 1 files, waiting for flush() completion.\n", + "13:32:11 INFO - done flushing in 0.0 sec\n", + "13:32:11 INFO - Completed execution in 0.025 min, execution result 0\n" + ] + } + ], "source": [ "%%capture\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -135,14 +156,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/metadata.json', 'python/output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", "glob.glob(\"python/output/*\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 7545872c6e059eb67f2f947418572d255bf66685 Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Fri, 22 Nov 2024 10:43:05 +0900 Subject: [PATCH 091/105] add link to jupyter notebook Signed-off-by: Daiki Tsuzuku --- .../language/doc_quality/doc_quality.ipynb | 30 +++++++++++++------ .../language/doc_quality/python/README.md | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index c6617b2bc..f3978dc96 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -23,9 +23,10 @@ "%%capture\n", "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", - "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" + "%pip install data-prep-toolkit\n", + "%pip 
install data-prep-toolkit-transforms\n", + "%pip install data-prep-connector\n", + "%pip install dpk-doc-quality-transform-python" ] }, { @@ -52,12 +53,23 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'doc_quality_transform'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdata_processing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mruntime\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpure_python\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PythonTransformLauncher\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdata_processing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ParamsUtils\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoc_quality_transform\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (bad_word_filepath_cli_param, doc_content_column_cli_param, text_lang_cli_param,)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoc_quality_transform_python\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DocQualityPythonTransformConfiguration\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'doc_quality_transform'" + ] + } + ], "source": [ - "import ast\n", "import os\n", "import sys\n", "\n", @@ -187,7 +199,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -201,7 +213,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/transforms/language/doc_quality/python/README.md b/transforms/language/doc_quality/python/README.md index 6a085ef05..c10bc4b88 100644 --- a/transforms/language/doc_quality/python/README.md +++ b/transforms/language/doc_quality/python/README.md @@ -90,7 +90,7 @@ To see results of the transform. 
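The notebook above drives the launcher entirely through `sys.argv` rather than a Python API. A rough sketch of that handoff, assuming `ParamsUtils.dict_to_req` renders one `--key=value` token per entry; `dict_to_req_sketch` below is a hypothetical stand-in, and the real helper in `data_processing.utils` is authoritative:

```python
# Hypothetical stand-in for ParamsUtils.dict_to_req, shown only to make the
# params -> argv handoff concrete; the toolkit's own helper defines the
# actual format.
def dict_to_req_sketch(d: dict) -> list[str]:
    argv = ["program"]  # placeholder program name for argv[0]
    for key, value in d.items():
        argv.append(f"--{key}={value}")
    return argv

params = {"runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id"}
print(dict_to_req_sketch(params))
# ['program', '--runtime_pipeline_id=pipeline_id', '--runtime_job_id=job_id']
```

Setting `sys.argv` this way lets the same argparse-based launcher serve both command-line runs and notebook runs without a second entry point.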
### Code example -TBD (link to the notebook will be provided) +[notebook](../doc_quality.ipynb) ### Transforming data using the transform image From 9ee506e341749765c59b8ab8430829fd442f4950 Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Fri, 22 Nov 2024 15:09:11 +0900 Subject: [PATCH 092/105] update notebook Signed-off-by: Daiki Tsuzuku --- .../language/doc_quality/doc_quality.ipynb | 52 +++++++------------ 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index f3978dc96..91aafd74d 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -53,22 +53,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'doc_quality_transform'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdata_processing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mruntime\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpure_python\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PythonTransformLauncher\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdata_processing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ParamsUtils\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoc_quality_transform\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (bad_word_filepath_cli_param, doc_content_column_cli_param, text_lang_cli_param,)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoc_quality_transform_python\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DocQualityPythonTransformConfiguration\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'doc_quality_transform'" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import sys\n", @@ -89,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -127,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, "outputs": [ @@ -135,19 +123,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "13:32:09 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "13:32:09 INFO - pipeline id pipeline_id\n", - "13:32:09 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", - "13:32:09 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", - "13:32:09 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:09 INFO - data factory 
data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:09 INFO - orchestrator docq started at 2024-11-21 13:32:09\n", - "13:32:09 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", - "13:32:09 INFO - Load badwords found locally from python/ldnoobw/en\n", - "13:32:11 INFO - Completed 1 files (100.0%) in 0.025 min\n", - "13:32:11 INFO - Done processing 1 files, waiting for flush() completion.\n", - "13:32:11 INFO - done flushing in 0.0 sec\n", - "13:32:11 INFO - Completed execution in 0.025 min, execution result 0\n" + "10:38:40 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "10:38:40 INFO - pipeline id pipeline_id\n", + "10:38:40 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "10:38:40 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "10:38:40 INFO - data factory data_ max_files -1, n_sample -1\n", + "10:38:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "10:38:40 INFO - orchestrator docq started at 2024-11-22 10:38:40\n", + "10:38:40 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", + "10:38:40 INFO - Load badwords found locally from python/ldnoobw/en\n", + "10:38:49 INFO - Completed 1 files (100.0%) in 0.146 min\n", + "10:38:49 INFO - Done processing 1 files, waiting for flush() completion.\n", + "10:38:49 INFO - done flushing in 0.0 sec\n", + "10:38:49 INFO - Completed execution in 0.146 min, execution result 0\n" ] } ], @@ -168,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, "outputs": [ @@ -178,7 +166,7 @@ "['python/output/metadata.json', 'python/output/test1.parquet']" ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } From 280d105a1b5ced45ae4fc7d5bdf4123e86669022 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 22 Nov 2024 20:21:02 -0500 Subject: [PATCH 093/105] added fdedup to build package for all transforms Signed-off-by: Maroun Touma --- transforms/pyproject.toml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 2357553e4..badb8bbd9 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2.dev3" +version = "0.2.2.dev4" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -44,6 +44,7 @@ all = { file = [ "universal/hap/python/requirements.txt", "universal/tokenization/python/requirements.txt", "universal/ededup/python/requirements.txt", +"universal/fdedup/python/requirements.txt", "universal/profiler/python/requirements.txt", "universal/doc_id/python/requirements.txt", 
"universal/filter/python/requirements.txt", @@ -71,6 +72,7 @@ pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]} hap = { file = ["universal/hap/python/requirements.txt"]} tokenization = { file = ["universal/tokenization/python/requirements.txt"]} ededup = { file = ["universal/ededup/python/requirements.txt"]} +fdedup = { file = ["universal/fdedup/python/requirements.txt"]} profiler = { file = ["universal/profiler/python/requirements.txt"]} doc_id = { file = ["universal/doc_id/python/requirements.txt"]} filter = { file = ["universal/filter/python/requirements.txt"]} @@ -80,11 +82,8 @@ web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them -[tool.setuptools.package-data] -"*" = ["*.txt"] - -[tool.setuptools.packages.find] -where = ["src"] +#[tool.setuptools.package-data] +#"*" = ["*.txt"] #[tool.setuptools.package-dir] #dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" From cf133880deac097f18cc580dc9364c680f1a9623 Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Mon, 25 Nov 2024 09:55:49 +0900 Subject: [PATCH 094/105] stop installing data-prep-connector Signed-off-by: Daiki Tsuzuku --- transforms/language/doc_quality/doc_quality.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index 91aafd74d..5b87c91b8 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -25,7 +25,6 @@ "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", "%pip install data-prep-toolkit-transforms\n", - "%pip install data-prep-connector\n", "%pip install dpk-doc-quality-transform-python" ] }, From edb605bb681c57db1f9eb5d3fe9f425681f57c2b Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Mon, 25 Nov 2024 12:39:31 +0900 Subject: [PATCH 095/105] use data-prep-toolkit-transforms==0.2.2.dev3 Signed-off-by: Daiki Tsuzuku --- .../language/doc_quality/doc_quality.ipynb | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index 5b87c91b8..bf91047b6 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -24,8 +24,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", - "%pip install data-prep-toolkit-transforms\n", - "%pip install dpk-doc-quality-transform-python" + "%pip install data-prep-toolkit-transforms==0.2.2.dev3" ] }, { @@ -52,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -76,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -114,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, "outputs": [ @@ -122,19 +121,19 @@ 
"name": "stderr", "output_type": "stream", "text": [ - "10:38:40 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "10:38:40 INFO - pipeline id pipeline_id\n", - "10:38:40 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", - "10:38:40 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", - "10:38:40 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:38:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:38:40 INFO - orchestrator docq started at 2024-11-22 10:38:40\n", - "10:38:40 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", - "10:38:40 INFO - Load badwords found locally from python/ldnoobw/en\n", - "10:38:49 INFO - Completed 1 files (100.0%) in 0.146 min\n", - "10:38:49 INFO - Done processing 1 files, waiting for flush() completion.\n", - "10:38:49 INFO - done flushing in 0.0 sec\n", - "10:38:49 INFO - Completed execution in 0.146 min, execution result 0\n" + "12:39:07 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "12:39:07 INFO - pipeline id pipeline_id\n", + "12:39:07 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "12:39:07 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "12:39:07 INFO - data factory data_ max_files -1, n_sample -1\n", + "12:39:07 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "12:39:07 INFO - orchestrator docq started at 2024-11-25 12:39:07\n", + "12:39:07 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", + "12:39:07 INFO - Load badwords found locally from python/ldnoobw/en\n", + "12:39:09 INFO - Completed 1 files (100.0%) in 0.033 min\n", + "12:39:09 INFO - Done processing 1 files, waiting for flush() completion.\n", + "12:39:09 INFO - done flushing in 0.0 sec\n", + "12:39:09 INFO - Completed execution in 0.033 min, execution result 0\n" ] } ], @@ -155,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, "outputs": [ @@ -165,7 +164,7 @@ "['python/output/metadata.json', 'python/output/test1.parquet']" ] }, - "execution_count": 11, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } From 1a762e01d6cdd3af08c5983c6bcf81e175ab3627 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 11:53:18 -0500 Subject: [PATCH 096/105] First draft of fdedup notebook Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 152 +++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 transforms/universal/fdedup/fdedup.ipynb diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb new file mode 100644 
index 000000000..ee1d9b561 --- /dev/null +++ b/transforms/universal/fdedup/fdedup.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_ray import RayServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(\"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"ray\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"num_segments\": 1,\n", + " \"operation_mode\": \"annotate\",\n", + " # ray configuration parameters\n", + " \"run_locally\": True,\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = RayServiceOrchestrator(global_params=args)\n", + "# Launch ray fuzzy dedup execution\n", + "orchestrator.orchestrate()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ffebdc1c388440a1b03e4efe88178405b4c569dc Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 13:52:51 -0500 Subject: [PATCH 097/105] Added sample ray fuzzy dedup jupyter notebook Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 71 ++++++++++++++++--- .../python/src/fdedup_transform_python.py | 10 +++ 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb index ee1d9b561..88bcd87aa 100644 --- a/transforms/universal/fdedup/fdedup.ipynb +++ b/transforms/universal/fdedup/fdedup.ipynb @@ -67,8 +67,8 @@ "outputs": [], "source": [ "# create parameters\n", - "input_folder = os.path.join(\"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join( \"ray\", \"output\")\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -79,7 +79,7 @@ " \"num_bands\": 14,\n", " \"num_minhashes_per_band\": 8,\n", " \"num_segments\": 1,\n", - " \"operation_mode\": \"annotate\",\n", + " \"operation_mode\": \"filter_duplicates\",\n", " # ray configuration parameters\n", " \"run_locally\": True,\n", "}\n" @@ -90,7 +90,7 @@ "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", "metadata": {}, "source": [ - "##### ***** Use ray runtime to invoke the transform" + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" ] }, { @@ -100,12 +100,13 @@ "metadata": {}, "outputs": [], "source": [ - "%%capture\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "args = parse_args()\n", "# Initialize the orchestrator\n", "orchestrator = RayServiceOrchestrator(global_params=args)\n", "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()\n" + "orchestrator.orchestrate()" ] }, { @@ -124,15 +125,65 @@ "outputs": [], "source": [ "import glob\n", - "glob.glob(\"python/output/*\")" + "glob.glob(\"ray/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "fdedup_ray", "language": "python", - "name": "python3" + "name": "fdedup_ray" }, "language_info": { "codemirror_mode": { @@ -144,7 +195,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index 166e48e26..b200676da 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -147,6 +147,8 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: if service_name == "fdclean": sys_argv.append("--dcdata_s3_config") sys_argv.append(ast_data_io) + if in_args.run_locally: + sys_argv.append(f"--run_locally={in_args.run_locally}") return sys_argv def execute_service(self, service_short_name: str, params: list) -> int: @@ -240,6 +242,7 @@ def parse_args() -> argparse.Namespace: default=None, help="ast string of options for s3 credentials", ) + parser.add_argument( "--shingle_option", type=str, @@ -248,6 +251,13 @@ def parse_args() -> argparse.Namespace: help="Option used for shingling", ) + parser.add_argument( + "--run_locally", + type=lambda x: bool(str2bool(x)), + default=True, + help="run locally or connect to a remote machine", + ) + return parser.parse_args() From 75fc4d1464d4d8c83dc0a087528c46c873a46d2f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 16:59:26 -0500 Subject: [PATCH 098/105] Add jupyter notebooks for python, ray and spark fuzzy dedup Signed-off-by: Constantin M Adam --- .../universal/fdedup/fdedup_python.ipynb | 215 ++++++++++++++++++ transforms/universal/fdedup/fdedup_ray.ipynb | 214 +++++++++++++++++ .../universal/fdedup/fdedup_spark.ipynb | 212 +++++++++++++++++ .../python/src/fdedup_transform_python.py | 2 +- 4 files changed, 642 insertions(+), 1 deletion(-) create mode 100644 transforms/universal/fdedup/fdedup_python.ipynb create mode 100644 transforms/universal/fdedup/fdedup_ray.ipynb create mode 100644 transforms/universal/fdedup/fdedup_spark.ipynb diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb new file mode 100644 index 000000000..83f9bd600 --- /dev/null +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args, ServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/python/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/python/output/ | folder that contains all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band:int | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", 
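The defaults in the parameter table follow the standard LSH banding trade-off: with b bands of r minhashes each (b × r = num_permutations, here 14 × 8 = 112), two documents with Jaccard similarity s collide in at least one band with probability 1 − (1 − s^r)^b. A quick sketch of what the chosen defaults imply:

```python
# Standard LSH S-curve: probability that two documents with Jaccard
# similarity s share at least one of b bands of r minhashes each.
def collision_probability(s: float, b: int = 14, r: int = 8) -> float:
    return 1.0 - (1.0 - s**r) ** b

# The curve's similarity threshold sits near (1/b) ** (1/r).
print(f"threshold ~ {(1.0 / 14) ** (1.0 / 8):.2f}")    # ~0.72
print(f"P(s=0.8) ~ {collision_probability(0.8):.2f}")  # ~0.92: likely flagged
print(f"P(s=0.6) ~ {collision_probability(0.6):.2f}")  # ~0.21: likely ignored
```

Raising r sharpens the curve toward near-exact duplicates, while raising b lowers the threshold and flags more distant near-duplicates.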
"metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = ServiceOrchestrator(global_params=args)\n", + "# Launch python fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"python/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", + "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", + "input_df = input_df_1.vstack(input_df_2)\n", + "\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", + "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", + "output_df = output_df_1.vstack(output_df_2)\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_ray", + "language": "python", + "name": "fdedup_ray" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb new file mode 100644 index 000000000..533ca019f --- /dev/null +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_ray import RayServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/ray/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/ray/output/ | folder that contains all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band:int | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |\n", + "| run_locally:bool | true | if true, launch a ray cluster locally, otherwise connect to an already existing cluster | \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + " # ray configuration parameters\n", + " \"run_locally\": True,\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", 
"metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = RayServiceOrchestrator(global_params=args)\n", + "# Launch ray fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"ray/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_ray", + "language": "python", + "name": "fdedup_ray" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/fdedup_spark.ipynb b/transforms/universal/fdedup/fdedup_spark.ipynb new file mode 100644 index 000000000..9f4bf1772 --- /dev/null +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -0,0 +1,212 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_spark import SparkServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/spark/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/spark/output/ | folder that contains all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band:int | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use spark runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = SparkServiceOrchestrator(global_params=args)\n", + "# Launch spark fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"spark/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_spark", + "language": "python", + "name": "fdedup_spark" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index b200676da..def3590e4 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -254,7 +254,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--run_locally", type=lambda x: bool(str2bool(x)), - default=True, + default=False, help="run locally or connect to a remote machine", ) From edd5841bb199c974489a8f612968c587bdeebab3 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 17:08:43 -0500 Subject: [PATCH 099/105] Add jupyter notebooks for python, ray and spark fuzzy dedup Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 203 ----------------------- 1 file changed, 203 deletions(-) delete mode 100644 transforms/universal/fdedup/fdedup.ipynb diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb deleted file mode 
100644 index 88bcd87aa..000000000 --- a/transforms/universal/fdedup/fdedup.ipynb +++ /dev/null @@ -1,203 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "afd55886-5f5b-4794-838e-ef8179fb0394", - "metadata": {}, - "source": [ - "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", - "```\n", - "make venv\n", - "source venv/bin/activate && pip install jupyterlab\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "## This is here as a reference only\n", - "# Users and application developers must use the right tag for the latest from pypi\n", - "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" - ] - }, - { - "cell_type": "markdown", - "id": "ebf1f782-0e61-485c-8670-81066beb734c", - "metadata": {}, - "source": [ - "##### ***** Import required Classes and modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args\n", - "from fdedup_transform_ray import RayServiceOrchestrator" - ] - }, - { - "cell_type": "markdown", - "id": "7234563c-2924-4150-8a31-4aec98c1bf33", - "metadata": {}, - "source": [ - "##### ***** Setup runtime parameters for this transform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", - "params = {\n", - " # transform configuration parameters\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - " \"contents_column\": \"contents\",\n", - " \"document_id_column\": \"int_id_column\",\n", - " \"num_permutations\": 112,\n", - " \"num_bands\": 14,\n", - " \"num_minhashes_per_band\": 8,\n", - " \"num_segments\": 1,\n", - " \"operation_mode\": \"filter_duplicates\",\n", - " # ray configuration parameters\n", - " \"run_locally\": True,\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", - "metadata": {}, - "source": [ - "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "args = parse_args()\n", - "# Initialize the orchestrator\n", - "orchestrator = RayServiceOrchestrator(global_params=args)\n", - "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()" - ] - }, - { - "cell_type": "markdown", - "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", - "metadata": {}, - "source": [ - "##### **** The specified folder will include the transformed parquet files." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7276fe84-6512-4605-ab65-747351e13a7c", - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "glob.glob(\"ray/output/cleaned/*\")" - ] - }, - { - "cell_type": "markdown", - "id": "d30489d9-fc98-423e-90a8-e8f372787e88", - "metadata": {}, - "source": [ - "***** print the input data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", - "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", - " print(input_df)" - ] - }, - { - "cell_type": "markdown", - "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", - "metadata": {}, - "source": [ - "***** print the output result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", - "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", - " print(output_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d60e391d-cf58-47ae-9991-04c05d114edc", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fdedup_ray", - "language": "python", - "name": "fdedup_ray" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From f61493cf1915638e0a4ff4f94c824b02c69833a7 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 25 Nov 2024 17:53:00 -0500 Subject: [PATCH 100/105] relax hap dependencies on torch Signed-off-by: Maroun Touma --- transforms/pyproject.toml | 8 ++++---- transforms/universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/requirements.txt | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 2357553e4..3c1f64c32 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -80,11 +80,11 @@ web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them -[tool.setuptools.package-data] -"*" = ["*.txt"] +#[tool.setuptools.package-data] +#"*" = ["*.txt"] -[tool.setuptools.packages.find] -where = ["src"] +#[tool.setuptools.packages.find] +#where = ["src"] #[tool.setuptools.package-dir] #dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 505dd9ceb..ba8948477 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,5 +1,5 @@ data-prep-toolkit==0.2.2.dev2 nltk==3.9.1 transformers==4.38.2 -torch==2.4.1 +torch>=2.2.2,<=2.4.1 pandas==2.2.2 diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 0ed65f625..34e1d6932 100644 ---
a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -2,5 +2,5 @@ data-prep-toolkit[ray]==0.2.2.dev2 dpk-hap-transform-python==0.2.2.dev2 nltk==3.9.1 transformers==4.38.2 -torch==2.4.1 +torch>=2.2.2,<=2.4.1 pandas==2.2.2 From 60dd4794b62876c30a12e72abd8395a9dcc24be8 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 25 Nov 2024 19:12:58 -0500 Subject: [PATCH 101/105] run make.version for dpk 0.2.2 and connector 0.2.3 Signed-off-by: Maroun Touma --- .make.versions | 12 ++++++++++-- data-connector-lib/pyproject.toml | 2 +- data-processing-lib/pyproject.toml | 2 +- data-processing-lib/spark/pyproject.toml | 4 ++-- .../createRayClusterComponent.yaml | 2 +- .../deleteRayClusterComponent.yaml | 2 +- kfp/kfp_ray_components/executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- .../executeSubWorkflowComponent.yaml | 2 +- .../kfp_v1_workflow_support/pyproject.toml | 4 ++-- .../kfp_v2_workflow_support/pyproject.toml | 4 ++-- .../shared_workflow_support/pyproject.toml | 4 ++-- .../code/code2parquet/kfp_ray/code2parquet_wf.py | 2 +- transforms/code/code2parquet/python/pyproject.toml | 2 +- transforms/code/code2parquet/python/requirements.txt | 2 +- transforms/code/code2parquet/ray/pyproject.toml | 6 +++--- transforms/code/code_profiler/python/pyproject.toml | 2 +- .../code/code_profiler/python/requirements.txt | 2 +- transforms/code/code_profiler/ray/pyproject.toml | 6 +++--- .../code/code_quality/kfp_ray/code_quality_wf.py | 2 +- transforms/code/code_quality/python/pyproject.toml | 2 +- transforms/code/code_quality/python/requirements.txt | 2 +- transforms/code/code_quality/ray/pyproject.toml | 6 +++--- .../header_cleanser/kfp_ray/header_cleanser_wf.py | 2 +- .../code/header_cleanser/python/pyproject.toml | 2 +- .../code/header_cleanser/python/requirements.txt | 2 +- transforms/code/header_cleanser/ray/pyproject.toml | 6 +++--- .../code/license_select/kfp_ray/license_select_wf.py | 2 +- transforms/code/license_select/python/pyproject.toml | 2 +- .../code/license_select/python/requirements.txt | 2 +- transforms/code/license_select/ray/pyproject.toml | 6 +++--- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- transforms/code/malware/python/pyproject.toml | 4 ++-- transforms/code/malware/ray/pyproject.toml | 6 +++--- .../proglang_select/kfp_ray/proglang_select_wf.py | 2 +- .../code/proglang_select/python/pyproject.toml | 2 +- .../code/proglang_select/python/requirements.txt | 2 +- transforms/code/proglang_select/ray/pyproject.toml | 6 +++--- .../kfp_ray/repo_level_order_wf.py | 2 +- .../code/repo_level_ordering/ray/pyproject.toml | 4 ++-- .../doc_chunk/kfp_ray/doc_chunk_multiple_wf.py | 2 +- .../language/doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- .../language/doc_chunk/python/requirements.txt | 2 +- transforms/language/doc_chunk/ray/pyproject.toml | 2 +- .../doc_quality/kfp_ray/doc_quality_multiple_wf.py | 2 +- .../language/doc_quality/kfp_ray/doc_quality_wf.py | 2 +- .../language/doc_quality/python/pyproject.toml | 2 +- .../language/doc_quality/python/requirements.txt | 2 +- transforms/language/doc_quality/ray/pyproject.toml | 6 +++--- .../language/html2parquet/kfp_ray/html2parquet_wf.py | 2 +- .../language/html2parquet/python/pyproject.toml | 2 +- .../language/html2parquet/python/requirements.txt | 2 +- transforms/language/html2parquet/ray/pyproject.toml | 2 +- .../language/html2parquet/ray/requirements.txt | 4 ++-- .../language/lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- 
transforms/language/lang_id/kfp_ray/lang_id_wf.py | 2 +- transforms/language/lang_id/python/pyproject.toml | 2 +- transforms/language/lang_id/python/requirements.txt | 2 +- transforms/language/lang_id/ray/pyproject.toml | 6 +++--- .../pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py | 2 +- .../language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- .../language/pdf2parquet/python/requirements.txt | 2 +- transforms/language/pdf2parquet/ray/requirements.txt | 2 +- .../language/pii_redactor/python/requirements.txt | 2 +- transforms/language/pii_redactor/ray/pyproject.toml | 2 +- .../text_encoder/kfp_ray/text_encoder_multiple_wf.py | 2 +- .../language/text_encoder/kfp_ray/text_encoder_wf.py | 2 +- .../language/text_encoder/python/pyproject.toml | 2 +- .../language/text_encoder/python/requirements.txt | 2 +- transforms/language/text_encoder/ray/pyproject.toml | 6 +++--- transforms/pyproject.toml | 2 +- transforms/requirements-ray.txt | 2 +- transforms/requirements.txt | 2 +- transforms/transform.config | 8 -------- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- transforms/universal/doc_id/python/pyproject.toml | 2 +- transforms/universal/doc_id/python/requirements.txt | 2 +- transforms/universal/doc_id/ray/pyproject.toml | 6 +++--- transforms/universal/doc_id/spark/pyproject.toml | 4 ++-- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- transforms/universal/ededup/python/pyproject.toml | 2 +- transforms/universal/ededup/python/requirements.txt | 2 +- transforms/universal/ededup/ray/pyproject.toml | 6 +++--- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- transforms/universal/fdedup/ray/pyproject.toml | 4 ++-- transforms/universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/filter/python/pyproject.toml | 2 +- transforms/universal/filter/python/requirements.txt | 2 +- transforms/universal/filter/ray/pyproject.toml | 6 +++--- transforms/universal/filter/spark/pyproject.toml | 4 ++-- transforms/universal/hap/kfp_ray.disable/hap_wf.py | 2 +- transforms/universal/hap/python/pyproject.toml | 2 +- transforms/universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/pyproject.toml | 2 +- transforms/universal/hap/ray/requirements.txt | 4 ++-- .../universal/noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- transforms/universal/noop/python/pyproject.toml | 4 ++-- transforms/universal/noop/ray/pyproject.toml | 6 +++--- transforms/universal/noop/spark/pyproject.toml | 6 +++--- transforms/universal/profiler/kfp_ray/profiler_wf.py | 2 +- transforms/universal/profiler/python/pyproject.toml | 2 +- .../universal/profiler/python/requirements.txt | 2 +- transforms/universal/profiler/ray/pyproject.toml | 6 +++--- transforms/universal/profiler/spark/pyproject.toml | 6 +++--- transforms/universal/resize/kfp_ray/resize_wf.py | 2 +- transforms/universal/resize/python/pyproject.toml | 2 +- transforms/universal/resize/python/requirements.txt | 2 +- transforms/universal/resize/ray/pyproject.toml | 6 +++--- transforms/universal/resize/spark/pyproject.toml | 6 +++--- .../tokenization/kfp_ray/tokenization_wf.py | 2 +- .../universal/tokenization/python/pyproject.toml | 2 +- .../universal/tokenization/python/requirements.txt | 2 +- transforms/universal/tokenization/ray/pyproject.toml | 6 +++--- transforms/universal/web2parquet/requirements.txt | 4 ++-- 115 files changed, 176 insertions(+), 176 deletions(-) diff --git a/.make.versions b/.make.versions index ed36fe8c8..e3a8e8239 100644 --- a/.make.versions +++ b/.make.versions @@ -19,7 +19,7 
@@ DPK_MINOR_VERSION=2 DPK_MICRO_VERSION=2 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. -DPK_VERSION_SUFFIX=.dev2 +DPK_VERSION_SUFFIX= DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) @@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION) KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION) KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION) -DPK_CONNECTOR_VERSION=0.2.3.dev0 +DPK_CONNECTOR_VERSION=0.2.3 ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. @@ -59,3 +59,11 @@ else WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support endif +################################################################################ +# This defines the transforms' package version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +TRANSFORMS_PKG_VERSION=0.2.2 diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml index 4fcc97ed9..d3d213946 100644 --- a/data-connector-lib/pyproject.toml +++ b/data-connector-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_connector" -version = "0.2.3.dev1" +version = "0.2.3" requires-python = ">=3.10,<3.13" keywords = [ "data", diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 2e827ea82..36e4e155f 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.2.dev2" +version = "0.2.2" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Ray and Python" diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml index 89b4d9bf8..c0be43920 100644 --- a/data-processing-lib/spark/pyproject.toml +++ b/data-processing-lib/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_spark" -version = "0.2.2.dev2" +version = "0.2.2" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Spark" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit==0.2.2", "pyspark>=3.5.2", "psutil>=6.0.0", "PyYAML>=6.0.2" diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml index 30b0b66d8..78976a97c 100644 --- a/kfp/kfp_ray_components/createRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index 44e199c47..c75554d5f 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -9,7 +9,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 7ab517bff..2e02c3adf 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 9b98912f0..37c0198bf 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index 6b261a003..ec82e9484 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index d7058f2ae..daa903aaf 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.2.dev2", + "data-prep-toolkit-kfp-shared==0.2.2", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 04b6bc7a2..61f54663f 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.2.dev2", + "data-prep-toolkit-kfp-shared==0.2.2", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index df27ad1cf..3ba7491bc 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index f3f491e4b..3e5f262b9 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index 5e6f41bb2..d4f8c11cf 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index bbb84b749..4a217ff8c 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index 15a4be4c1..98b2e3a65 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { 
name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk-code2parquet-transform-python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", + "dpk-code2parquet-transform-python==0.2.2", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index 492603d54..d3c2c2196 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 8608c6d6e..31509b291 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 933152e3f..0c9457efc 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-code-profiler-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 6a4ccec1b..7f5aa9768 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 5f201c8ae..d7b452d6b 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index 0bd936ef2..a50ddff5c 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml 
b/transforms/code/code_quality/ray/pyproject.toml index 290429f95..ea6aad8ae 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-code-quality-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 9bb315569..5049a9c11 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index ecaf4d7bb..2dadeaf02 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index c2d0d8793..fd3fc0de4 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index adff71cfc..471ce1d5e 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-header-cleanser-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 7dba0d9d1..9bdcc6e96 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = 
"quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 30f2f001e..b445c6b09 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 368287e5d..880c7c2c7 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.2 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index 815121787..b2c56e940 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-license-select-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index bede80b88..89eb9d730 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index 22d92fd8c..2a7d1a5b9 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit==0.2.2", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 791b8d253..36901b88c 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description 
= "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-malware-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 11f001bfa..bb114e3d6 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index 186198d83..e20a62f7c 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 368287e5d..880c7c2c7 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.2 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index bf3e5f9f4..d2e820d99 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-proglang-select-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 38a829fab..fa739bfd0 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "repo_level_order_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml 
b/transforms/code/repo_level_ordering/ray/pyproject.toml index 80440a362..5fb561d67 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 7e30ee8b8..1fd927356 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 387c3bda7..e128df8b0 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index c24d0c3e2..144688f63 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 29b594fac..ed8f5d60b 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index 436d93ff3..f103b7269 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git 
a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index f39fd7e39..0ca4fb865 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 72406e945..f3abe0337 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index 2993d6b12..de76cb006 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index dc13d5f94..c1433d29b 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-doc_quality-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 4eb8b9de1..4eaef2fea 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "html2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index dfd0c3928..af6b64763 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt 
index af6ffe1e5..432362451 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index 873883e49..859706621 100644 --- a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index 151d05a3e..7e543b153 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,3 @@ -dpk-html2parquet-transform-python==0.2.2.dev2 -data-prep-toolkit[ray]==0.2.2.dev2 +dpk-html2parquet-transform-python==0.2.2 +data-prep-toolkit[ray]==0.2.2 trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index a89c54ab3..e853c2328 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 2ac84645d..5aed719c5 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index c5de6826a..43650a50a 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index a405f7afc..2cd053cfb 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index ac45a167e..6347bda71 100644 --- 
a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-lang_id-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 8992f1145..56e881b5e 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index c9cdbf652..395918ac3 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 2912af252..4d09ff394 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 2b414c59e..abec5044d 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]==0.2.2.dev2 +data-prep-toolkit[ray]==0.2.2 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 958210865..1fb9c95b9 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index b96f16615..b98b2c9af 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -11,7 +11,7 @@ authors = 
[ ] dependencies = [ "dpk_pii_redactor_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index e522737a1..bad5e24cd 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index f88fe9eef..5c762c2a1 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 87dad3c1c..62182b27b 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 2eb79e69b..32bf83692 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index ef08f697a..c6d49701b 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-text_encoder-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 3c1f64c32..3b853cbe7 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2.dev3" +version = "0.2.2" requires-python = ">=3.10,<3.13" keywords = 
["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 9012f685b..11d0decf5 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.2.dev2 +data-prep-toolkit[ray]>=0.2.2 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 8b48a970f..7317d33e3 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2.dev2 +data-prep-toolkit>=0.2.2 diff --git a/transforms/transform.config b/transforms/transform.config index c226171c6..7bafba684 100644 --- a/transforms/transform.config +++ b/transforms/transform.config @@ -7,11 +7,3 @@ # expected files and is used to define the transform's image name. TRANSFORM_NAME=data-prep-kit-transforms -################################################################################ -# This defines the transforms' package version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -TRANSFORMS_PKG_VERSION=0.2.2.dev2 diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index f41231159..7e1bd0b8e 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 0e2658087..a9e69f0bf 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index 368287e5d..880c7c2c7 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.2 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index 5a5941155..fc6a37b19 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - 
"dpk_doc_id_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk_doc_id_transform_python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 36f345c09..f50d4f70d 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2.dev2", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index ab46daadb..d878bd3e2 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 735104f20..67fd0f758 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index 75baaef62..45b4cfd50 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 9e3885e50..d74fa0637 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_ededup_transform_python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", + "dpk_ededup_transform_python==0.2.2", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 3156ab6f1..da431d030 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component 
specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 923cbdf82..7c59dcff9 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index b856b1007..4b122d98f 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index 64f148799..8e9bb2366 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 9f1feff29..5e3e783c8 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index a794a1a0b..a8ec7bb4d 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-filter-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 7b60dba46..85403487a 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - 
"data-prep-toolkit[spark]==0.2.2.dev2", + "data-prep-toolkit[spark]==0.2.2", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/kfp_ray.disable/hap_wf.py b/transforms/universal/hap/kfp_ray.disable/hap_wf.py index 786011d4d..8069ec181 100644 --- a/transforms/universal/hap/kfp_ray.disable/hap_wf.py +++ b/transforms/universal/hap/kfp_ray.disable/hap_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index 389788363..7b30dd72e 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index ba8948477..07c5f854a 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index abbb1a30c..6518e5277 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 34e1d6932..119167ca2 100644 --- a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.2.dev2 -dpk-hap-transform-python==0.2.2.dev2 +data-prep-toolkit[ray]==0.2.2 +dpk-hap-transform-python==0.2.2 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 3b102d205..737b60121 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index e8125328b..9dbdaf3b0 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp 
component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index 998161e31..e8c089ef0 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit==0.2.2", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index 5d475fe12..19fe77560 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-noop-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index f867fb070..495d827a0 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-noop-transform-python==0.2.2", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 914637895..ee6323d74 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "profiler_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 95775e3a6..117be53c0 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index 89801e4ad..fee352d4a 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ 
b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index 6060653fa..c9f1b1da3 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_profiler_transform_python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", + "dpk_profiler_transform_python==0.2.2", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 455684b4f..05602dc26 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-profiler-transform-python==0.2.2", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 0724ed731..0a9be8e95 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "resize_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 082f37f0c..836388694 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 368287e5d..880c7c2c7 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.2 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 1490303bb..4f7603f6f 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = 
">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-resize-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml b/transforms/universal/resize/spark/pyproject.toml index 538c12d20..c8bb67111 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-resize-transform-python==0.2.2", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index c131d11ea..243cac6be 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -23,7 +23,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files # path to kfp component specifications files diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index bc352f0fd..021a1427f 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index 5e00dbaa1..afd567d8b 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index 095cb63e0..3cc4bcf80 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-tokenization-transform-python==0.2.2", + 
"data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index 5c989591d..dfb74a6ca 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2.dev2 -data_prep_connector>=0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 +data_prep_connector>=0.2.3 \ No newline at end of file From 0587637771e36cbb099465fc28e6d388c1bc9b8e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 25 Nov 2024 20:18:16 -0500 Subject: [PATCH 102/105] update release notes Signed-off-by: Maroun Touma --- release-notes.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/release-notes.md b/release-notes.md index 15f23c542..4b7b8d553 100644 --- a/release-notes.md +++ b/release-notes.md @@ -1,5 +1,42 @@ # Data Prep Kit Release notes +## Release 0.2.2 - 11/25/2024 + +### General +1. Update RAG example to use granite model +1. Updated transforms with Docling 2 +1. Added single package for dpk with extra for \[spark\] and \[ray\] +1. Added single package for transforms with extra for \[all\] or \[individual-transform-name\] + + +### data-prep-toolkit libraries (python, ray, spark) + +1. Fix metadata logging even when actors crash +1. Add multilock for ray workers downloads/cleanup +1. Multiple updates to spark runtime +1. Added support for python 3.12 +1. refactoring of data access code + + +### KFP Workloads + +1. Modify superpipeline params type Str/json +1. Set kuberay apiserver version +1. Add Super pipeline for code transforms + + +### Transforms + +1. Enhance pdf2parquet with docling2 support for extracting HTML, DOCS, etc. +1. Added web2parquet transform +1. Added HAP transform + +### HTTP Connector 0.2.3 + +1. Enhanced parameter/configuration allows the user to customize crawler settings +1. 
implement subdomain focus feature in data-prep-connector + + ## Release 0.2.2- HTTP Connector Module - 10/23/2024 ### General From a067e55e13fede9dbb30cccc2a74b4b441a961e7 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 25 Nov 2024 21:41:15 -0500 Subject: [PATCH 103/105] Setup dev after new release Signed-off-by: Maroun Touma --- .make.versions | 8 ++++---- data-connector-lib/pyproject.toml | 2 +- data-processing-lib/pyproject.toml | 2 +- data-processing-lib/spark/pyproject.toml | 4 ++-- kfp/kfp_ray_components/createRayClusterComponent.yaml | 2 +- kfp/kfp_ray_components/deleteRayClusterComponent.yaml | 2 +- kfp/kfp_ray_components/executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- kfp/kfp_ray_components/executeSubWorkflowComponent.yaml | 2 +- .../kfp_v1_workflow_support/pyproject.toml | 4 ++-- .../kfp_v2_workflow_support/pyproject.toml | 4 ++-- .../shared_workflow_support/pyproject.toml | 4 ++-- transforms/code/code2parquet/kfp_ray/code2parquet_wf.py | 2 +- transforms/code/code2parquet/python/pyproject.toml | 2 +- transforms/code/code2parquet/python/requirements.txt | 2 +- transforms/code/code2parquet/ray/pyproject.toml | 6 +++--- transforms/code/code_profiler/python/pyproject.toml | 2 +- transforms/code/code_profiler/python/requirements.txt | 2 +- transforms/code/code_profiler/ray/pyproject.toml | 6 +++--- transforms/code/code_quality/kfp_ray/code_quality_wf.py | 2 +- transforms/code/code_quality/python/pyproject.toml | 2 +- transforms/code/code_quality/python/requirements.txt | 2 +- transforms/code/code_quality/ray/pyproject.toml | 6 +++--- .../code/header_cleanser/kfp_ray/header_cleanser_wf.py | 2 +- transforms/code/header_cleanser/python/pyproject.toml | 2 +- transforms/code/header_cleanser/python/requirements.txt | 2 +- transforms/code/header_cleanser/ray/pyproject.toml | 6 +++--- .../code/license_select/kfp_ray/license_select_wf.py | 2 +- transforms/code/license_select/python/pyproject.toml | 2 +- transforms/code/license_select/python/requirements.txt | 2 +- transforms/code/license_select/ray/pyproject.toml | 6 +++--- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- transforms/code/malware/python/pyproject.toml | 4 ++-- transforms/code/malware/ray/pyproject.toml | 6 +++--- .../code/proglang_select/kfp_ray/proglang_select_wf.py | 2 +- transforms/code/proglang_select/python/pyproject.toml | 2 +- transforms/code/proglang_select/python/requirements.txt | 2 +- transforms/code/proglang_select/ray/pyproject.toml | 6 +++--- .../repo_level_ordering/kfp_ray/repo_level_order_wf.py | 2 +- transforms/code/repo_level_ordering/ray/pyproject.toml | 4 ++-- .../language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py | 2 +- transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- transforms/language/doc_chunk/python/requirements.txt | 2 +- transforms/language/doc_chunk/ray/pyproject.toml | 2 +- .../doc_quality/kfp_ray/doc_quality_multiple_wf.py | 2 +- transforms/language/doc_quality/kfp_ray/doc_quality_wf.py | 2 +- transforms/language/doc_quality/python/pyproject.toml | 2 +- transforms/language/doc_quality/python/requirements.txt | 2 +- transforms/language/doc_quality/ray/pyproject.toml | 6 +++--- .../language/html2parquet/kfp_ray/html2parquet_wf.py | 2 +- transforms/language/html2parquet/python/pyproject.toml | 2 +- transforms/language/html2parquet/python/requirements.txt | 2 +- transforms/language/html2parquet/ray/pyproject.toml | 2 +- transforms/language/html2parquet/ray/requirements.txt | 4 ++-- .../language/lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- 
transforms/language/lang_id/kfp_ray/lang_id_wf.py | 2 +- transforms/language/lang_id/python/pyproject.toml | 2 +- transforms/language/lang_id/python/requirements.txt | 2 +- transforms/language/lang_id/ray/pyproject.toml | 6 +++--- .../pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py | 2 +- transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- transforms/language/pdf2parquet/python/requirements.txt | 2 +- transforms/language/pdf2parquet/ray/requirements.txt | 2 +- transforms/language/pii_redactor/python/requirements.txt | 2 +- transforms/language/pii_redactor/ray/pyproject.toml | 6 +++--- .../text_encoder/kfp_ray/text_encoder_multiple_wf.py | 2 +- .../language/text_encoder/kfp_ray/text_encoder_wf.py | 2 +- transforms/language/text_encoder/python/pyproject.toml | 2 +- transforms/language/text_encoder/python/requirements.txt | 2 +- transforms/language/text_encoder/ray/pyproject.toml | 6 +++--- transforms/pyproject.toml | 2 +- transforms/requirements-ray.txt | 2 +- transforms/requirements.txt | 2 +- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- transforms/universal/doc_id/python/pyproject.toml | 2 +- transforms/universal/doc_id/python/requirements.txt | 2 +- transforms/universal/doc_id/ray/pyproject.toml | 6 +++--- transforms/universal/doc_id/spark/pyproject.toml | 4 ++-- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- transforms/universal/ededup/python/pyproject.toml | 2 +- transforms/universal/ededup/python/requirements.txt | 2 +- transforms/universal/ededup/ray/pyproject.toml | 6 +++--- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- transforms/universal/fdedup/ray/pyproject.toml | 4 ++-- transforms/universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/filter/python/pyproject.toml | 2 +- transforms/universal/filter/python/requirements.txt | 2 +- transforms/universal/filter/ray/pyproject.toml | 6 +++--- transforms/universal/filter/spark/pyproject.toml | 4 ++-- transforms/universal/hap/kfp_ray.disable/hap_wf.py | 2 +- transforms/universal/hap/python/pyproject.toml | 2 +- transforms/universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/pyproject.toml | 2 +- transforms/universal/hap/ray/requirements.txt | 4 ++-- transforms/universal/noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- transforms/universal/noop/python/pyproject.toml | 4 ++-- transforms/universal/noop/ray/pyproject.toml | 6 +++--- transforms/universal/noop/spark/pyproject.toml | 6 +++--- transforms/universal/profiler/kfp_ray/profiler_wf.py | 2 +- transforms/universal/profiler/python/pyproject.toml | 2 +- transforms/universal/profiler/python/requirements.txt | 2 +- transforms/universal/profiler/ray/pyproject.toml | 6 +++--- transforms/universal/profiler/spark/pyproject.toml | 6 +++--- transforms/universal/resize/kfp_ray/resize_wf.py | 2 +- transforms/universal/resize/python/pyproject.toml | 2 +- transforms/universal/resize/python/requirements.txt | 2 +- transforms/universal/resize/ray/pyproject.toml | 6 +++--- transforms/universal/resize/spark/pyproject.toml | 6 +++--- .../universal/tokenization/kfp_ray/tokenization_wf.py | 2 +- transforms/universal/tokenization/python/pyproject.toml | 2 +- transforms/universal/tokenization/python/requirements.txt | 2 +- transforms/universal/tokenization/ray/pyproject.toml | 6 +++--- transforms/universal/web2parquet/requirements.txt | 2 +- 114 files changed, 171 insertions(+), 171 deletions(-) diff --git a/.make.versions b/.make.versions index e3a8e8239..bd01a60d7 100644 --- a/.make.versions 
+++ b/.make.versions
@@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0
 # The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release.
 DPK_MINOR_VERSION=2
 # The minor version is incremented AUTOMATICALLY by the release.sh script when a new release is set.
-DPK_MICRO_VERSION=2
+DPK_MICRO_VERSION=3
 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches.
 # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi.
-DPK_VERSION_SUFFIX=
+DPK_VERSION_SUFFIX=.dev0
 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX)
@@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION)
 KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION)
 KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION)
-DPK_CONNECTOR_VERSION=0.2.3
+DPK_CONNECTOR_VERSION=0.2.3.dev0
 ################## ################## ################## ################## ################## ##################
 # Begin versions that the repo depends on.
@@ -66,4 +66,4 @@ endif
 #
 # If you change the versions numbers, be sure to run "make set-versions" to
 # update version numbers across the transform (e.g., pyproject.toml).
-TRANSFORMS_PKG_VERSION=0.2.2
+TRANSFORMS_PKG_VERSION=0.2.3.dev0
diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml
index d3d213946..eaf459a07 100644
--- a/data-connector-lib/pyproject.toml
+++ b/data-connector-lib/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_connector"
-version = "0.2.3"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 keywords = [
     "data",
diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml
index 36e4e155f..40bf6b2a1 100644
--- a/data-processing-lib/pyproject.toml
+++ b/data-processing-lib/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.2.2"
+version = "0.2.3.dev0"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 requires-python = ">=3.10,<3.13"
 description = "Data Preparation Toolkit Library for Ray and Python"
diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml
index c0be43920..55c5a5e9e 100644
--- a/data-processing-lib/spark/pyproject.toml
+++ b/data-processing-lib/spark/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_spark"
-version = "0.2.2"
+version = "0.2.3.dev0"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 requires-python = ">=3.10,<3.13"
 description = "Data Preparation Toolkit Library for Spark"
@@ -11,7 +11,7 @@ authors = [
     { name = "Boris Lublinsky", email = "blublinsk@ibm.com" },
 ]
 dependencies = [
-    "data-prep-toolkit==0.2.2",
+    "data-prep-toolkit==0.2.3.dev0",
     "pyspark>=3.5.2",
     "psutil>=6.0.0",
     "PyYAML>=6.0.2"
diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml
index 78976a97c..30b0b66d8 100644
--- a/kfp/kfp_ray_components/createRayClusterComponent.yaml
+++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml
@@ -11,7 +11,7 @@ inputs:
 
 implementation:
   container:
-    image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2"
+    image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
    # command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index c75554d5f..44e199c47 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -9,7 +9,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 2e02c3adf..7ab517bff 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 37c0198bf..9b98912f0 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index ec82e9484..6b261a003 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index daa903aaf..f09b2f32a 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 61f54663f..01c5b3e17 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index 3ba7491bc..aa7a6dd3a 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index 3e5f262b9..f3f491e4b 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index d4f8c11cf..be84b2f20 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index 4a217ff8c..cec7f9c5f 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index 98b2e3a65..d56fed1e8 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { 
name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", - "dpk-code2parquet-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-code2parquet-transform-python==0.2.3.dev0", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index d3c2c2196..334c86fed 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 31509b291..27706b467 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 0c9457efc..9b760c1c3 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-code-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 7f5aa9768..6a4ccec1b 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index d7b452d6b..17cbce67d 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index a50ddff5c..ef627d39f 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml 
b/transforms/code/code_quality/ray/pyproject.toml index ea6aad8ae..eceee32ed 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-code-quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 5049a9c11..9bb315569 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index 2dadeaf02..3703ec55f 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index fd3fc0de4..915a462dc 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index 471ce1d5e..5fb1bcf26 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-header-cleanser-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 9bdcc6e96..7dba0d9d1 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = 
"quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index b445c6b09..3345d3a5a 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 880c7c2c7..2f67f6a80 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index b2c56e940..ce5979d62 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-license-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 89eb9d730..bede80b88 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index 2a7d1a5b9..a1bc05ab4 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2", + "data-prep-toolkit==0.2.3.dev0", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 36901b88c..659ee62ef 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description 
= "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-malware-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index bb114e3d6..11f001bfa 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index e20a62f7c..e5736a9c7 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 880c7c2c7..2f67f6a80 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index d2e820d99..d8288d189 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-proglang-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index fa739bfd0..38a829fab 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "repo_level_order_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml 
b/transforms/code/repo_level_ordering/ray/pyproject.toml index 5fb561d67..9581c8941 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 1fd927356..7e30ee8b8 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index e128df8b0..387c3bda7 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index 144688f63..207ab9249 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index ed8f5d60b..4fb356038 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index f103b7269..436d93ff3 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git 
a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index 0ca4fb865..f39fd7e39 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index f3abe0337..23538b8c7 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index de76cb006..4aa2d8111 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index c1433d29b..ec56ac2c7 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-doc_quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 4eaef2fea..4eb8b9de1 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "html2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index af6b64763..3a7a6efbc 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt 
index 432362451..f21e65774 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index 859706621..5e888748c 100644 --- a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index 7e543b153..9aa193432 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,3 @@ -dpk-html2parquet-transform-python==0.2.2 -data-prep-toolkit[ray]==0.2.2 +dpk-html2parquet-transform-python==0.2.3.dev0 +data-prep-toolkit[ray]==0.2.3.dev0 trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index e853c2328..a89c54ab3 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 5aed719c5..2ac84645d 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index 43650a50a..a69724a2d 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index 2cd053cfb..06bec1ab9 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index 6347bda71..dba929905 100644 --- 
a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-lang_id-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 56e881b5e..8992f1145 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 395918ac3..c9cdbf652 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 4d09ff394..310909164 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index abec5044d..34831cde8 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]==0.2.2 +data-prep-toolkit[ray]==0.2.3.dev0 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 1fb9c95b9..0abcc1d96 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index b98b2c9af..4549851d0 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] 
name = "dpk_pii_redactor_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2", + "dpk_pii_redactor_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index bad5e24cd..e522737a1 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 5c762c2a1..f88fe9eef 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 62182b27b..dc15beb6e 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 32bf83692..3ac880bba 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index c6d49701b..f1b2c09d5 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-text_encoder-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", 
] [build-system] diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 3b853cbe7..57a2908c2 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 11d0decf5..b0527bdd6 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.2 +data-prep-toolkit[ray]>=0.2.3.dev0 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 7317d33e3..934c95182 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 7e1bd0b8e..f41231159 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index a9e69f0bf..1a962662d 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index 880c7c2c7..2f67f6a80 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index fc6a37b19..da34dded3 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk_doc_id_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index f50d4f70d..369a1bb72 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ 
b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index d878bd3e2..ab46daadb 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 67fd0f758..da28e715f 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index 45b4cfd50..aa73a106a 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index d74fa0637..424e220fd 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", - "dpk_ededup_transform_python==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_ededup_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index da431d030..3156ab6f1 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 7c59dcff9..ee69ac81b 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" 
-version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 4b122d98f..b856b1007 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index 8e9bb2366..fcf0f6419 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 5e3e783c8..100626f60 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index a8ec7bb4d..64776e0c1 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-filter-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 85403487a..ef46c9a1b 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/kfp_ray.disable/hap_wf.py b/transforms/universal/hap/kfp_ray.disable/hap_wf.py index 8069ec181..786011d4d 100644 --- a/transforms/universal/hap/kfp_ray.disable/hap_wf.py +++ b/transforms/universal/hap/kfp_ray.disable/hap_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: 
str = "hap_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index 7b30dd72e..bf7c85577 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 07c5f854a..1250d1f77 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index 6518e5277..38e78938b 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 119167ca2..7c4c8eb94 100644 --- a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.2 -dpk-hap-transform-python==0.2.2 +data-prep-toolkit[ray]==0.2.3.dev0 +dpk-hap-transform-python==0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 737b60121..3b102d205 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 9dbdaf3b0..e8125328b 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index e8c089ef0..ff9a24244 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version 
= "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2", + "data-prep-toolkit==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index 19fe77560..da9327917 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index 495d827a0..d3cd47bf6 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2", - "data-prep-toolkit[spark]==0.2.2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index ee6323d74..914637895 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "profiler_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 117be53c0..39d9788f8 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index fee352d4a..526140ada 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index c9f1b1da3..ac8d729ec 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] 
name = "dpk_profiler_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", - "dpk_profiler_transform_python==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_profiler_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 05602dc26..6ba790301 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.2", - "data-prep-toolkit[spark]==0.2.2", + "dpk-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 0a9be8e95..0724ed731 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "resize_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 836388694..6fdad69d0 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 880c7c2c7..2f67f6a80 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 4f7603f6f..c266a39f4 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml 
b/transforms/universal/resize/spark/pyproject.toml index c8bb67111..7de14c673 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2", - "data-prep-toolkit[spark]==0.2.2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 243cac6be..c131d11ea 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -23,7 +23,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files # path to kfp component specifications files diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index 021a1427f..dbb8e84ba 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index afd567d8b..8a1920162 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index 3cc4bcf80..c094b9e7e 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-tokenization-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index dfb74a6ca..1af3f12a4 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 data_prep_connector>=0.2.3 \ No newline at end of 
file From 0509fb1ad50a84939d4635b5953d82069e9a4a36 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 26 Nov 2024 04:59:46 -0500 Subject: [PATCH 104/105] bump up connector version after code release Signed-off-by: Maroun Touma --- .make.versions | 2 +- data-connector-lib/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.make.versions b/.make.versions index bd01a60d7..564caa214 100644 --- a/.make.versions +++ b/.make.versions @@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION) KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION) KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION) -DPK_CONNECTOR_VERSION=0.2.3.dev0 +DPK_CONNECTOR_VERSION=0.2.4.dev0 ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml index eaf459a07..69e914f0c 100644 --- a/data-connector-lib/pyproject.toml +++ b/data-connector-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_connector" -version = "0.2.3.dev0" +version = "0.2.4.dev0" requires-python = ">=3.10,<3.13" keywords = [ "data", From 7ae1f135ccc3ba7bb4cc4ff500b0e070b7d30b7b Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 26 Nov 2024 07:01:52 -0500 Subject: [PATCH 105/105] remove reference to noop transform project Signed-off-by: Maroun Touma --- data-processing-lib/pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 40bf6b2a1..a347a14a1 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -16,7 +16,6 @@ dynamic = ["dependencies", "optional-dependencies"] Repository = "https://github.com/IBM/data-prep-kit" Issues = "https://github.com/IBM/data-prep-kit/issues" Documentation = "https://ibm.github.io/data-prep-kit/doc" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
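
Note: the bumps above must leave every transform's pyproject.toml and requirements.txt agreeing on the same data-prep-toolkit version. Below is a minimal sketch of a consistency check one could run after such a sweep; it is illustrative only and not part of any patch above. The EXPECTED constant, the verify_pins.py name, and the transforms/ layout are assumptions inferred from the paths visible in these diffs.

# verify_pins.py -- illustrative sketch, not part of the patch series above.
# Scans requirements.txt and pyproject.toml files under transforms/ and
# reports any data-prep-toolkit pin that does not match the expected version.
import re
import sys
from pathlib import Path

EXPECTED = "0.2.3.dev0"  # assumed target version for this patch series
# Matches pins like data-prep-toolkit==X, data-prep-toolkit[ray]>=X, etc.
PIN = re.compile(r"data-prep-toolkit(?:\[[\w,]+\])?\s*[>=]=\s*([\w.]+)")

def check(root: Path) -> int:
    mismatches = 0
    for pattern in ("requirements.txt", "pyproject.toml"):
        for path in root.rglob(pattern):
            for match in PIN.finditer(path.read_text(encoding="utf-8")):
                if match.group(1) != EXPECTED:
                    print(f"{path}: pinned to {match.group(1)}, expected {EXPECTED}")
                    mismatches += 1
    return mismatches

if __name__ == "__main__":
    # Usage: python verify_pins.py [root-dir]; defaults to transforms/
    root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("transforms")
    sys.exit(1 if check(root) else 0)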