From 47f4526cd5217dd55a88185fdc99c93fed00953e Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 19:05:39 +0100 Subject: [PATCH 01/91] added folder_transform --- .../pure_python/transform_file_processor.py | 15 ++++-- .../pure_python/transform_orchestrator.py | 42 ++++++++++------ .../runtime/transform_file_processor.py | 41 ++++++++------- .../src/data_processing/transform/__init__.py | 2 + .../transform/abstract_transform.py | 16 ++++++ .../transform/binary_transform.py | 5 +- .../transform/folder_transform.py | 50 +++++++++++++++++++ .../runtime/ray/transform_file_processor.py | 1 + .../runtime/ray/transform_orchestrator.py | 19 ++++--- .../runtime/spark/transform_file_processor.py | 5 +- .../runtime/spark/transform_orchestrator.py | 25 +++++++--- 11 files changed, 168 insertions(+), 53 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/transform/abstract_transform.py create mode 100644 data-processing-lib/python/src/data_processing/transform/folder_transform.py diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index 143835dd0..fa3e69e4a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -14,7 +14,7 @@ from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime import AbstractTransformFileProcessor -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractTransform, TransformStatistics from data_processing.utils import UnrecoverableException @@ -28,7 +28,8 @@ def __init__( data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: 
type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ): """ Init method @@ -36,11 +37,13 @@ def __init__( :param statistics - reference to statistics class :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag """ # invoke superclass super().__init__( data_access_factory=data_access_factory, transform_parameters=dict(transform_params), + is_folder=is_folder, ) self.transform_params["statistics"] = statistics # Create local processor @@ -52,7 +55,8 @@ def __init__( # Create statistics self.stats = statistics - def _publish_stats(self, stats: dict[str, Any]) -> None: + +def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) @@ -65,17 +69,20 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool ): """ Init method :param data_access_factory - data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder tranform flag """ super().__init__( data_access_factory=data_access_factory, transform_parameters=dict(transform_params), + is_folder=is_folder, ) # Add data access and statistics to the processor parameters self.transform_params["data_access"] = self.data_access diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 8692da29e..153eaaf0a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,7 +24,7 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from 
data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger @@ -48,8 +48,6 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } - - def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -74,15 +72,21 @@ def orchestrate( return 1 # create additional execution parameters runtime = runtime_config.create_transform_runtime() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - if retries > 0: - statistics.add_stats({"data access retries": retries}) - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + if retries > 0: + statistics.add_stats({"data access retries": retries}) + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -99,6 +103,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) else: # using sequential execution @@ -111,6 +116,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) 
status = "success" return_code = 0 @@ -157,7 +163,8 @@ def _process_transforms( data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ) -> None: """ Process transforms sequentially @@ -167,9 +174,8 @@ def _process_transforms( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag :return: metadata for the execution - - :return: None """ # create executor executor = PythonTransformFileProcessor( @@ -177,6 +183,7 @@ def _process_transforms( statistics=statistics, transform_params=transform_params, transform_class=transform_class, + is_folder=is_folder, ) # process data t_start = time.time() @@ -203,6 +210,7 @@ def _process_transforms_multiprocessor( data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractBinaryTransform], + is_folder: bool ) -> TransformStatistics: """ Process transforms using multiprocessing pool @@ -212,13 +220,17 @@ def _process_transforms_multiprocessor( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform class :return: metadata for the execution """ # result statistics statistics = TransformStatistics() # create processor processor = PythonPoolTransformFileProcessor( - data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class + data_access_factory=data_access_factory, + transform_params=transform_params, + transform_class=transform_class, + is_folder=is_folder, ) completed = 0 t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py 
b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index d4ec548d8..1d268875f 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -26,11 +26,13 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_parameters: dict[str, Any], + is_folder: bool = False, ): """ Init method :param data_access_factory: Data Access Factory :param transform_parameters: Transform parameters + :param is_folder: folder transform flag """ self.logger = get_logger(__name__) # validate parameters @@ -46,6 +48,7 @@ def __init__( # Add data access and statistics to the processor parameters self.transform_params = transform_parameters self.transform_params["data_access"] = self.data_access + self.is_folder = is_folder def process_file(self, f_name: str) -> None: """ @@ -58,25 +61,29 @@ def process_file(self, f_name: str) -> None: self.logger.warning("No data_access found. Returning.") return t_start = time.time() - # Read source file - filedata, retries = self.data_access.get_file(path=f_name) - if retries > 0: - self._publish_stats({"data access retries": retries}) - if filedata is None: - self.logger.warning(f"File read resulted in None for {f_name}. Returning.") - self._publish_stats({"failed_reads": 1}) - return - self._publish_stats({"source_files": 1, "source_size": len(filedata)}) + if not self.is_folder: + # Read source file only if we are processing file + filedata, retries = self.data_access.get_file(path=f_name) + if retries > 0: + self._publish_stats({"data access retries": retries}) + if filedata is None: + self.logger.warning(f"File read resulted in None for {f_name}. 
Returning.") + self._publish_stats({"failed_reads": 1}) + return + self._publish_stats({"source_files": 1, "source_size": len(filedata)}) # Process input file try: - # execute local processing - name_extension = TransformUtils.get_file_extension(f_name) self.logger.debug(f"Begin transforming file {f_name}") - out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + if not self.is_folder: + # execute local processing + out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + name_extension = TransformUtils.get_file_extension(f_name) + self.last_file_name = name_extension[0] + self.last_file_name_next_index = None + self.last_extension = name_extension[1] + else: + out_files, stats = self.transform.transform(folder_name=f_name) self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") - self.last_file_name = name_extension[0] - self.last_file_name_next_index = None - self.last_extension = name_extension[1] # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) # Process unrecoverable exceptions @@ -95,10 +102,10 @@ def flush(self) -> None: the hook for them to return back locally stored data and their statistics. :return: None """ - if self.last_file_name is None: + if self.last_file_name is None or self.is_folder: # for some reason a given worker never processed anything. 
Happens in testing # when the amount of workers is greater than the amount of files - self.logger.debug("skipping flush, no name for file is defined") + self.logger.debug("skipping flush, no name for file is defined or this is a folder transform") return try: t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 6af43ad60..20254e47b 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -1,3 +1,5 @@ +from data_processing.transform.abstract_transform import AbstractTransform +from data_processing.transform.folder_transform import AbstractFolderTransform from data_processing.transform.binary_transform import AbstractBinaryTransform from data_processing.transform.table_transform import AbstractTableTransform from data_processing.transform.transform_statistics import TransformStatistics diff --git a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py new file mode 100644 index 000000000..89db70f42 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py @@ -0,0 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +class AbstractTransform: + """ + Base class for all transform types + """ \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/binary_transform.py b/data-processing-lib/python/src/data_processing/transform/binary_transform.py index 80dff61ea..b313aff2f 100644 --- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py @@ -10,10 +10,11 @@ # limitations under the License. ################################################################################ -from typing import Any, TypeVar +from typing import Any +from data_processing.transform import AbstractTransform -class AbstractBinaryTransform: +class AbstractBinaryTransform(AbstractTransform): """ Converts input binary file to output file(s) (binary) Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py new file mode 100644 index 000000000..866e3286f --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from typing import Any +from data_processing.data_access import data_access +from data_processing.transform import AbstractTransform + + +class AbstractFolderTransform(AbstractTransform): + """ + Converts input folder to output file(s) (binary) + Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or + more new binary files and metadata. + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This simply stores the given instance in this instance for later use. + """ + self.config = config + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + """ + Converts input folder into o or more output files. + If there is an error, an exception must be raised - exit()ing is not generally allowed. + :param folder_name: the name of the folder containing arbitrary amount of files. + :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated + to metadata. Each element of the return list, is a tuple of the transformed bytes and a string + holding the extension to be used when writing out the new bytes. + """ + raise NotImplemented() + + @staticmethod + def get_folders(data_access:data_access) -> list(str): + """ + Compute the list of folders to use. 
+ :param data_access - data access class + :return: + """ + raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py index e1fabb144..cdad1309f 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py @@ -35,6 +35,7 @@ def __init__(self, params: dict[str, Any]): super().__init__( data_access_factory=params.get("data_access_factory", None), transform_parameters=dict(params.get("transform_params", {})), + is_folder=params.get("is_folder", False) ) # Create statistics self.stats = params.get("statistics", None) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 42eba47a6..8276eb56c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -16,6 +16,7 @@ import ray from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import AbstractFolderTransform from data_processing_ray.runtime.ray import ( RayTransformExecutionConfiguration, RayTransformFileProcessor, @@ -56,13 +57,18 @@ def orchestrate( # create transformer runtime runtime = runtime_config.create_transform_runtime() resources = RayUtils.get_cluster_resources() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: 
+ # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -84,6 +90,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), "statistics": statistics, + "is_folder": is_folder, } logger.debug("Creating actors") processors = RayUtils.create_actors( diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index d63664ac4..a0968ab1d 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -29,12 +29,15 @@ def __init__( data_access_factory: DataAccessFactoryBase, runtime_configuration: SparkTransformRuntimeConfiguration, statistics: TransformStatistics, + is_folder: bool, ): """ Init method """ super().__init__( - data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params() + data_access_factory=data_access_factory, + transform_parameters=runtime_configuration.get_transform_params(), + is_folder=is_folder, ) # Add data access ant statistics to the processor parameters self.runtime_configuration = runtime_configuration diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 57a6c58fc..11589dbaf 100644 --- 
a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -15,7 +15,7 @@ from datetime import datetime from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics +from data_processing.transform import TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger from data_processing_spark.runtime.spark import ( SparkTransformFileProcessor, @@ -68,7 +68,10 @@ def process_partition(iterator): runtime = runtime_conf.create_transform_runtime() # create file processor file_processor = SparkTransformFileProcessor( - data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics + data_access_factory=d_access_factory, + runtime_configuration=runtime_conf, + statistics=statistics, + is_folder=is_folder, ) first = True for f in iterator: @@ -92,13 +95,19 @@ def process_partition(iterator): return list(statistics.get_execution_stats().items()) num_partitions = 0 + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # process data logger.debug("Begin processing files") # process files split by 
partitions From 5fd20a125a71a40d6db7dc958dce50321369f3c0 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 19:13:01 +0100 Subject: [PATCH 02/91] added folder_transform --- .../runtime/pure_python/transform_orchestrator.py | 2 +- .../python/src/data_processing/transform/folder_transform.py | 4 ++-- .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +- .../runtime/spark/transform_orchestrator.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 153eaaf0a..d51f80a8a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. 
- :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 11589dbaf..a4c0c5835 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -99,7 +99,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From 38b47259977fbe64ead50231a52660e375625add Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 03/91] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- 
.../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def _publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 
+76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, 
data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. 
- :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From a3abf21cda7e280f7089555bc974058d193b502f Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 
Oct 2024 08:48:00 +0100 Subject: [PATCH 04/91] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index a4c0c5835..c404559d8 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -99,7 +99,8 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index f16b09520..3c9fca76f 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, 
partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From af8475df9648a76cb268b284f60de3597fa579c8 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 11 Oct 2024 10:20:48 -0400 Subject: [PATCH 05/91] Fuzzy dedup pure python implementation Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/README.md | 11 + .../universal/fdedup/python/pyproject.toml | 55 ++ .../universal/fdedup/python/src/Murmur_MH.py | 99 ++++ .../src/cluster_analysis_local_python.py | 46 ++ .../python/src/cluster_analysis_transform.py | 229 ++++++++ .../src/cluster_analysis_transform_python.py | 44 ++ .../python/src/data_cleaning_local_python.py | 56 ++ .../python/src/data_cleaning_transform.py | 150 ++++++ .../src/data_cleaning_transform_python.py | 83 +++ .../fdedup/python/src/file_copy_util.py | 158 ++++++ .../fdedup/python/src/service_orchestrator.py | 265 +++++++++ .../python/src/signature_calc_local_python.py | 60 +++ .../python/src/signature_calc_transform.py | 504 ++++++++++++++++++ .../src/signature_calc_transform_python.py | 44 ++ 14 files changed, 1804 insertions(+) create mode 100644 transforms/universal/fdedup/python/README.md create mode 100644 transforms/universal/fdedup/python/pyproject.toml create mode 100644 transforms/universal/fdedup/python/src/Murmur_MH.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_local_python.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_transform.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_local_python.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_transform.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_transform_python.py create mode 100644 transforms/universal/fdedup/python/src/file_copy_util.py create mode 100644 
transforms/universal/fdedup/python/src/service_orchestrator.py create mode 100644 transforms/universal/fdedup/python/src/signature_calc_local_python.py create mode 100644 transforms/universal/fdedup/python/src/signature_calc_transform.py create mode 100644 transforms/universal/fdedup/python/src/signature_calc_transform_python.py diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md new file mode 100644 index 000000000..34f18c73b --- /dev/null +++ b/transforms/universal/fdedup/python/README.md @@ -0,0 +1,11 @@ +# Fuzzy Dedup + +Please see the set of +[transform project conventions](../../../README.md) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary + +The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see +[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details. \ No newline at end of file diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml new file mode 100644 index 000000000..f2b9d8268 --- /dev/null +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -0,0 +1,55 @@ +[project] +name = "dpk_fdedup_transform_python" +version = "0.3.0.dev0" +requires-python = ">=3.10" +description = "Fuzzy Dedup Transform for Python" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, +] +dependencies = [ + "data-prep-toolkit==0.2.2.dev0", + "pyarrow==16.1.0", + "pyyaml>=6.0.2", + "boto3>=1.34.69", + "kubernetes>=30.1.0", + "polars>=1.6.0", + "disjoint-set>=0.8.0", + "scipy>=1.14.1", + "numpy<1.29.0", + "sentencepiece>=0.2.0", + "mmh3>=4.1.0", +] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend 
= "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", + "moto==5.0.5", + "markupsafe==2.0.1", +] + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py new file mode 100644 index 000000000..e3442ba02 --- /dev/null +++ b/transforms/universal/fdedup/python/src/Murmur_MH.py @@ -0,0 +1,99 @@ +import logging +import os +from typing import List, Set + +import mmh3 +import numpy as np + + +class Murmur_MH: + def __init__(self, num_perm=64, seed=42, hashfunc=None): + self.seed = seed + self.num_perm = num_perm # the number of buckets, i.e. 
the vector length after self.minhash() call + self.permutations = self._init_permutations(seed, num_perm) + + def _init_permutations(self, seed, num_perm): + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + max_int = np.uint64((1 << 64) - 1) + # initialize pseudo random number generator with given seed value + gen = np.random.RandomState(seed) + # get self.num_perm pseudo random numbers between 2 and max_int (excl) + permutations = np.array( + [gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], + dtype=np.uint64, + ).T + # make all even pseudo random numbers odd by adding 1 + permutations[permutations % 2 == 0] += 1 + return permutations + + def minhash(self, shingles: List[str]): + """return np.array of minhash""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0) + ) + + def minhash2(self, shingles: List[str], doc_len: int): + """ + for each shingle (i.e. a group of k-words) it generates a digest value based on + mmh3-hash function (32-bit) + + return tuple (A, B) + A = an array of values = np.array of minhash + B = document_length = number of characters""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0), + doc_len, + ) + + def minhash2_nosalt(self, shingles: List[str], doc_len: int, doc_id: int): + """ + for each shingle (i.e. 
a group of k-words) it generates a digest value based on + mmh3-hash function (32-bit) + + return tuple (A, B) + A = an array of values = np.array of minhash + B = document_length = number of characters""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0) + .tolist(), + doc_len, + doc_id, + ) + + @staticmethod + def jaccard(mh1: np.array, mh2: np.array) -> float: + """ + The Jaccard similarity measures the similarity between two sets of data + to see which members are shared and distinct. + + The Jaccard similarity is calculated by dividing the number of observations + in both sets by the number of observations in either set. + + Developed by Paul Jaccard, the index ranges from 0 to 1. + The closer to 1, the more similar the two sets of data. + + As a document is represented by a set. We use Jaccard distance to see how similar between two documents. + """ + assert len(mh1) == len(mh2) + return np.count_nonzero(mh1 == mh2) / len(mh1) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py new file mode 100644 index 000000000..dcfc9a7e4 --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} +if __name__ == "__main__": + # Set the simulated command line args + # sys.argv = ParamsUtils.dict_to_req(d=params) + # print(sys.argv) + # create launcher + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py new file mode 100644 index 000000000..5ad18362a --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -0,0 +1,229 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ +import os +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, get_logger +from Murmur_MH import Murmur_MH + + +short_name = "cluster" +cli_prefix = f"{short_name}_" + +# configuration keys +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" + +# command line arguments +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" + +captured_arg_keys = [ + jaccard_similarity_threshold_key, +] + +# defaults +jaccard_similarity_threshold_default = 0.8 +""" Default Jaccard similarity threshold above which two documents are duplicates""" + + +class ClusterAnalysisTransform(AbstractTableTransform): + """ + This is the second transform of the fuzzy dedup pipeline. It runs in parallel: + for each band, the hashing interval is divided into segments. A cluster analysis + uses as input all the parquet files from segment of a band. The `bands` output + of the signature calculation, the first transform in the fuzzy dedup pipeline + contains all the data for a given segment s of a specific band b in the + subfolder `bands/band=b/segment=s`. + The transform loads all the parquet files in the `bands/band=b/segment=s` + subfolder. Each one of these parquet files has two columns: the `band_hash` + and a `data` structure, which includes the `document_id`, the `minhashes` and + the `document_size` fields. Once all the files have been loaded in a single + dataframe, a `group_by` operation on the `band_hash` field is performed in + that dataframe. 
All the documents that have the same band_hash are grouped + in a cluster. Subsequently, the documents of each cluster are sorted in + descending order according to their size, and a Jaccard similarity is + calculated between the cluster documents. The documents for which the Jaccard + similarity is above the `jaccard_similarity_threshold` remain in the cluster, + the others are removed from the cluster. Finally, from each cluster that has + more than one document after running the Jaccard similarity, we select a doc + to keep (the largest size document), and mark the other documents as + duplicates. The resulting clusters are saved in a file for further analysis. + + Args: + jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. 
+ """ + super().__init__(config) + self.jaccard_similarity_threshold = config.get( + jaccard_similarity_threshold_key, jaccard_similarity_threshold_default + ) + self.logger = get_logger(__name__) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + bands_dataframe = pl.from_arrow(table) + docs2remove_list = [] + # clustering + bands_dataframe_groups = bands_dataframe.group_by("band_hash").agg("document_data") + bands_dataframe_cluster = bands_dataframe_groups.with_columns( + cluster_length=pl.col("document_data").list.len() + ).filter(pl.col("cluster_length") > 1) + self.logger.info(f"file_name = {file_name}") + num_clusters = len(bands_dataframe_cluster) + if num_clusters > 0: + sum_cdocs = bands_dataframe_cluster.select(pl.sum("cluster_length")).item() + max_cdocs = bands_dataframe_cluster.select(pl.max("cluster_length")).item() + min_cdocs = bands_dataframe_cluster.select(pl.min("cluster_length")).item() + avg_cdocs = bands_dataframe_cluster.select(pl.mean("cluster_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + bands_dataframe_response = self.process_bands(bands_dataframe_cluster) + + filtered_doc2remove_dataframe = bands_dataframe_response.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_doc2remove_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_doc2remove_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_doc2remove_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_doc2remove_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_doc2remove_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 
0 + self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + + # Explode the 'docs_to_remove' column + doc2remove_exploded_dataframe = filtered_doc2remove_dataframe.explode("docs_to_remove") + table = doc2remove_exploded_dataframe.to_arrow() + self.logger.info(f"{len(doc2remove_exploded_dataframe)} documents marked to remove") + metadata = {"nrows": len(table)} + return [table], metadata + + def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: + # Define the schema with specific data types + schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} + doc_ids_lists = [] + docs_to_remove_lists = [] + len_of_docs2remove_lists = [] + for row in df.iter_rows(named=True): + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self.jaccard_distance_calculation(row) + doc_ids_lists += doc_ids_list + docs_to_remove_lists += docs_to_remove_list + len_of_docs2remove_lists += len_of_docs2remove_list + processed_rows = pl.DataFrame( + { + "first_doc": doc_ids_lists, + "docs_to_remove": docs_to_remove_lists, + "docs_to_remove_length": len_of_docs2remove_lists, + }, + schema=schema, + ) + return processed_rows + + def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + # Process row and return a new list of Series or a new row + threshold = self.jaccard_similarity_threshold + doc_ids_list = [] + docs_to_remove_list = [] + len_of_docs2remove_list = [] + # sort documents + document_data = row["document_data"] + + # Sort the list by 'document_length' + sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"])) + + # Extracting int_id_column values into a list + doc_list = list(set([item["int_id_column"] for item in sorted_document_data])) + + # Creating a dictionary with int_id_column as key and minhashes as value + doc_minhashes = 
{item["int_id_column"]: item["minhashes"] for item in sorted_document_data} + + while len(doc_list) > 1: + docs_to_remove = [] + new_doc_list = [] + # this is the document we are going to keep + first_doc = doc_list[0] + first_mh = doc_minhashes[first_doc] + for int_id_column in doc_list[1:]: + doc_mh = doc_minhashes[int_id_column] + distance = Murmur_MH.jaccard(np.array(first_mh), np.array(doc_mh)) + if distance >= threshold: + docs_to_remove.append(int_id_column) + else: + new_doc_list.append(int_id_column) + if len(docs_to_remove) > 0: + docs_to_remove = list(set(docs_to_remove)) + doc_ids_list.append(first_doc) + docs_to_remove_list.append(docs_to_remove) + len_of_docs2remove_list.append(len(docs_to_remove)) + doc_list = new_doc_list + + return doc_ids_list, docs_to_remove_list, len_of_docs2remove_list + + +class ClusterAnalysisTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=ClusterAnalysisTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) + """ + parser.add_argument( + f"--{jaccard_similarity_threshold_cli_param}", + type=float, + default=jaccard_similarity_threshold_default, + help="Jaccard similarity threshold above which two documents are duplicates", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. 
+ :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py new file mode 100644 index 000000000..28d96f428 --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import time + +from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(ClusterAnalysisTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py new file mode 100644 index 000000000..4295e4e82 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -0,0 +1,56 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) + # create launcher + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py new file mode 100644 index 000000000..f03b6c1d0 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -0,0 +1,150 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger + + +short_name = "fdclean" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +duplicate_list_location_key = "duplicate_list_location" +""" This key holds the location of the list of duplicate documents marked for removal""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" +""" Location of the list of duplicate documents marked for removal""" + +captured_arg_keys = [ + document_id_column_key, + duplicate_list_location_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +duplicate_list_location_default = None +""" Default location of the list of duplicate documents marked for removal""" + + +class DataCleaningTransform(AbstractTableTransform): + """ + This is the third transform of the fuzzy dedup pipeline. It takes as input + the list of the documents to remove (identified as duplicates during the + cluster analysis phase, and the original dataset. Each dataset file is + imported into a table, and the documents that are in the documents to remove + list are filtered out from that table. 
The output is a new dataset, which + keeps the directory structure of the input dataset, but has all the fuzzy + duplicates removed. + + Args: + duplicate_location: location (local or s3) of the duplicate document list + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. + """ + super().__init__(config) + self.logger = get_logger(__name__) + self.document_id_column = config.get(document_id_column_key, document_id_column_default) + self.duplicate_list_location = config.get(duplicate_list_location_key, duplicate_list_location_default) + contents = config.get("df") + self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) + self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + input_df = pl.from_arrow(table) + # handle the case when the doc_id columns in the input dataframe and the + # docs_to_remove_df have different types, i.e. 
one is int32 and the + # other is int64 + input_doc_id_type = input_df[self.document_id_column].dtype + if input_doc_id_type != self.docs_to_remove_df[self.document_id_column].dtype: + self.docs_to_remove_df = self.docs_to_remove_df.select( + pl.col(self.document_id_column).cast(input_doc_id_type) + ) + filtered_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti") + filtered_table = filtered_df.to_arrow() + metadata = { + "input_files": 1, + "input_docs": table.num_rows, + "input_bytes": table.nbytes, + "output_files": 1, + "output_docs": filtered_table.num_rows, + "output_bytes": filtered_table.nbytes, + "filtered_docs": (table.num_rows - filtered_table.num_rows), + "filtered_bytes": (table.nbytes - filtered_table.nbytes), + } + return [filtered_table], metadata + + +class DataCleaningTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=DataCleaningTransform, + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{duplicate_list_location_cli_param}", + type=str, + required=True, + default=duplicate_list_location_default, + help="location of duplicate document list that are marked for removal", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py new file mode 100644 index 000000000..c0b5fefd6 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -0,0 +1,83 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+from typing import Any
+
+from data_cleaning_transform import DataCleaningTransformConfiguration
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.runtime.pure_python.runtime_configuration import (
+    DefaultPythonTransformRuntime,
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.transform import TransformStatistics
+from data_processing.utils import get_logger
+
+
+logger = get_logger(__name__)
+
+
+class DataCleaningPythonRuntime(DefaultPythonTransformRuntime):
+    """
+    Data cleaning runtime support for Python
+    """
+
+    def __init__(self, params: dict[str, Any]):
+        super().__init__(params=params)
+        self.logger = get_logger(__name__)
+
+    def get_transform_config(
+        self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str]
+    ) -> dict[str, Any]:
+        """
+        Download the table of duplicate document ids that will be provided to the
+        filtering/annotation method. This is the opportunity for this runtime to
+        create a new set of configuration based on the config/params provided to
+        this instance's initializer. Here, the contents of the file found at the
+        location given by the "duplicate_list_location" parameter are added to
+        the configuration under the "df" key, where the transform expects them
+        in its initializer.
+        :param data_access_factory - data access factory class being used by the orchestrator.
+        :param statistics - reference to the statistics object
+        :param files - list of files to process
+        :return: dictionary of transform init params
+        """
+        duplicate_list_location = self.params["duplicate_list_location"]
+        data_access = data_access_factory.create_data_access()
+        if duplicate_list_location.startswith("s3://"):
+            _, duplicate_list_location = duplicate_list_location.split("://")
+        self.duplicate_list, _ = data_access.get_file(duplicate_list_location)
+        return self.params | {"df": self.duplicate_list}
+
+
+class DataCleaningPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    """
+    Implements the PythonTransformConfiguration for fuzzy dedup data cleaning step
+    as required by the PythonTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization: pairs the data cleaning transform configuration
+        (DataCleaningTransformConfiguration) with the runtime class that
+        downloads the list of duplicate documents (DataCleaningPythonRuntime).
+        """
+        super().__init__(
+            transform_config=DataCleaningTransformConfiguration(),
+            runtime_class=DataCleaningPythonRuntime,
+        )
+
+
+if __name__ == "__main__":
+    launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration())
+    logger.info("Launching fuzzy dedup data cleaning transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/python/src/file_copy_util.py b/transforms/universal/fdedup/python/src/file_copy_util.py
new file mode 100644
index 000000000..87867e532
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/file_copy_util.py
@@ -0,0 +1,182 @@
+import argparse
+import io
+import os
+import re
+
+import polars as pl
+from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
+from data_processing.utils import ParamsUtils, get_logger
+
+
+class FileCopyUtil:
+    """
+    Reads all the parquet files inside an input folder of the fuzzy dedup
+    pipeline, concatenates them, and writes the result as a single parquet file:
+    `.../bands/band=b/segment=s/` to `.../bands_consolidated/band_b_segment_s.parquet`
+    `.../{subfolder_name}/` to `.../docs_to_remove_consolidated/docs_to_remove_consolidated.parquet`
+    """
+
+    def __init__(
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        config: dict,
+        stats: dict,
+    ):
+        """
+        :param data_access_factory: factory used to create the data access object
+        :param config: configuration dictionary; only the "root_folder" key is read
+        :param stats: currently unused, kept for interface compatibility
+        """
+        self.data_access_factory = data_access_factory
+        self.root_folder = config.get("root_folder")
+        self.logger = get_logger(__name__, level="INFO")
+
+    def copy_data(self, subfolder_name: str, data_type: str):
+        """
+        Consolidate the parquet files of one subfolder into a single parquet file.
+        :param subfolder_name: `band=b/segment=s` for bands, or the name of the
+            folder holding the docs_to_remove files
+        :param data_type: either "bands" or "docs_to_remove"
+        :return: dictionary of statistics about the files read and the table written
+        :raises ValueError: if subfolder_name or data_type is not valid
+        """
+        self.logger.info(f"copy_data(): subfolder_name = {subfolder_name}, data_type = {data_type}")
+        if self.data_access_factory.s3_config is not None:
+            # drop the URL scheme (e.g. s3://) if there is one
+            root_folder = self.root_folder.split("://", 1)[-1]
+        else:
+            root_folder = self.root_folder
+        self.logger.debug(f"copy_data(): root_folder = {root_folder}")
+        if data_type == "bands":
+            match = re.match(r"^band=(\d+)/segment=(\d+)$", subfolder_name)
+            if match:
+                band = int(match.group(1))
+                segment = int(match.group(2))
+            else:
+                raise ValueError(f"Wrong subfolder_name {subfolder_name}, should be band=b/segment=s")
+            input_folder = os.path.join(
+                root_folder,
+                "bands",
+                f"band={band}",
+                f"segment={segment}/",
+            )
+            output_path = os.path.join(
+                root_folder,
+                "bands_consolidated",
+                f"band_{band}_segment_{segment}.parquet",
+            )
+        elif data_type == "docs_to_remove":
+            input_folder = os.path.join(
+                root_folder,
+                f"{subfolder_name}/",
+            )
+            output_path = os.path.join(
+                root_folder,
+                "docs_to_remove_consolidated",
+                "docs_to_remove_consolidated.parquet",
+            )
+        else:
+            # without this branch input_folder and output_path would be unbound below
+            raise ValueError(f"Wrong data_type {data_type}, should be bands or docs_to_remove")
+        self.logger.debug(f"copy_data(): input_folder = {input_folder}, output_path = {output_path}")
+
+        data_access = self.data_access_factory.create_data_access()
+        self.logger.debug(f"copy_data(): getting the data from the input_folder {input_folder}")
+        file_dict, _ = data_access.get_folder_files(
+            input_folder,
+            extensions=[".parquet"],
+            return_data=True,
+        )
+        self.logger.info(f"Found {len(file_dict)} files in input folder {input_folder}")
+        input_rows = 0
+        consolidated_df = pl.DataFrame()
+        for contents in file_dict.values():
+            df = pl.read_parquet(io.BytesIO(contents))
+            # count the rows before deduplication, to report them in the stats
+            input_rows += len(df)
+            consolidated_df = consolidated_df.vstack(df)
+        if "docs_to_remove" in consolidated_df.columns:
+            consolidated_df = consolidated_df.select("docs_to_remove").unique()
+        output_table = consolidated_df.to_arrow()
+        self.logger.info(
+            f"Writing to {output_path} table with {output_table.num_rows} rows and {output_table.nbytes:,d} bytes"
+        )
+        stats = {
+            "input_files": len(file_dict),
+            "input_bytes": sum(len(v) for v in file_dict.values()),
+            "input_rows": input_rows,
+            "output_files": 1,
+            "output_bytes": output_table.nbytes,
+            "output_rows": output_table.num_rows,
+        }
+        data_access.save_table(output_path, output_table)
+        return stats
+
+
+def _str_to_bool(value: str) -> bool:
+    """
+    Convert a command line string to a boolean. argparse's `type=bool` treats
+    every non-empty string, including "False", as True.
+    """
+    return str(value).strip().lower() in ("1", "true", "t", "yes", "y")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--root_folder",
+        type=str,
+        default=os.getenv("HOME", os.path.join(os.sep)),
+        help="root folder",
+    )
+    parser.add_argument(
+        "--subfolder_name",
+        type=str,
+        default=os.path.join("band=0", "segment=0"),
+        help="subfolder name",
+    )
+    parser.add_argument(
+        "--data_type",
+        type=str,
+        default="docs_to_remove",
+        help="Processing either bands or docs_to_remove",
+    )
+    parser.add_argument(
+        "--use_s3",
+        type=_str_to_bool,
+        default=False,
+        help="use s3",
+    )
+    args = parser.parse_args()
+    root_folder = args.root_folder
+    config = {"root_folder": args.root_folder}
+    data_access_factory: DataAccessFactoryBase = DataAccessFactory()
+    daf_args = []
+    if args.use_s3:
+        s3_creds = {
+            "access_key": os.getenv("AWS_ACCESS_KEY_ID"),
+            "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+            "url": os.getenv("AWS_ENDPOINT_URL"),
+        }
+        s3_config = {
+            "input_folder": root_folder,
+            "output_folder": root_folder,
+        }
+        daf_args.append("--data_s3_cred")
+        daf_args.append(ParamsUtils.convert_to_ast(s3_creds))
+        daf_args.append("--data_s3_config")
+        daf_args.append(ParamsUtils.convert_to_ast(s3_config))
+    else:
+        local_config = {
+            "input_folder": root_folder,
+            "output_folder": root_folder,
+        }
+        daf_args.append("--data_local_config")
+        daf_args.append(ParamsUtils.convert_to_ast(local_config))
+    daf_parser = argparse.ArgumentParser()
+    data_access_factory.add_input_params(parser=daf_parser)
+    data_access_factory_args = daf_parser.parse_args(args=daf_args)
+    data_access_factory.apply_input_params(args=data_access_factory_args)
+    stats = {}
+    fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats)
+    fcu.copy_data(args.subfolder_name, args.data_type)
diff --git a/transforms/universal/fdedup/python/src/service_orchestrator.py b/transforms/universal/fdedup/python/src/service_orchestrator.py
new file mode 100644
index 000000000..897a3210c
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/service_orchestrator.py
@@ -0,0 +1,298 @@
+import argparse
+import os
+import sys
+
+from cluster_analysis_transform_python import (
+    ClusterAnalysisPythonTransformConfiguration,
+)
+from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration
+from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from file_copy_util import FileCopyUtil
+from signature_calc_transform_python import (
+    SignatureCalculationPythonTransformConfiguration,
+)
+
+
+# services implemented as transforms, mapped to their runtime configuration class
+TRANSFORM_SERVICES = {
+    "SignatureCalculation": SignatureCalculationPythonTransformConfiguration,
+    "ClusterAnalysis": ClusterAnalysisPythonTransformConfiguration,
+    "DataCleaning": DataCleaningPythonTransformConfiguration,
+}
+# services implemented with FileCopyUtil, mapped to the data type they consolidate
+FILE_COPY_SERVICES = {
+    "BandsFileCopy": "bands",
+    "DocsToRemoveFileCopy": "docs_to_remove",
+}
+
+
+def _get_s3_credentials() -> dict:
+    """
+    Read the S3 credentials from the environment. A value is None when the
+    corresponding environment variable is not set.
+    """
+    return {
+        "access_key": os.getenv("AWS_ACCESS_KEY_ID"),
+        "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+        "url": os.getenv("AWS_ENDPOINT_URL"),
+    }
+
+
+def _str_to_bool(value: str) -> bool:
+    """
+    Convert a command line string to a boolean. argparse's `type=bool` treats
+    every non-empty string, including "False", as True.
+    """
+    return str(value).strip().lower() in ("1", "true", "t", "yes", "y")
+
+
+class ServiceOrchestrator:
+    """
+    Runs the services (steps) of the fuzzy dedup pipeline in the requested order.
+    """
+
+    def __init__(self, global_params=None):
+        """
+        :param global_params: dictionary of the parsed command line arguments
+        """
+        self.global_params = global_params or {}
+
+    def execute_service(self, service_logic, service_params):
+        """
+        Execute one service.
+        :param service_logic: callable that runs a service from its parameters
+        :param service_params: dictionary of parameters, including "service_type"
+        """
+        service_logic(service_params)
+
+    def orchestrate(self, service_logic):
+        """
+        Run, in order, the services named in the comma-separated "services"
+        global parameter. Unrecognized service names are reported and skipped.
+        :param service_logic: callable that runs a service from its parameters
+        """
+        for service in self.global_params["services"].split(","):
+            service = service.strip()
+            if service in TRANSFORM_SERVICES:
+                params = create_transform_args_payload(self.global_params, service)
+            elif service in FILE_COPY_SERVICES:
+                # work on a copy, so that the global parameters are not modified
+                params = dict(self.global_params)
+            else:
+                print(f"Warning: {service} is not a recognized service.")
+                continue
+            params["service_type"] = service
+            self.execute_service(service_logic, params)
+
+
+def generic_service_logic(params):
+    """
+    Run the service described by params.
+    :param params: dictionary of service parameters, with the "service_type" and
+        "use_s3" keys. The dictionary is modified in place.
+    """
+    print("Service executed with parameters:", params)
+    service_type = params.pop("service_type")
+    use_s3 = params["use_s3"]
+
+    if service_type in TRANSFORM_SERVICES:
+        # NOTE(review): the values dropped here never reach the command line of the
+        # transforms - confirm whether they should instead be forwarded with the
+        # transform-specific prefix (e.g. minhash_num_bands)
+        for key in ("num_permutations", "num_bands", "num_segments", "use_s3"):
+            params.pop(key, None)
+        # Set the simulated command line args
+        sys.argv = ParamsUtils.dict_to_req(d=params)
+        if use_s3:
+            sys.argv.append("--data_s3_cred")
+            sys.argv.append(ParamsUtils.convert_to_ast(_get_s3_credentials()))
+        launch_transform_service(TRANSFORM_SERVICES[service_type]())
+    elif service_type in FILE_COPY_SERVICES:
+        launch_file_copy_service(params, service_type)
+
+
+def launch_transform_service(params):
+    """
+    Launch a transform with the pure Python runtime.
+    :param params: runtime configuration of the transform to launch
+    """
+    launcher = PythonTransformLauncher(runtime_config=params)
+    # Launch python to process the input
+    launcher.launch()
+
+
+def launch_file_copy_service(args, service_type):
+    """
+    Consolidate the parquet files written by a previous step of the pipeline.
+    :param args: dictionary with the root_folder, output_folder and use_s3 keys,
+        and the num_bands and num_segments keys for the BandsFileCopy service
+    :param service_type: either BandsFileCopy or DocsToRemoveFileCopy
+    :raises ValueError: if service_type is not a file copy service
+    """
+    if service_type not in FILE_COPY_SERVICES:
+        raise ValueError(f"{service_type} is not a file copy service")
+    data_type = FILE_COPY_SERVICES[service_type]
+    root_folder = os.path.join(args["root_folder"], args["output_folder"])
+    if service_type == "BandsFileCopy":
+        # Get files to process
+        files = [
+            f"band={band}/segment={segment}"
+            for band in range(args["num_bands"])
+            for segment in range(args["num_segments"])
+        ]
+    else:
+        files = ["docs_to_remove"]
+    config = {"root_folder": root_folder}
+    data_access_factory: DataAccessFactoryBase = DataAccessFactory()
+    daf_args = []
+
+    if args["use_s3"]:
+        s3_config = {
+            "input_folder": root_folder,
+            "output_folder": root_folder,
+        }
+        daf_args.append("--data_s3_cred")
+        daf_args.append(ParamsUtils.convert_to_ast(_get_s3_credentials()))
+        daf_args.append("--data_s3_config")
+        daf_args.append(ParamsUtils.convert_to_ast(s3_config))
+    else:
+        # Construct folders
+        local_config = {
+            "input_folder": root_folder,
+            "output_folder": os.path.abspath(os.path.join(args["root_folder"], args["output_folder"])),
+        }
+        daf_args.append("--data_local_config")
+        daf_args.append(ParamsUtils.convert_to_ast(local_config))
+
+    daf_parser = argparse.ArgumentParser()
+    data_access_factory.add_input_params(parser=daf_parser)
+    data_access_factory_args = daf_parser.parse_args(args=daf_args)
+    data_access_factory.apply_input_params(args=data_access_factory_args)
+    stats = {}
+    fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats)
+    for file in files:
+        fcu.copy_data(file, data_type)
+
+
+def create_transform_args_payload(args, service):
+    """
+    Build the parameters of a transform service from the orchestrator arguments.
+    :param args: dictionary of the parsed command line arguments
+    :param service: SignatureCalculation, ClusterAnalysis or DataCleaning
+    :return: dictionary of parameters for the service
+    """
+    print(args)
+    # Construct folders
+    input_folder = os.path.join(args["root_folder"], args["input_folder"])
+    output_folder = os.path.join(args["root_folder"], args["output_folder"])
+    if service == "ClusterAnalysis":
+        input_folder = os.path.join(args["root_folder"], args["output_folder"], "bands_consolidated")
+        output_folder = os.path.join(args["root_folder"], args["output_folder"], "docs_to_remove")
+    elif service == "DataCleaning":
+        output_folder = os.path.join(args["root_folder"], args["output_folder"], "cleaned")
+        duplicate_location = os.path.join(
+            args["root_folder"],
+            args["output_folder"],
+            "docs_to_remove_consolidated",
+            "docs_to_remove_consolidated.parquet",
+        )
+
+    # Create a local configuration
+    local_conf = {"input_folder": input_folder, "output_folder": output_folder}
+
+    # Create parameters
+    params = {
+        "num_permutations": args["num_permutations"],
+        "num_bands": args["num_bands"],
+        "num_segments": args["num_segments"],
+        "use_s3": args["use_s3"],
+    }
+
+    if args["use_s3"]:
+        params["data_s3_config"] = ParamsUtils.convert_to_ast(local_conf)
+    else:
+        params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+
+    # add extra
+    if service == "DataCleaning":
+        # prefix of the data cleaning command line arguments
+        cli_prefix = "fdclean_"
+        # name of the column storing the unique ID assigned to each document
+        params[f"{cli_prefix}document_id_column"] = "int_id_column"
+        # location of the list of duplicate documents marked for removal
+        params[f"{cli_prefix}duplicate_list_location"] = duplicate_location
+
+    return params
+
+
+def create_file_copy_args_payload(args):
+    """
+    Create a data access factory configured for local access to the root folder.
+    :param args: parsed arguments, read by attribute (args.root_folder)
+    :return: the configured data access factory
+    """
+    daf_args = []
+    local_config = {
+        "input_folder": args.root_folder,
+        "output_folder": args.root_folder,
+    }
+    daf_args.append("--data_local_config")
+    daf_args.append(ParamsUtils.convert_to_ast(local_config))
+    data_access_factory: DataAccessFactoryBase = DataAccessFactory()
+    daf_parser = argparse.ArgumentParser()
+    data_access_factory.add_input_params(parser=daf_parser)
+    data_access_factory_args = daf_parser.parse_args(args=daf_args)
+    data_access_factory.apply_input_params(args=data_access_factory_args)
+    return data_access_factory
+
+
+def parse_args():
+    """
+    Parse the command line arguments of the orchestrator.
+    :return: dictionary of the parsed arguments
+    """
+    parser = argparse.ArgumentParser(description="Service Orchestrator")
+
+    # Define command line arguments
+    parser.add_argument("--root_folder", type=str, required=True, help="Root folder path")
+    parser.add_argument("--input_folder", type=str, required=True, help="Input folder path")
+    parser.add_argument("--output_folder", type=str, required=True, help="Output folder path")
+
+    parser.add_argument(
+        "--contents_column", type=str, default="text", help="Name of the column that holds document text"
+    )
+    parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations")
+    parser.add_argument("--num_bands", type=int, default=14, help="Number of bands")
+    parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band")
+    parser.add_argument("--num_segments", type=int, default=2, help="Number of segments")
+
+    # Single argument for service execution
+    parser.add_argument(
+        "--services",
+        type=str,
+        required=True,
+        help="Comma-separated list of services to run (e.g., SignatureCalculation,BandsFileCopy,ClusterAnalysis,DocsToRemoveFileCopy,DataCleaning)",
+    )
+
+    parser.add_argument(
+        "--use_s3",
+        type=_str_to_bool,
+        default=False,
+        help="use s3",
+    )
+
+    args = parser.parse_args()
+    return vars(args)  # Convert Namespace to dictionary
+
+
+if __name__ == "__main__":
+    # Parse command line arguments
+    args = parse_args()
+
+    # Initialize the orchestrator
+    orchestrator = ServiceOrchestrator(global_params=args)
+
+    # Run the requested services, in order
+    orchestrator.orchestrate(generic_service_logic)
diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py
new file mode 100644
index 000000000..eb958ee3d
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py
@@ -0,0 +1,60 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from signature_calc_transform_python import (
+    SignatureCalculationPythonTransformConfiguration,
+)
+
+
+# Example parameters for a local run (disabled; see the __main__ block below)
+# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1"))
+# output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output_second"))
+# local_conf = {
+#     "input_folder": input_folder,
+#     "output_folder": output_folder
+# }
+# code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+# params = {
+#     # Data access. Only required parameters are specified
+#     "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+#     # execution info
+#     "runtime_pipeline_id": "pipeline_id",
+#     "runtime_job_id": "job_id",
+#     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+#     "minhash_num_permutations":112,
+#     "minhash_num_bands":14,
+#     "minhash_num_segments":2
+# }
+
+
+if __name__ == "__main__":
+    # Uncomment to replace the real command line with the example parameters above
+    # sys.argv = ParamsUtils.dict_to_req(d=params)
+    # print(sys.argv)
+    # Always append the S3 credentials read from the environment (None when unset)
+    sys.argv.append("--data_s3_cred")
+    s3_creds = {
+        "access_key": os.getenv("AWS_ACCESS_KEY_ID"),
+        "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+        "url": os.getenv("AWS_ENDPOINT_URL"),
+    }
+    sys.argv.append(ParamsUtils.convert_to_ast(s3_creds))
+
+    # Create the launcher for the signature calculation transform
+    launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration())
+    # Launch python to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py
new file mode 100644
index 000000000..7ac8eb057
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py
@@ -0,0
+1,504 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Any, List + +import mmh3 +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider +from Murmur_MH import Murmur_MH +from scipy.integrate import quad as integrate + + +short_name = "minhash" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +contents_column_key = "contents_column" +""" This key holds the name of the column storing the contents of each document""" +seed_key = "seed" +""" This key holds the seed used to instantiate the random number generator""" +num_permutations_key = "num_permutations" +""" This key holds the number of permutations that determine how many minhashes to calculate for each document""" +num_bands_key = "num_bands" +""" This key holds the number of bands to use in the banding technique""" +num_minhashes_per_band_key = "num_minhashes_per_band" +""" This key holds the number of minhashes to use in each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key 
holds the Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_key = "word_shingle_size" +""" This key holds the size of the word shingles calculated for each document""" +num_segments_key = "num_segments" +""" This key holds the number of segments across which we divide the hashing space for each band""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +contents_column_cli_param = f"{cli_prefix}{contents_column_key}" +""" Name of the column storing the contents of each document""" +seed_cli_param = f"{cli_prefix}{seed_key}" +""" The seed used to instantiate the random number generator""" +num_permutations_cli_param = f"{cli_prefix}{num_permutations_key}" +""" Number of permutations that determine how many minhashes to calculate for each document""" +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands to use in the banding technique""" +num_minhashes_per_band_cli_param = f"{cli_prefix}{num_minhashes_per_band_key}" +""" The number of minhashes to use in each band""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_cli_param = f"{cli_prefix}{word_shingle_size_key}" +""" The size of the word shingles calculated for each document""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments across which we divide the hashing space for each band""" + +captured_arg_keys = [ + document_id_column_key, + contents_column_key, + seed_key, + num_bands_key, + num_minhashes_per_band_key, + num_permutations_key, + jaccard_similarity_threshold_key, + word_shingle_size_key, + num_segments_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each 
document""" +contents_column_default = "contents" +""" Default name of the column storing the contents of each document""" +seed_default = 42 +""" Default seed used to instantiate the random number generator""" +num_permutations_default = 112 +""" Default number of minhashes used for each document (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_bands_default = 14 +""" Default number of bands to use in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_minhashes_per_band_default = 8 +""" Default number of minhashes to use in each band (from FineWeb https://arxiv.org/pdf/2406.17557)""" +word_shingle_size_default = 5 +""" Default size of the word shingles (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments across which we divide the hashing space for each band""" + + +def _optimal_minhashlsh_param( + threshold: float = jaccard_similarity_threshold_default, + num_perm: int = num_permutations_default, + false_positive_weight: float = 0.5, + false_negative_weight: float = 0.5, +): + """ + Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum + of probabilities of false positive and false negative. 
+ :param threshold: desired similarity threshold + :param num_perm: number of permutations + :param false_positive_weight: importance of avoiding false positive results + :param false_negative_weight: importance of avoiding false negative results + :return: a tuple (optimal number of bands, optimal number of rows) + """ + + def _false_positive_probability(threshold, b, r): + _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) + a, err = integrate(_probability, 0.0, threshold) + return a + + def _false_negative_probability(threshold, b, r): + _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) + a, err = integrate(_probability, threshold, 1.0) + return a + + min_error = float("inf") + opt = (0, 0) + for b in range(1, num_perm + 1): + max_r = int(num_perm / b) + for r in range(1, max_r + 1): + fp = _false_positive_probability(threshold, b, r) + fn = _false_negative_probability(threshold, b, r) + error = fp * false_positive_weight + fn * false_negative_weight + if error < min_error: + min_error = error + opt = (b, r) + return opt + + +class SignatureCalculationTransform(AbstractTableTransform): + """ + This is the first transform of the fuzzy dedup pipeline. First, it calculates, + for each document in a dataset, `num_permutations` minhashes. It accepts as + input the number of bands and the length of each band. If those two parameters + are not specified, then, based on the values of `jaccard_similarity_threshold` + and `num_permutations`, it determines the optimal number of bands, and the + length of each band (how many minhashes will be used to get the signature for + each band). The band signatures, the minhashes and the document lengths are + then saved in the output folder, under a folder structure `bands/band=b/segment=s`. + To improve scalability of the next step of fuzzy dedup, the hash space of + each band is divided into `num_segments` segments. 
+ + Args: + document_id_column: name of the column storing the unique ID assigned to each document + contents_column_cli_param: name of the column storing the contents of each document + seed: the seed used to instantiate the random number generator + num_permutations: number of minhashes to calculate for each document + num_bands: number of bands to use for banding technique + num_minhashes_per_band: number of minhashes to use in each band + jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + word_shingle_size: the size of the word shingles calculated for each document + num_segments the number of segments across which we divide the hashing space for each band + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, SignatureCalculationTransformRuntime. If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. 
+ """ + super().__init__(config) + self.document_id_column = config.get(document_id_column_key, document_id_column_default) + self.contents_column = config.get(contents_column_key, contents_column_default) + self.seed = config.get(seed_key, seed_default) + self.num_permutations = config.get(num_permutations_key, num_permutations_default) + self.jaccard_similarity_threshold = config.get( + jaccard_similarity_threshold_key, jaccard_similarity_threshold_default + ) + self.word_shingle_size = config.get(word_shingle_size_key, word_shingle_size_default) + self.num_segments = config.get(num_segments_key, num_segments_default) + self.num_bands = config.get(num_bands_key, num_bands_default) + self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) + # Calculate optimal parameters for bands calculation + # self.num_bands, self.num_rows = _optimal_minhashlsh_param( + # threshold=self.jaccard_similarity_threshold, + # num_perm=self.num_permutations, + # false_positive_weight=0.5, + # false_negative_weight=0.5, + # ) + # use this dataframe to store the minhashes and size for each document + self.all_minhashes: pl.DataFrame = None + # use this dataframe to store the band hashes for each document + self.all_band_hashes: pl.DataFrame = None + # this variable keeps track of how many files were processed since last + # data write to properly update metadata + self.files_processed = 0 + self.bytes_processed = 0 + self.data_access = config.get("data_access") + self.last_file_name = None + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + """ + Put Transform-specific to convert one Table to 0 or more tables. It also returns + a dictionary of execution statistics - arbitrary dictionary + This implementation makes no modifications so effectively implements a copy of the + input parquet to the output folder, without modification. 
def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
    """
    Calculate minhashes and band hashes for every document in ``table`` and buffer them.

    The results are appended to ``self.all_minhashes`` and ``self.all_band_hashes``.
    Once more than 750000 documents are buffered, the buffers are handed to
    ``write_band_signatures()``; otherwise nothing is emitted for this call.

    :param table: input table; only the contents and document id columns are read
    :param file_name: name of the file the table was read from; remembered in
        ``self.last_file_name``
    :return: whatever ``write_band_signatures()`` returned if the buffers were
        written during this call, otherwise an empty list and an empty dictionary
    """
    self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}")
    self.logger.debug("----minhash---")
    self.last_file_name = file_name
    self.files_processed += 1
    self.bytes_processed += table.nbytes
    # instantiate with same seed so every worker use same hash functions
    mm_min_hash = Murmur_MH(num_perm=self.num_permutations, seed=self.seed)

    # load the data from the pyarrow table and keep only the target columns;
    # the column order (contents, document id) is what _generate_word_shingles() expects
    df = pl.from_arrow(table).select(self.contents_column, self.document_id_column)

    # generate minhash values: each row is turned into (shingles, doc length, doc id)
    # and those three values are passed on to the minhash calculation
    minhashes = df.map_rows(
        lambda text: mm_min_hash.minhash2_nosalt(
            *self._generate_word_shingles(text, window_size=self.word_shingle_size)
        )
    )
    # rename columns, cast minhashes to list(uint32)
    minhashes = minhashes.select(
        pl.col("column_2").alias(self.document_id_column),
        pl.col("column_0").cast(pl.List(pl.UInt32)).alias("minhashes"),
        pl.col("column_1").alias("document_length"),
    )
    # store the minhash calculations to send out at the end of execution
    if self.all_minhashes is None:
        self.all_minhashes = minhashes
    else:
        self.all_minhashes = self.all_minhashes.vstack(minhashes)

    # Calculate band hashes
    band_hashes_list = self.process_rows_into_bands(
        minhashes,
        self.num_bands,
        self.num_rows,
    )
    band_hash_schema = pl.Schema(
        {
            "band_hash": pl.UInt64,
            "band_index": pl.Int32,
            self.document_id_column: pl.Int64,
        }
    )
    # band_hashes_list holds one (band_hash, band_index, document_id) tuple per row;
    # state the orientation explicitly instead of letting polars guess it
    band_hashes = pl.DataFrame(band_hashes_list, schema=band_hash_schema, orient="row")

    # store the band hash calculations to send out at the end of execution
    if self.all_band_hashes is None:
        self.all_band_hashes = band_hashes
    else:
        self.all_band_hashes = self.all_band_hashes.vstack(band_hashes)

    # write the buffers out once enough documents have accumulated
    if len(self.all_minhashes) > 750000:
        return self.write_band_signatures()
    return [], {}
def flush(self) -> tuple[list[pa.Table], dict[str, Any]]:
    """
    Write out any minhashes and band hashes that are still buffered.

    If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
    :return: the result of ``write_band_signatures()`` when both buffers hold data,
        otherwise an empty list and an empty dictionary
    """
    self.logger.info("Starting flush()")
    if self.all_band_hashes is None or self.all_minhashes is None:
        return [], {}
    return self.write_band_signatures()


def write_band_signatures(self):
    """
    Split the buffered band hashes by band and segment, join them with the
    buffered minhashes and save one table per (band, segment) pair.

    Each table is saved through ``self.data_access.save_table`` under
    ``<output_folder>/bands/band=<b>/segment=<s>/<path of the last input file
    relative to the input folder>``. Afterwards the buffers and the file/byte
    counters are reset.

    :return: an empty list of tables and a dictionary of statistics
    """
    # Define the lower and upper bounds of each band segment. The arithmetic is
    # done with Python integers so the result is exact and does not depend on how
    # the installed NumPy promotes uint64 combined with a Python int.
    upper_bound = int(np.iinfo(np.uint64).max)
    segment_len = upper_bound // self.num_segments
    segment_bounds = np.array(
        [segment_index * segment_len for segment_index in range(self.num_segments)] + [upper_bound],
        dtype=np.uint64,
    )
    self.logger.debug(f"Calculated {len(segment_bounds)} segment_bounds")
    # output stats for the metadata
    num_tables_written = 0
    num_docs_written = 0
    num_bytes_written = 0
    self.logger.debug(f"dataframe self.all_band_hashes has {len(self.all_band_hashes)} rows")
    self.logger.debug(f"dataframe self.all_minhashes has {len(self.all_minhashes)} rows")
    # the output file keeps the path of the last input file relative to the input
    # folder; this does not change between bands/segments, so compute it once
    suffix_path = Path(self.last_file_name).relative_to(self.data_access.input_folder)
    # iterate through the bands, get the band hashes for each band, divide
    # them into segments, join with minhashes, and upload to storage
    for band_ix in range(self.num_bands):
        # Filtering on, then dropping the `band_index` column
        band_df = self.all_band_hashes.filter(pl.col("band_index") == band_ix).drop("band_index")
        # assign each band hash to a segment of the hashing space
        self.logger.debug(f"band {band_ix} band_df has {len(band_df)} rows")
        for segment_index in range(self.num_segments):
            lower_bound = segment_bounds[segment_index]
            # segments are (lower, upper] intervals, except the first one, which
            # also includes its lower bound so that a band hash of 0 is not lost
            if segment_index == 0:
                above_lower = pl.col("band_hash") >= lower_bound
            else:
                above_lower = pl.col("band_hash") > lower_bound
            segment_band_df = band_df.filter(
                above_lower & (pl.col("band_hash") <= segment_bounds[segment_index + 1])
            )
            self.logger.debug(
                f"band {band_ix} segment {segment_index} segment_band_df has {len(segment_band_df)} rows"
            )
            # join the band hash dataframe with the minihash and doc length dataframe
            segment_band_minhash_df = segment_band_df.join(
                self.all_minhashes,
                on=self.document_id_column,
                how="inner",
            )
            self.logger.debug(f"band {band_ix} segment {segment_index} joined segment_band_df and minhashes")

            # encapsulate document info in a structure
            segment_band_minhash_df = segment_band_minhash_df.select(
                pl.col("band_hash"),
                pl.struct(
                    [
                        pl.col(self.document_id_column),
                        pl.col("minhashes"),
                        pl.col("document_length"),
                    ]
                ).alias("document_data"),
            )
            self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure")

            # save the table and account for it in the statistics
            save_path = os.path.join(
                self.data_access.output_folder,
                "bands",
                f"band={band_ix}",
                f"segment={segment_index}",
                suffix_path,
            )
            segment_band_minhash_table = segment_band_minhash_df.to_arrow()
            bytes_written, _, _ = self.data_access.save_table(save_path, segment_band_minhash_table)
            if bytes_written > 0:
                num_tables_written += 1
                num_docs_written += segment_band_minhash_table.num_rows
                num_bytes_written += bytes_written
                self.logger.debug(f"Uploaded table for band {band_ix} and segment {segment_index}")
    # add the stats to metadata
    metadata = {
        "input_files": self.files_processed,
        "input_docs": len(self.all_minhashes),
        "input_bytes": self.bytes_processed,
        "output_files": num_tables_written,
        "output_docs": num_docs_written,
        "output_bytes": num_bytes_written,
    }
    self.logger.info(f"Wrote {num_tables_written} tables with a total size of {num_bytes_written:,d} bytes")
    # start a new accumulation cycle
    self.files_processed = 0
    self.bytes_processed = 0
    self.all_minhashes = None
    self.all_band_hashes = None
    return [], metadata
metadata + metadata = { + "input_files": self.files_processed, + "input_docs": len(self.all_minhashes), + "input_bytes": self.bytes_processed, + "output_files": num_tables_written, + "output_docs": num_docs_written, + "output_bytes": num_bytes_written, + } + self.logger.info(f"Wrote {num_tables_written} tables with a total size of {num_bytes_written:,d} bytes") + self.files_processed = 0 + self.bytes_processed = 0 + self.all_minhashes = None + self.all_band_hashes = None + return [], metadata + + # define shingles generation function + def _generate_word_shingles(self, text: str, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]: + words = text[0].split() + document_id = text[1] + doc_len = len(text[0]) + word_count = len(words) + k_shingles = [] + for i in range(0, max(1, word_count - window_size + 1)): + k_shingles.append(delimiter.join(words[i : i + window_size])) + return k_shingles, doc_len, document_id + + def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b: int, r: int, seed: int = 42): + num_minhashes = len(minhashes) + assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" + results = [] + for band_index in range(b): + band_hash, _ = mmh3.hash64( + minhashes[band_index * r : (band_index + 1) * r], + seed=seed, + signed=False, + ) + results.append((band_hash, band_index, int_id_column)) + return results + + # Apply the function + def process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + result = [] + for row in df.iter_rows(): + bands = self.emit_bands( + row[0], # document id + np.array(row[1], dtype=np.uint32), # minhashes + row[2], # document length + minhashlsh_num_bands, + minhashlsh_length_band, + ) + for band in bands: + result.append(band) + return result + + +class SignatureCalculationTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class 
class SignatureCalculationTransformConfiguration(TransformConfiguration):
    """
    Provides support for configuring and using the associated Transform class,
    including configuration with CLI args.
    """

    def __init__(self):
        super().__init__(
            name=short_name,
            transform_class=SignatureCalculationTransform,
            remove_from_metadata=[],
        )
        from data_processing.utils import get_logger

        self.logger = get_logger(__name__, level="INFO")

    def add_input_params(self, parser: ArgumentParser) -> None:
        """
        Add Transform-specific arguments to the given parser.
        The parsed values end up in the dictionary used to initialize the
        SignatureCalculationTransform.
        By convention a common prefix should be used for all transform-specific CLI args
        (e.g, noop_, pii_, etc.)
        """
        parser.add_argument(
            f"--{document_id_column_cli_param}",
            type=str,
            default=document_id_column_default,
            help="name of the column storing the unique ID assigned to each document",
        )
        parser.add_argument(
            f"--{contents_column_cli_param}",
            type=str,
            default=contents_column_default,
            help="name of the column storing the contents of each document",
        )
        parser.add_argument(
            f"--{seed_cli_param}",
            type=int,
            default=seed_default,
            help="the seed used to instantiate the random number generator",
        )
        parser.add_argument(
            f"--{num_permutations_cli_param}",
            type=int,
            default=num_permutations_default,
            help="number of permutations (minhashes) calculated for each document",
        )
        # A Jaccard similarity is a fraction between 0 and 1, so it must be parsed
        # as a float: with type=int a value such as 0.8 is rejected by argparse.
        parser.add_argument(
            f"--{jaccard_similarity_threshold_cli_param}",
            type=float,
            default=jaccard_similarity_threshold_default,
            help="Jaccard similarity threshold above which two documents are duplicates",
        )
        parser.add_argument(
            f"--{word_shingle_size_cli_param}",
            type=int,
            default=word_shingle_size_default,
            help="the size of the word shingles calculated for each document",
        )
        parser.add_argument(
            f"--{num_bands_cli_param}",
            type=int,
            default=num_bands_default,
            help="the number of bands to use in the banding technique",
        )
        parser.add_argument(
            f"--{num_minhashes_per_band_cli_param}",
            type=int,
            default=num_minhashes_per_band_default,
            help="the number of minhashes to use in each band",
        )
        parser.add_argument(
            f"--{num_segments_cli_param}",
            type=int,
            default=num_segments_default,
            help="the number of segments across which we divide the hashing space for each band",
        )

    def apply_input_params(self, args: Namespace) -> bool:
        """
        Capture the transform-specific arguments that have been parsed and merge
        them into ``self.params``.
        :param args: user defined arguments.
        :return: always True; no validation is performed here
        """
        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
        self.params = self.params | captured
        self.logger.info(f"{short_name} parameters are : {self.params}")
        return True
logger = get_logger(__name__)


class SignatureCalculationPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
    """
    Implements the PythonTransformRuntimeConfiguration for the fuzzy dedup signature
    calculation transform, as required by the PythonTransformLauncher. It wraps
    the SignatureCalculationTransformConfiguration and adds nothing else.
    """

    def __init__(self):
        """
        Initialization: hands a SignatureCalculationTransformConfiguration to the base class.
        """
        super().__init__(transform_config=SignatureCalculationTransformConfiguration())


if __name__ == "__main__":
    # The launcher needs the runtime configuration (which wraps the transform
    # configuration), not the bare transform configuration.
    launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration())
    logger.info("Launching fuzzy dedup signature calculation transform")
    launcher.launch()
.../fdedup/spark/src/fuzzy_dedup_spark.py | 205 ++++++++++++++ .../fdedup/spark/src/requirements.txt | 8 + .../fdedup/spark/src/signature_calc_spark.py | 35 +++ .../src/signature_calc_transform_spark.py | 42 +++ 16 files changed, 1043 insertions(+) create mode 100644 transforms/universal/fdedup/spark/Dockerfile create mode 100644 transforms/universal/fdedup/spark/Makefile create mode 100644 transforms/universal/fdedup/spark/README.md create mode 100644 transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml create mode 100644 transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml create mode 100644 transforms/universal/fdedup/spark/pyproject.toml create mode 100644 transforms/universal/fdedup/spark/requirements.txt create mode 100644 transforms/universal/fdedup/spark/src/cluster_analysis_spark.py create mode 100644 transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/src/data_cleaning_spark.py create mode 100644 transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/src/file_copy_util_spark.py create mode 100644 transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py create mode 100644 transforms/universal/fdedup/spark/src/requirements.txt create mode 100644 transforms/universal/fdedup/spark/src/signature_calc_spark.py create mode 100644 transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile new file mode 100644 index 000000000..523b94c06 --- /dev/null +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -0,0 +1,54 @@ +ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 + +FROM ${BASE_IMAGE} + +# USER root +# install pytest +RUN pip install --no-cache-dir pytest + +WORKDIR ${SPARK_HOME}/work-dir + +# Copy in the data processing framework source/project and 
install it +# This is expected to be placed in the docker context before this is run (see the make image). +COPY --chown=spark:root data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . +COPY --chown=spark:root data-processing-lib-spark/ data-processing-lib-spark/ +RUN cd data-processing-lib-spark && pip install --no-cache-dir -e . +COPY --chown=spark:root python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . + +# Install project source +COPY --chown=spark:root src/ src/ +COPY --chown=spark:root pyproject.toml pyproject.toml +RUN mkdir -p /opt/spark/work-dir/src/templates && \ + mkdir -p /opt/spark/work-dir/config + +# install requirements from requirements.txt +COPY requirements.txt . +RUN pip3 install -r requirements.txt + +COPY deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ + +RUN pip install --no-cache-dir -e . + +# copy the main() entry point to the image +COPY ./src/signature_calc_spark.py . + +# copy some of the samples in +# COPY src/filter_local_spark.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +USER spark + +# Set environment +ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile new file mode 100644 index 000000000..d30013da8 --- /dev/null +++ b/transforms/universal/fdedup/spark/Makefile @@ -0,0 +1,45 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. 
+# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. +include $(REPOROOT)/transforms/.make.transforms + +# This is included in the image name, if defined +TRANSFORM_NAME=fd-sig-calc + +DOCKER_IMAGE_NAME=pyspark-base +DOCKER_IMAGE_VERSION=latest +DOCKER_FILE=Dockerfile +REGISTRY_HOST=docker.io +REGISTRY_PATH= +DOCKER=docker +PYTHON=python + +venv: requirements.txt + @# Help: Create the virtual environment using requirements.txt + $(PYTHON) -m venv venv + @source venv/bin/activate; \ + pip install --upgrade pip; \ + pip install wheel; \ + pip install -r requirements.txt; + +image:: .transforms.spark-image + +image-direct: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. + @# Help: Create the docker image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + $(DOCKER) build -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -f $(DOCKER_FILE) . + +publish-docker: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. 
+ @# Help: Publish image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) to $(REGISTRY_HOST) container registry + $(DOCKER) logout $(REGISTRY_HOST) + $(DOCKER) login $(REGISTRY_HOST) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)' + $(DOCKER) push $(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + +publish-ibm: + ibmcloud login -q -u "$(IBM_CLOUD_USER)" -apikey "$(IBM_CLOUD_API_KEY)" + ibmcloud cr login --client docker + $(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + $(DOCKER) push $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + # ibmcloud cr image-list | grep $(DOCKER_IMAGE_NAME) diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md new file mode 100644 index 000000000..3bf9b3245 --- /dev/null +++ b/transforms/universal/fdedup/spark/README.md @@ -0,0 +1,109 @@ +# Spark-GUF + +This is an implementation of Spark data processing modules. At a high level, every Spark application consists of a driver program that runs the user’s main function and executes various parallel operations on a cluster. + +The modules can run locally or remotely in a Kubernetes cluster. + +## Running Transforms locally + +Start in the `spark-guf` directory. To run the modules locally, follow these steps: +1. Create a virtual environment using this command + ``` + make venv + ``` +2. Activate the virtual environment: + ``` + source venv/bin/activate + ``` + +3. Set the `PYTHONPATH` environment variable to include the `src` directory: + ``` + export PYTHONPATH=${PYTHONPATH}:${PWD}/src + ``` +4. Invoke one of the transforms: + ``` + python src/transforms/spark_pi/spark_transformer_pi.py + ``` +5. 
To find out which arguments a transform takes, run that transform with a `--help` flag: + ``` + python src/transforms/spark_filter/spark_filter_transform.py --help + usage: spark_filter_transform.py [-h] --input_folder INPUT_FOLDER --output_folder OUTPUT_FOLDER [--data_type DATA_TYPE] + --filter_criteria_list FILTER_CRITERIA_LIST [--filter_columns_to_drop FILTER_COLUMNS_TO_DROP] + [--filter_logical_operator {AND,OR}] + + optional arguments: + -h, --help show this help message and exit + --input_folder INPUT_FOLDER + path to read the input files (local fs or s3) + --output_folder OUTPUT_FOLDER + path to write the output files (local fs or s3) + --data_type DATA_TYPE + Type of files to filter (parquet, orc, csv, json, txt) + --filter_criteria_list FILTER_CRITERIA_LIST + list of filter criteria (in SQL WHERE clause format), for example: [ "docq_total_words > 100 AND docq_total_words < 200", "docq_perplex_score < 230", "date_acquired BETWEEN '2023-07-04' + AND '2023-07-08'", "title LIKE 'https://%'", "document_id IN ('doc-id-1', 'doc-id-2', 'doc-id-3')" ] + --filter_columns_to_drop FILTER_COLUMNS_TO_DROP + list of columns to drop after filtering, for example: ["column1", "column2"] + --filter_logical_operator {AND,OR} + logical operator (AND or OR) that joins filter criteria + ``` + +## Running Transforms in Kubernetes/OpenShift + +Start in the `spark-guf` directory. To run the transforms in a Kubernetes or OpenShift cluster, follow these steps: + +1. Build and push a pyspark base docker image (this example assumes that images are pushed to the Docker hub, but the same approach can be used to push images to icr.io or quay.io): + ``` + docker build -t my-docker-username/my-pyspark:3.5.1 . + docker push my-docker-username/my-pyspark:3.5.1 + ``` +2. 
Build and push a specific transform image (this will use the pyspark built in the previous point as the base image): + ``` + docker build -t my-docker-username/my-pyspark-filter:3.5.1 -f src/transforms/spark_filter/Dockerfile --build-arg BASE_IMAGE=my-docker-username/my-pyspark:3.5.1 . + docker push my-docker-username/my-pyspark-filter:3.5.1 + ``` + +3. Configure the `spark` service account (note that you can use any other service account name, but you will need then to replace `spark` with `your-service-account-name` in all the yaml files listed below). This is a one-time process to perform for each namespace where you want to run spark apps: + ``` + # create 'spark' service account + kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-serviceaccount.yaml --namespace=my-namespace + + # create 'spark' role + kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role.yaml --namespace=my-namespace + + # bind the 'spark' service account to the 'spark' role + kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role-binding.yaml --namespace=my-namespace + + # bind the 'spark' service account to the cluster roles + kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-edit-role-binding.yaml --namespace=my-namespace + kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-cluster-role-binding.yaml --namespace=my-namespace + ``` + + 4. Create any secrets that are needed to access S3 folders used for input or output of the transforms. Follow [this link](https://github.com/aws-samples/machine-learning-using-k8s/blob/master/docs/aws-creds-secret.md) for more information on how to build the S3 secrets. + + 5. Edit a pod yaml file from the `deployment/kubernetes/pods` directory. The steps below refer to the [yaml file used to build the filter pod] (deployment/kubernetes/pods/spark-driver-pod-filter.yaml): + 1. 
Give a name to the pod (`metadata/name`), the container launched inside the pod (`spec/containers/name`), and the Spark application (the `APP_NAME` variable in `spec/containers/env`). + 2. Specify the namespace where the pod will be created (`metadata/namespace`). Use the same namespace for the `EXECUTOR_NAMESPACE` variable in `spec/containers/env`) + 3. Specify the command to launch the Spark application (in `spec/containers/args`) + 4. Specify the image used by the driver (`spec/containers/image` - usually this is the transform image built under point 2). + 5. Specify the image used by the executors (`EXECUTOR_DOCKER_IMAGE` variable in `spec/containers/env`) + 6. Specify the service account to use by the driver (`spec/containers/serviceAccount`) and by the executors(the `SERVICE_ACCOUNT` variable in `spec/containers/env`) + 7. Configure S3: + 1. Specify the input (`AWS_ENDPOINT_URL_IN`) and output (`AWS_ENDPOINT_URL_OUT`) endpoint URLs. + 2. Specify the input and out access key ids and secret access keys. + +6. Launch the Spark application by creating the driver pod: + ``` + kubectl apply -f deployment/kubernetes/pod/spark-driver-pod-filter.yaml + ``` + +7. Monitor the creation of the executor pods: + ``` + kubectl get pods -w + ``` + +8. 
Monitor the driver logs: + ``` + kubectl logs spark-driver-pod-filter -f + ``` + ``` diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml new file mode 100644 index 000000000..d9579e0c7 --- /dev/null +++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Pod +metadata: +spec: + imagePullSecrets: + - name: prod-all-icr-io + securityContext: + fsGroup: 0 diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml new file mode 100644 index 000000000..eeddbd694 --- /dev/null +++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml @@ -0,0 +1,14 @@ +spark.app.name: ${APP_NAME} +spark.driver.memory: ${DRIVER_MEMORY} +spark.executor.instances: ${NUM_EXECUTORS} +spark.executor.memory: ${EXECUTOR_MEMORY} +spark.executor.cores: ${EXECUTOR_CORES} +spark.sql.shuffle.partitions: ${NUM_TASKS} +spark.task.cpus: ${TASK_CPUS} +spark.sql.legacy.parquet.nanosAsLong: true +spark.executor.decommission.forceKillTimeout: "10h" +# spark.sql.files.ignoreCorruptFiles: true +# configuration needed when running in kubernetes +spark.kubernetes.authenticate.driver.serviceAccountName: ${SERVICE_ACCOUNT} +spark.kubernetes.container.image: ${EXECUTOR_DOCKER_IMAGE} +spark.kubernetes.namespace: ${EXECUTOR_NAMESPACE} diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml new file mode 100644 index 000000000..dcf1f48e2 --- /dev/null +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -0,0 +1,42 @@ +[project] +name = "dpk_fdedup_transform_spark" +version = "0.3.0.dev0" +requires-python = ">=3.10" +description = "Fuzzy Dedup Spark Transform" +license = {text = "Apache-2.0"} +readme = {file = "README.md", 
content-type = "text/markdown"} +authors = [ + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, +] +dependencies = [ + "dpk_fdedup_transform_python==0.3.0.dev0", + "data-prep-toolkit-spark==0.2.2.dev0", +] + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", + "moto==5.0.5", + "markupsafe==2.0.1", +] + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt new file mode 100644 index 000000000..10f3e129b --- /dev/null +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -0,0 +1,10 @@ +pyarrow +pyyaml +boto3 +kubernetes +polars +disjoint-set +scipy +numpy +sentencepiece +mmh3 diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py new file mode 100644 index 000000000..83498f59e --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
if __name__ == "__main__":
    # S3 credentials are taken from the environment; a variable that is not set
    # yields None for its entry
    credential_sources = {
        "access_key": "AWS_ACCESS_KEY_ID",
        "secret_key": "AWS_SECRET_ACCESS_KEY",
        "url": "AWS_ENDPOINT_URL",
    }
    s3_credentials = {name: os.getenv(variable) for name, variable in credential_sources.items()}
    # pass the credentials to the launcher as an extra command line argument
    sys.argv.extend(["--data_s3_cred", ParamsUtils.convert_to_ast(s3_credentials)])
    # create the launcher and launch the spark worker(s) to process the input
    SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()).launch()
logger = get_logger(__name__)


class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
    """
    Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis
    as required by the SparkTransformLauncher. It wraps the
    ClusterAnalysisTransformConfiguration and adds nothing else.
    """

    def __init__(self):
        """
        Initialization: hands a ClusterAnalysisTransformConfiguration to the base class.
        """
        super().__init__(transform_config=ClusterAnalysisTransformConfiguration())


if __name__ == "__main__":
    # create launcher
    launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration())
    # this module launches cluster analysis, so the message must say so
    logger.info("Launching fuzzy dedup cluster analysis transform")
    # Launch the spark worker(s) to process the input
    launcher.launch()
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +if __name__ == "__main__": + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py new file mode 100644 index 000000000..03976bac8 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -0,0 +1,102 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +from data_cleaning_transform import DataCleaningTransformConfiguration +from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class DataCleaningSparkRuntime(DefaultSparkTransformRuntime): + """ + Data cleaning runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. 
+ :param statistics - reference to statistics actor + :param files - list of files to process + :return: dictionary of transform init params + """ + duplicate_list_location = self.params["duplicate_list_location"] + data_access = data_access_factory.create_data_access() + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Data Cleaning + as required by the SparkTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningSparkRuntime, + ) + + def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. 
+ :return: dictionary of parameters to be broadcast + """ + duplicate_list_location = self.transform_config.params["duplicate_list_location"] + data_access = data_access_factory.create_data_access() + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + return {"df": self.duplicate_list} + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py new file mode 100644 index 000000000..58a43a736 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py @@ -0,0 +1,261 @@ +import argparse +import os +import socket +import time +import traceback +from datetime import datetime + +import polars as pl +import yaml +from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase +from data_processing.utils import ParamsUtils, get_logger +from file_copy_util import FileCopyUtil +from pyspark.sql import SparkSession + + +logger = get_logger(__name__) + + +class FileCopySpark: + def __init__(self, root_folder: str, num_bands: int, num_segments: int, use_s3: bool): + self.root_folder = root_folder + self.num_bands = num_bands + self.num_segments = num_segments + self.use_s3 = use_s3 + self.subdirs = [f"band={b}/segment={s}" for b in range(num_bands) for s in range(num_segments)] + + def _init_spark(self, app_name: str = "copy-app") -> SparkSession: + server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1")) + if server_port_https == -1: + # we are running locally + spark_config = {"spark.driver.host": "127.0.0.1"} + return 
SparkSession.builder.appName(app_name).config(map=spark_config).getOrCreate() + else: + # we are running in Kubernetes, use spark_profile.yml and + # environment variables for configuration + + server_port = os.environ["KUBERNETES_SERVICE_PORT"] + master_url = f"k8s://https://kubernetes.default:{server_port}" + + # Read Spark configuration profile + config_filepath = os.path.abspath( + os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml") + ) + with open(config_filepath, "r") as config_fp: + spark_config = yaml.safe_load(os.path.expandvars(config_fp.read())) + spark_config["spark.submit.deployMode"] = "client" + + # configure the executor pods from template + executor_pod_template_file = os.path.join( + os.getenv("SPARK_HOME"), + "work-dir", + "src", + "templates", + "spark-executor-pod-template.yml", + ) + spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file + spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always" + + # Pass the driver IP address to the workers for callback + myservice_url = socket.gethostbyname(socket.gethostname()) + spark_config["spark.driver.host"] = myservice_url + spark_config["spark.driver.bindAddress"] = "0.0.0.0" + + spark_config["spark.decommission.enabled"] = True + logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}") + app_name = spark_config.get("spark.app.name", "my-spark-app") + return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate() + + def create_data_access_factory(self, root_folder: str, use_s3: bool) -> DataAccessFactoryBase: + input_folder = root_folder + output_folder = root_folder + data_access_factory: DataAccessFactoryBase = DataAccessFactory() + daf_args = [] + if use_s3: + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + s3_config = { + 
"input_folder": root_folder, + "output_folder": root_folder, + } + daf_args.append("--data_s3_cred") + daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) + daf_args.append("--data_s3_config") + daf_args.append(ParamsUtils.convert_to_ast(s3_config)), + else: + local_config = { + "input_folder": root_folder, + "output_folder": os.path.join(root_folder, "bands_consolidated"), + } + daf_args.append("--data_local_config") + daf_args.append(ParamsUtils.convert_to_ast(local_config)) + daf_parser = argparse.ArgumentParser() + data_access_factory.add_input_params(parser=daf_parser) + data_access_factory_args = daf_parser.parse_args(args=daf_args) + data_access_factory.apply_input_params(args=data_access_factory_args) + + return data_access_factory + + def orchestrate( + self, runtime_config: dict, execution_config: dict, data_access_factory: DataAccessFactoryBase, data_type: str + ) -> int: + """ + orchestrator for transformer execution + :param execution_config: orchestrator configuration + :param data_access_factory: data access factory + :param runtime_config: transformer runtime configuration + :return: 0 - success or 1 - failure + """ + start_time = time.time() + start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info(f"orchestrator started at {start_ts}") + data_access = data_access_factory.create_data_access() + # initialize Spark + spark_session = self._init_spark() + sc = spark_session.sparkContext + transform_config = sc.broadcast(runtime_config) + daf = sc.broadcast(data_access_factory) + data_type = data_type + print("data_type") + print(data_type) + + def process_partition(iterator): + """ + process partitions + :param iterator: iterator of records + :return: + """ + # local statistics dictionary + stats = {} + # create file processor + file_processor = FileCopyUtil( + data_access_factory=daf.value, + config=transform_config.value, + stats=stats, + ) + for f in iterator: + stats = file_processor.copy_data(subfolder_name=f[0], 
data_type=data_type) + # return partition's statistics + return list(stats.items()) + + num_partitions = 0 + try: + if data_type == "bands": + # Get files to process + files = [ + f"band={band}/segment={segment}" + for band in range(self.num_bands) + for segment in range(self.num_segments) + ] + elif data_type == "docs_to_remove": + files = ["docs_to_remove"] + print(data_type) + + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}") + # process data + logger.debug("Begin processing files") + source_rdd = sc.parallelize(files, execution_config.get("parallelization")) + num_partitions = source_rdd.getNumPartitions() + logger.info(f"Parallelizing execution. Using {num_partitions} partitions") + stats_rdd = source_rdd.zipWithIndex().mapPartitions(process_partition) + # build overall statistics + stats = dict(stats_rdd.reduceByKey(lambda a, b: a + b).collect()) + return_code = 0 + status = "success" + except Exception as e: + # process execution exception + logger.error(f"Exception during execution {e}: {traceback.print_exc()}") + return_code = 1 + status = "failure" + stats = {} + try: + # build and save metadata + logger.debug("Building job metadata") + input_params = runtime_config + # input_params = runtime_config.get_transform_metadata() | execution_config.get_input_params() + metadata = { + "job details": { + "start_time": start_ts, + "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "status": status, + }, + "job_input_params": input_params | data_access_factory.get_input_params(), + "execution_stats": { + "num partitions": num_partitions, + "execution time, min": (time.time() - start_time) / 60, + }, + "job_output_stats": stats, + } + logger.debug(f"Saving job metadata: {metadata}.") + + if data_access_factory.s3_config is not None: + _, root_folder = self.root_folder.split("://") + in_path = os.path.join(root_folder, "bands") + out_path = os.path.join(root_folder, 
"bands_consolidated") + data_access.input_folder = f"{in_path}{os.sep}" + data_access.output_folder = f"{out_path}{os.sep}" + else: + data_access.input_folder = os.path.join(self.root_folder, "bands") + data_access.output_folder = os.path.join(self.root_folder, "bands_consolidated") + data_access.save_job_metadata(metadata) + logger.debug("Saved job metadata.") + return return_code + except Exception as e: + logger.error(f"Exception during execution {e}: {traceback.print_exc()}") + return 1 + finally: + # stop spark context at the end. Required for running multiple tests + spark_session.stop() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--root_folder", + type=str, + default="/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_second/", + help="root folder", + ) + parser.add_argument( + "--num_bands", + type=int, + default=14, + help="number of bands", + ) + parser.add_argument( + "--num_segments", + type=int, + default=2, + help="number of segments", + ) + parser.add_argument( + "--data_type", + type=str, + default="docs_to_remove", + help="bands or doc2remove", + ) + parser.add_argument( + "--parallelization", + type=int, + default=-1, + help="spark parallelization", + ) + parser.add_argument( + "--use_s3", + type=bool, + default=False, + help="use s3", + ) + args = parser.parse_args() + fcs = FileCopySpark(args.root_folder, args.num_bands, args.num_segments, args.use_s3) + data_access_factory = fcs.create_data_access_factory(args.root_folder, args.use_s3) + app_config = {"root_folder": args.root_folder} + execution_config = {"parallelization": args.parallelization} if args.parallelization > 0 else {} + status = fcs.orchestrate(app_config, execution_config, data_access_factory, args.data_type) + print(f"Orchestrate concluded with status {status}") diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py 
b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py new file mode 100644 index 000000000..6d0e090e4 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -0,0 +1,205 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import argparse +import logging +import os +import sys +from typing import Union + +import polars as pl +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from file_copy_util import FileCopyUtil +from file_copy_util_spark import FileCopySpark +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +args_map = { + "minhash": [ + "document_id_column", + "contents_column", + "seed", + "num_permutations", + "num_bands", + "num_minhashes_per_band", + "jaccard_similarity_threshold", + "word_shingle_size", + "num_segments", + ], + "copyutil": [ + "subfolder_name", + "data_type", + "num_bands", + "num_segments", + "parallelization", + "use_s3", + ], + "cluster": [ + 
"jaccard_similarity_threshold", + ], + "fdclean": [ + "document_id_column", + "duplicate_list_location", + ], +} + + +def get_arguments(in_args: argparse.Namespace, module_name: str) -> Union[list, dict]: + sys_argv = ["python"] + in_args_dict = vars(in_args) + if in_args.use_s3: + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + all_module_arguments = args_map.get(module_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + if module_name == "copyutil": + copy_util_config = {k: v for k, v in passed_args.items()} + copy_util_config["root_folder"] = in_args_dict["output_folder"] + return copy_util_config + else: + for k, v in passed_args.items(): + sys_argv.append(f"--{module_name}_{k}") + sys_argv.append(str(v)) + if module_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"]) + elif module_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands_consolidated") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif module_name == "fdclean": + if f"--{module_name}_duplicate_list_location" not in sys_argv: + sys_argv.append(f"--{module_name}_duplicate_list_location") + sys_argv.append( + os.path.join( + in_args_dict["output_folder"], + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + else: + logging.error(f"Unknown module name: {module_name}") + data_io = { + "input_folder": input_folder, + "output_folder": output_folder, + } + if in_args.use_s3: + sys_argv.append("--data_s3_config") + else: + sys_argv.append("--data_local_config") + sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + return sys_argv + + +def parse_arguments(): + parser = argparse.ArgumentParser() + 
parser.add_argument("--input_folder", type=str, required=True, help="path to read the input files") + parser.add_argument("--output_folder", type=str, required=True, help="path to write the output files") + parser.add_argument( + "--use_s3", type=bool, required=False, default=False, help="if true, use S3, if false use local FS" + ) + parser.add_argument( + "--contents_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument( + "--document_id_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument( + "--num_permutations", type=int, required=True, help="number of permutations to use for minhash calculation" + ) + parser.add_argument( + "--num_bands", type=int, required=True, help="number of bands to use for band hash calculation" + ) + parser.add_argument( + "--num_minhashes_per_band", type=int, required=True, help="number of minhashes to use in each band" + ) + parser.add_argument( + "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" + ) + parser.add_argument( + "--jaccard_similarity_threshold", + type=float, + required=False, + help="jaccard similarity threshold above which two documents are similar", + ) + parser.add_argument( + "--num_segments", + type=int, + required=True, + help="number of segments to divide each band hash interval (to improve scalability)", + ) + parser.add_argument("--parallelization", type=int, required=False, default=-1, help="spark parallelization") + parser.add_argument( + "--duplicate_list_location", + type=str, + required=False, + help="path to the file with all the duplicate document ids", + ) + return parser.parse_args() + + +if __name__ == "__main__": + # configure logging + logging.basicConfig( + format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] - 
%(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + args = parse_arguments() + sys.argv = get_arguments(args, "minhash") + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Signature calculation concluded with status {status}") + + fcs_config = get_arguments(args, "copyutil") + + root_folder = fcs_config["root_folder"] + parallelization = fcs_config["parallelization"] + fcs = FileCopySpark(root_folder, fcs_config["num_bands"], fcs_config["num_segments"], args.use_s3) + data_access_factory = fcs.create_data_access_factory(root_folder, args.use_s3) + app_config = {"root_folder": root_folder} + execution_config = {"parallelization": parallelization} if parallelization > 0 else {} + status = fcs.orchestrate(app_config, execution_config, data_access_factory, data_type="bands") + logging.info(f"Consolidate bands concluded with status {status}") + + sys.argv = get_arguments(args, "cluster") + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Cluster analysis concluded with status {status}") + + stats = {} + fcu_config = get_arguments(args, "copyutil") + fcu = FileCopyUtil(data_access_factory=data_access_factory, config=fcu_config, stats=stats) + fcu.copy_data(subfolder_name="docs_to_remove", data_type="docs_to_remove") + + sys.argv = get_arguments(args, "fdclean") + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Data cleanup concluded with status {status}") diff --git a/transforms/universal/fdedup/spark/src/requirements.txt b/transforms/universal/fdedup/spark/src/requirements.txt new 
file mode 100644 index 000000000..c1a1f2c3d --- /dev/null +++ b/transforms/universal/fdedup/spark/src/requirements.txt @@ -0,0 +1,8 @@ +pyspark +pyarrow +pyyaml +boto3 +kubernetes +disjoint_set +mmh3 +scipy diff --git a/transforms/universal/fdedup/spark/src/signature_calc_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_spark.py new file mode 100644 index 000000000..0e7046549 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_spark.py @@ -0,0 +1,35 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +import polars as pl +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +if __name__ == "__main__": + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py new file mode 100644 index 000000000..4e39810c6 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Signature Calculation + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup signature calculation transform") + # Launch the spark worker(s) to process the input + launcher.launch() From 3349521bdfe3b1d95d8160cf442b722988c344be Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 19:05:39 +0100 Subject: [PATCH 07/91] added folder_transform --- .../pure_python/transform_file_processor.py | 15 ++++-- .../pure_python/transform_orchestrator.py | 42 ++++++++++------ .../runtime/transform_file_processor.py | 41 ++++++++------- .../src/data_processing/transform/__init__.py | 2 + .../transform/abstract_transform.py | 16 ++++++ .../transform/binary_transform.py | 5 +- .../transform/folder_transform.py | 50 +++++++++++++++++++ .../runtime/ray/transform_file_processor.py | 1 + .../runtime/ray/transform_orchestrator.py | 19 ++++--- .../runtime/spark/transform_file_processor.py | 5 +- .../runtime/spark/transform_orchestrator.py | 25 +++++++--- 11 files changed, 168 insertions(+), 53 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/transform/abstract_transform.py create mode 100644 
data-processing-lib/python/src/data_processing/transform/folder_transform.py diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index 143835dd0..fa3e69e4a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -14,7 +14,7 @@ from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime import AbstractTransformFileProcessor -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractTransform, TransformStatistics from data_processing.utils import UnrecoverableException @@ -28,7 +28,8 @@ def __init__( data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ): """ Init method @@ -36,11 +37,13 @@ def __init__( :param statistics - reference to statistics class :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag """ # invoke superclass super().__init__( data_access_factory=data_access_factory, transform_parameters=dict(transform_params), + is_folder=is_folder, ) self.transform_params["statistics"] = statistics # Create local processor @@ -52,7 +55,8 @@ def __init__( # Create statistics self.stats = statistics - def _publish_stats(self, stats: dict[str, Any]) -> None: + +def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) @@ -65,17 +69,20 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + 
transform_class: type[AbstractTransform], + is_folder: bool ): """ Init method :param data_access_factory - data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder tranform flag """ super().__init__( data_access_factory=data_access_factory, transform_parameters=dict(transform_params), + is_folder=is_folder, ) # Add data access and statistics to the processor parameters self.transform_params["data_access"] = self.data_access diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 8692da29e..153eaaf0a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,7 +24,7 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger @@ -48,8 +48,6 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } - - def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -74,15 +72,21 @@ def orchestrate( return 1 # create additional execution parameters runtime = runtime_config.create_transform_runtime() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - if retries > 0: - statistics.add_stats({"data access retries": retries}) - logger.info(f"Number of files is 
{len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + if retries > 0: + statistics.add_stats({"data access retries": retries}) + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -99,6 +103,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) else: # using sequential execution @@ -111,6 +116,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) status = "success" return_code = 0 @@ -157,7 +163,8 @@ def _process_transforms( data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ) -> None: """ Process transforms sequentially @@ -167,9 +174,8 @@ def _process_transforms( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag :return: metadata for the execution - - :return: None """ # create executor executor = PythonTransformFileProcessor( @@ -177,6 +183,7 @@ def _process_transforms( statistics=statistics, transform_params=transform_params, transform_class=transform_class, + is_folder=is_folder, ) # process data t_start = time.time() @@ -203,6 +210,7 @@ def _process_transforms_multiprocessor( 
data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractBinaryTransform], + is_folder: bool ) -> TransformStatistics: """ Process transforms using multiprocessing pool @@ -212,13 +220,17 @@ def _process_transforms_multiprocessor( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform class :return: metadata for the execution """ # result statistics statistics = TransformStatistics() # create processor processor = PythonPoolTransformFileProcessor( - data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class + data_access_factory=data_access_factory, + transform_params=transform_params, + transform_class=transform_class, + is_folder=is_folder, ) completed = 0 t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index d4ec548d8..1d268875f 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -26,11 +26,13 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_parameters: dict[str, Any], + is_folder: bool = False, ): """ Init method :param data_access_factory: Data Access Factory :param transform_parameters: Transform parameters + :param is_folder: folder transform flag """ self.logger = get_logger(__name__) # validate parameters @@ -46,6 +48,7 @@ def __init__( # Add data access and statistics to the processor parameters self.transform_params = transform_parameters self.transform_params["data_access"] = self.data_access + self.is_folder = is_folder def process_file(self, f_name: str) -> None: """ @@ -58,25 +61,29 @@ def process_file(self, f_name: str) -> 
None: self.logger.warning("No data_access found. Returning.") return t_start = time.time() - # Read source file - filedata, retries = self.data_access.get_file(path=f_name) - if retries > 0: - self._publish_stats({"data access retries": retries}) - if filedata is None: - self.logger.warning(f"File read resulted in None for {f_name}. Returning.") - self._publish_stats({"failed_reads": 1}) - return - self._publish_stats({"source_files": 1, "source_size": len(filedata)}) + if not self.is_folder: + # Read source file only if we are processing file + filedata, retries = self.data_access.get_file(path=f_name) + if retries > 0: + self._publish_stats({"data access retries": retries}) + if filedata is None: + self.logger.warning(f"File read resulted in None for {f_name}. Returning.") + self._publish_stats({"failed_reads": 1}) + return + self._publish_stats({"source_files": 1, "source_size": len(filedata)}) # Process input file try: - # execute local processing - name_extension = TransformUtils.get_file_extension(f_name) self.logger.debug(f"Begin transforming file {f_name}") - out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + if not self.is_folder: + # execute local processing + out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + name_extension = TransformUtils.get_file_extension(f_name) + self.last_file_name = name_extension[0] + self.last_file_name_next_index = None + self.last_extension = name_extension[1] + else: + out_files, stats = self.transform.transform(folder_name=f_name) self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") - self.last_file_name = name_extension[0] - self.last_file_name_next_index = None - self.last_extension = name_extension[1] # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) # Process unrecoverable exceptions @@ -95,10 +102,10 @@ def flush(self) -> None: the hook for them to return back locally stored 
data and their statistics. :return: None """ - if self.last_file_name is None: + if self.last_file_name is None or self.is_folder: # for some reason a given worker never processed anything. Happens in testing # when the amount of workers is greater than the amount of files - self.logger.debug("skipping flush, no name for file is defined") + self.logger.debug("skipping flush, no name for file is defined or this is a folder transform") return try: t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 6af43ad60..20254e47b 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -1,3 +1,5 @@ +from data_processing.transform.abstract_transform import AbstractTransform +from data_processing.transform.folder_transform import AbstractFolderTransform from data_processing.transform.binary_transform import AbstractBinaryTransform from data_processing.transform.table_transform import AbstractTableTransform from data_processing.transform.transform_statistics import TransformStatistics diff --git a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py new file mode 100644 index 000000000..89db70f42 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py @@ -0,0 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +class AbstractTransform: + """ + Base class for all transform types + """ \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/binary_transform.py b/data-processing-lib/python/src/data_processing/transform/binary_transform.py index 80dff61ea..b313aff2f 100644 --- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py @@ -10,10 +10,11 @@ # limitations under the License. ################################################################################ -from typing import Any, TypeVar +from typing import Any +from data_processing.transform import AbstractTransform -class AbstractBinaryTransform: +class AbstractBinaryTransform(AbstractTransform): """ Converts input binary file to output file(s) (binary) Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py new file mode 100644 index 000000000..866e3286f --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any +from data_processing.data_access import data_access +from data_processing.transform import AbstractTransform + + +class AbstractFolderTransform(AbstractTransform): + """ + Converts input folder to output file(s) (binary) + Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or + more new binary files and metadata. + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This simply stores the given instance in this instance for later use. + """ + self.config = config + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + """ + Converts input folder into o or more output files. + If there is an error, an exception must be raised - exit()ing is not generally allowed. + :param folder_name: the name of the folder containing arbitrary amount of files. + :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated + to metadata. Each element of the return list, is a tuple of the transformed bytes and a string + holding the extension to be used when writing out the new bytes. + """ + raise NotImplemented() + + @staticmethod + def get_folders(data_access:data_access) -> list(str): + """ + Compute the list of folders to use. 
+ :param data_access - data access class + :return: + """ + raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py index e1fabb144..cdad1309f 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py @@ -35,6 +35,7 @@ def __init__(self, params: dict[str, Any]): super().__init__( data_access_factory=params.get("data_access_factory", None), transform_parameters=dict(params.get("transform_params", {})), + is_folder=params.get("is_folder", False) ) # Create statistics self.stats = params.get("statistics", None) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 42eba47a6..8276eb56c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -16,6 +16,7 @@ import ray from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import AbstractFolderTransform from data_processing_ray.runtime.ray import ( RayTransformExecutionConfiguration, RayTransformFileProcessor, @@ -56,13 +57,18 @@ def orchestrate( # create transformer runtime runtime = runtime_config.create_transform_runtime() resources = RayUtils.get_cluster_resources() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: 
+ # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -84,6 +90,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), "statistics": statistics, + "is_folder": is_folder, } logger.debug("Creating actors") processors = RayUtils.create_actors( diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index d63664ac4..a0968ab1d 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -29,12 +29,15 @@ def __init__( data_access_factory: DataAccessFactoryBase, runtime_configuration: SparkTransformRuntimeConfiguration, statistics: TransformStatistics, + is_folder: bool, ): """ Init method """ super().__init__( - data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params() + data_access_factory=data_access_factory, + transform_parameters=runtime_configuration.get_transform_params(), + is_folder=is_folder, ) # Add data access ant statistics to the processor parameters self.runtime_configuration = runtime_configuration diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c279f2b73..c534b685f 100644 --- 
a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -18,7 +18,7 @@ import yaml from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics +from data_processing.transform import TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger from data_processing_spark.runtime.spark import ( SparkTransformExecutionConfiguration, @@ -117,7 +117,10 @@ def process_partition(iterator): runtime = runtime_conf.create_transform_runtime() # create file processor file_processor = SparkTransformFileProcessor( - data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics + data_access_factory=d_access_factory, + runtime_configuration=runtime_conf, + statistics=statistics, + is_folder=is_folder, ) first = True for f in iterator: @@ -144,13 +147,19 @@ def process_partition(iterator): return list(statistics.get_execution_stats().items()) num_partitions = 0 + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # process data logger.debug("Begin processing files") # process files split by partitions 
From 0553edf9d5a6d9507a470927b14f5c65b7ec8773 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 19:13:01 +0100 Subject: [PATCH 08/91] added folder_transform --- .../runtime/pure_python/transform_orchestrator.py | 2 +- .../python/src/data_processing/transform/folder_transform.py | 4 ++-- .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +- .../runtime/spark/transform_orchestrator.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 153eaaf0a..d51f80a8a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. 
- :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c534b685f..4a0897952 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From a53412ecb5a00535dd85c56939c2d2fa4542c14a Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 09/91] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- 
.../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def _publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 
+76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, 
data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. 
- :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From 9c3ace785b9a529e047df93ed9e65d27bf3d7ba0 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 
Oct 2024 08:48:00 +0100 Subject: [PATCH 10/91] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 4a0897952..096fab272 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,8 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index 7b968b1e9..7410d09d1 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, 
partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From 7091a2e6087c77d5b204c803917f97b60d974310 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 15:35:00 +0100 Subject: [PATCH 11/91] added noop testing --- .../runtime/transform_file_processor.py | 44 +++++--- .../test_support/transform/__init__.py | 13 ++- .../transform/noop_folder_transform.py | 105 ++++++++++++++++++ .../test_support/transform/noop_transform.py | 6 +- .../transform/folder_transform.py | 2 +- .../transform/transform_configuration.py | 6 +- .../transform/test_folders_noop.py | 33 ++++++ .../launch/ray/ray_test_noop_launch.py | 6 - .../ededup/ray/src/ededup_transform_ray.py | 9 +- 9 files changed, 187 insertions(+), 37 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index 1d268875f..4075f40be 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -83,6 +83,7 @@ def process_file(self, f_name: str) -> None: self.last_extension = name_extension[1] else: out_files, stats = self.transform.transform(folder_name=f_name) + self.last_file_name = f_name self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) @@ -148,15 +149,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats ) case 1: # we have exactly 1 output file - file_ext = out_files[0] - lfn = self.last_file_name - if 
self.last_file_name_next_index is not None: - lfn = f"{lfn}_{self.last_file_name_next_index}" - output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}") + if self.is_folder: + # its folder + output_name = out_files[0][1] + dt = out_files[0][0] + else: + file_ext = out_files[0] + lfn = self.last_file_name + if self.last_file_name_next_index is not None: + lfn = f"{lfn}_{self.last_file_name_next_index}" + output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}") + dt = file_ext[0] self.logger.debug( f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}" ) - save_res, retries = self.data_access.save_file(path=output_name, data=file_ext[0]) + save_res, retries = self.data_access.save_file(path=output_name, data=dt) if retries > 0: self._publish_stats({"data access retries": retries}) if save_res is None: @@ -166,7 +173,7 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats self._publish_stats( { "result_files": 1, - "result_size": len(file_ext[0]), + "result_size": len(dt), "processing_time": time.time() - t_start, } ) @@ -183,14 +190,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats start_index = 0 count = len(out_files) for index in range(count): - file_ext = out_files[index] - output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}" - file_sizes += len(file_ext[0]) - self.logger.debug( - f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} " - f"of {count} to {output_name_indexed}" - ) - save_res, retries = self.data_access.save_file(path=output_name_indexed, data=file_ext[0]) + if self.is_folder: + # its a folder + output_name_indexed = out_files[index][1] + dt = out_files[index][0] + else: + # files + file_ext = out_files[index] + output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}" + self.logger.debug( + f"Writing transformed file 
{self.last_file_name}{self.last_extension}, {index + 1} " + f"of {count} to {output_name_indexed}" + ) + dt = file_ext[0] + file_sizes += len(dt) + save_res, retries = self.data_access.save_file(path=output_name_indexed, data=dt) if retries > 0: self._publish_stats({"data access retries": retries}) if save_res is None: diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py index 0e90f7ffd..04d6f3b0f 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py @@ -1,6 +1,11 @@ -from .table_transform_test import AbstractTableTransformTest -from .binary_transform_test import AbstractBinaryTransformTest -from .noop_transform import ( +from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest +from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest +from data_processing.test_support.transform.noop_transform import ( NOOPTransform, - NOOPPythonTransformConfiguration, + NOOPTransformConfiguration, + NOOPPythonTransformConfiguration ) +from data_processing.test_support.transform.noop_folder_transform import ( + NOOPFolderTransform, + NOOPFolderPythonTransformConfiguration +) \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py new file mode 100644 index 000000000..5baab7858 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py @@ -0,0 +1,105 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import time +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, + DefaultPythonTransformRuntime) +from data_processing.transform import AbstractFolderTransform +from data_processing.utils import get_logger +from data_processing.test_support.transform import NOOPTransformConfiguration + + +logger = get_logger(__name__) + + +class NOOPFolderTransform(AbstractFolderTransform): + """ + Implements a simple copy of a pyarrow Table. + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, NOOPTransformRuntime. If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. + """ + # Make sure that the param name corresponds to the name used in apply_input_params method + # of NOOPTransformConfiguration class + super().__init__(config) + self.sleep = config.get("sleep_sec", 1) + self.data_access = config.get("data_access") + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + """ + Converts input folder into o or more output files. + If there is an error, an exception must be raised - exit()ing is not generally allowed. 
+ :param folder_name: the name of the folder containing arbitrary amount of files. + :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated + to metadata. Each element of the return list, is a tuple of the transformed bytes and a string + holding the file name to use. + """ + logger.debug(f"Transforming one folder {folder_name}") + metadata = {} + # get folder files + files, retries = self.data_access.get_folder_files(path=folder_name) + if retries > 0: + metadata |= {"data access retries": retries} + result = [()] * len(files) + index = 0 + for name, file in files.items(): + result[index] = (file, self.data_access.get_output_location(name)) + if self.sleep is not None: + logger.info(f"Sleep for {self.sleep} seconds") + time.sleep(self.sleep) + logger.info("Sleep completed - continue") + index += 1 + # Add some sample metadata. + metadata |= {"nfiles": len(files)} + return result, metadata + + +class NOOPFolderPythonRuntime(DefaultPythonTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py index 0dee013a4..2fea35506 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python.runtime_configuration import ( PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform from data_processing.utils import CLIArgumentProvider, get_logger @@ -75,10 +75,10 @@ class NOOPTransformConfiguration(TransformConfiguration): configuration with CLI args. 
""" - def __init__(self): + def __init__(self, clazz: type[AbstractTransform] = NOOPTransform): super().__init__( name=short_name, - transform_class=NOOPTransform, + transform_class=clazz, remove_from_metadata=[pwd_key], ) diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 9a2fb3713..caa3bfa52 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -35,6 +35,6 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str :param folder_name: the name of the folder containing arbitrary amount of files. :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated to metadata. Each element of the return list, is a tuple of the transformed bytes and a string - holding the extension to be used when writing out the new bytes. + holding the file name to use. 
""" raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py index 033e92f2a..a5c9ec9ad 100644 --- a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py @@ -13,7 +13,7 @@ from argparse import ArgumentParser from typing import Any -from data_processing.transform import AbstractBinaryTransform +from data_processing.transform import AbstractTransform from data_processing.utils import CLIArgumentProvider @@ -23,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider): """ def __init__( - self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = [] + self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = [] ): """ Initialization @@ -36,7 +36,7 @@ def __init__( self.remove_from_metadata = remove_from_metadata self.params = {} - def get_transform_class(self) -> type[AbstractBinaryTransform]: + def get_transform_class(self) -> type[AbstractTransform]: """ Get the class extending AbstractBinaryTransform which implements a specific transformation. The class will generally be instantiated with a dictionary of configuration produced by diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py new file mode 100644 index 000000000..e0fdd86c8 --- /dev/null +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.transform import NOOPFolderPythonTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../test-data/data_processing/python/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")] + return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py index d4cc874f0..e706a4dfa 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py @@ -12,7 +12,6 @@ import os -import pyarrow as pa from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -20,11 +19,6 @@ from data_processing_ray.test_support.transform import NOOPRayTransformConfiguration -table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) -expected_table = table # We're a noop after all. -expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result - - class TestRayNOOPTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. 
diff --git a/transforms/universal/ededup/ray/src/ededup_transform_ray.py b/transforms/universal/ededup/ray/src/ededup_transform_ray.py index c0823a22e..d90dfa780 100644 --- a/transforms/universal/ededup/ray/src/ededup_transform_ray.py +++ b/transforms/universal/ededup/ray/src/ededup_transform_ray.py @@ -149,13 +149,12 @@ def _load_snapshots(self, data_access_factory: DataAccessFactoryBase, statistics statistics.add_stats.remote({"data access retries": retries}) self.logger.info(f"Found the following snapshot files {files.keys()}") # process snapshot files - for file in files.keys(): - # load the file + for file in files.values(): + # convert the file try: - b_hashes, _ = data_access.get_file(file) - snaps = pickle.loads(b_hashes) + snaps = pickle.loads(file) except Exception as e: - self.logger.warning(f"Failed to load hashes from file {file} with exception {e}") + self.logger.warning(f"Failed to load hashes with exception {e}") raise UnrecoverableException("failed to load hashes") request = [[] for _ in range(len(self.filters))] for h in snaps: From 680c78ac3f38724dfcf646673aae2ac3661107be Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 11 Oct 2024 10:47:42 -0400 Subject: [PATCH 12/91] Fuzzy dedup ray implementation Signed-off-by: nelson --- .../universal/fdedup/ray/pyproject.toml | 10 +- .../ray/src/cluster_analysis_local_ray.py | 51 ++ .../ray/src/cluster_analysis_transform_ray.py | 42 + .../fdedup/ray/src/compute_shingles.py | 50 -- ...ocal_ray.py => data_cleaning_local_ray.py} | 61 +- .../ray/src/data_cleaning_transform_ray.py | 120 +++ .../universal/fdedup/ray/src/fdedup_s3_ray.py | 76 -- .../fdedup/ray/src/fdedup_support.py | 621 -------------- .../fdedup/ray/src/fdedup_transform_ray.py | 803 ------------------ .../ray/src/signature_calc_local_ray.py | 54 ++ .../ray/src/signature_calc_transform_ray.py | 42 + 11 files changed, 340 insertions(+), 1590 deletions(-) create mode 100644 transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py create mode 
100644 transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py delete mode 100644 transforms/universal/fdedup/ray/src/compute_shingles.py rename transforms/universal/fdedup/ray/src/{fdedup_local_ray.py => data_cleaning_local_ray.py} (59%) create mode 100644 transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_s3_ray.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_support.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/src/signature_calc_local_ray.py create mode 100644 transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 3f2c8ba51..e2a2d34c9 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,20 +1,18 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2.dev0" +version = "0.3.0.dev0" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ + "dpk_fdedup_transform_python==0.3.0.dev0", "data-prep-toolkit-ray==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", "tqdm==4.66.3", - "scipy==1.12.0" ] [build-system] diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py new file mode 100644 index 000000000..25b96788d --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -0,0 +1,51 @@ +# 
(C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py new file mode 100644 index 000000000..970686e13 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + logger.info("Launching transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/compute_shingles.py b/transforms/universal/fdedup/ray/src/compute_shingles.py deleted file mode 100644 index 2db75ebe2..000000000 --- a/transforms/universal/fdedup/ray/src/compute_shingles.py +++ /dev/null @@ -1,50 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import string - - -""" -This implements the most simplistic splitting of document based on the white spaces -that can be overwritten by a different document splitter (tokenizer). This method is -build in the library and can be overwritten using approach described at -https://stackoverflow.com/questions/37553545/how-do-i-override-a-function-of-a-python-library - -import compute_shingles -compute_shingles.compute_shingles = my_local_compute_shingles -""" - - -def _find(s: str, ch: str) -> list[int]: - """ - Get indexes of all locations of character in string - :param s: string - :param ch: character - :return: list of locations - """ - return [i for i, ltr in enumerate(s) if ltr == ch] - - -def compute_shingles(txt: str, word_shingle_size: int, delimiter: str = " ") -> list[str]: - """ - Generate word shingles - :param txt: document - :param delimiter: delimiter to split document - :param word_shingle_size: size of shingle in words - :return: list of shingles - """ - text = txt.replace("\n", "").lower().translate(str.maketrans("", "", string.punctuation)) - separators = _find(text, delimiter) - if len(separators) + 1 <= word_shingle_size: - return [text] - bounds = [-1] + separators + [len(text)] - return [text[bounds[i] + 1 : bounds[i + word_shingle_size]] for i in range(0, len(bounds) - word_shingle_size)] diff --git a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py similarity index 59% rename from transforms/universal/fdedup/ray/src/fdedup_local_ray.py rename to transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index af7bec71c..54fa2ccac 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -13,59 +13,52 @@ import os import sys +from data_cleaning_transform import ( + document_id_column_cli_param, + 
duplicate_list_location_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ) +) worker_options = {"num_cpus": 0.8} + code_location = {"github": "github", "commit_hash": "12345", "path": "path"} params = { # where to run "run_locally": True, # Data access. 
Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 1, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_creation_delay": 0, "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, } -sys.argv = ParamsUtils.dict_to_req(d=params) -# launch -launcher.launch() + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py new file mode 100644 index 000000000..9fdb220f7 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -0,0 +1,120 @@ +# (C) Copyright IBM 
Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +import ray +from data_cleaning_transform import ( + DataCleaningTransform, + DataCleaningTransformConfiguration, + docs2remove_list, + docs2remove_list_key, + get_docs_to_remove, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, +) +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from ray.actor import ActorHandle + + +logger = get_logger(__name__) + + +class DataCleaningRayTransform(DataCleaningTransform): + """ """ + + def __init__(self, config: dict): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, LangSelectorTransformRuntime. If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. + """ + docs2remove = config.get(docs2remove_list_key, None) + if docs2remove is not None: + # This is recommended for production approach. 
In this case domain list is build by the + # runtime once, loaded to the object store and can be accessed by actors without additional reads + try: + + config[docs2remove_list_key] = ray.get(config.get(docs2remove_list_key)) + except Exception as e: + self.logger.warning(f"Exception loading languages list from ray object storage {e}") + raise RuntimeError(f"exception loading from object storage for key {docs2remove}") + super().__init__(config) + + +class DataCleaningRuntime(DefaultRayTransformRuntime): + """ + Ingest Data cleaning runtime support + """ + + def __init__(self, params: dict[str, Any]): + """ + Create filter runtime + :param params: parameters, that should include + ingest_supported_langs_file_key: supported languages file + ingest_detect_programming_lang_key: whether to detect programming language + ingest_domain_key: domain + ingest_snapshot_key: snapshot + """ + super().__init__(params) + from data_processing.utils import get_logger + + self.logger = get_logger(__name__) + + def get_transform_config( + self, + data_access_factory: DataAccessFactoryBase, + statistics: ActorHandle, + files: list[str], + ) -> dict[str, Any]: + """ + Set environment for filter execution + :param data_access_factory - data access factory + :param statistics - reference to the statistics object + :param files - list of files to remove + :return: dictionary of filter init params + """ + docs_to_remove = get_docs_to_remove(self.params) + docs_to_remove_list = ray.put(docs_to_remove) + return {docs2remove_list_key: docs_to_remove_list} | self.params + + +class DataCleaningRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(transform_class=DataCleaningRayTransform), + runtime_class=DataCleaningRuntime, + ) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + logger.info("Launching transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py b/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py deleted file mode 100644 index 285fcfa22..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py +++ /dev/null @@ -1,76 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import sys - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) -# create parameters -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} - -s3_conf = { - "input_folder": "test/fdedup/input", - "output_folder": "test/fdedup/output", -} -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -params = { - # where to run - "run_locally": True, - # Data access. Only required parameters are specified - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 5, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 2, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, -} -sys.argv = ParamsUtils.dict_to_req(d=params) - - -# launch 
-launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_support.py b/transforms/universal/fdedup/ray/src/fdedup_support.py deleted file mode 100644 index 60afb84bf..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_support.py +++ /dev/null @@ -1,621 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import pickle -import time -from typing import Any, Iterator, Union - -import numpy as np -import ray -from data_processing.data_access import SnapshotUtils -from data_processing.utils import GB, RANDOM_SEED, TransformUtils, get_logger -from data_processing_ray.runtime.ray import RayUtils -from ray.actor import ActorHandle -from ray.util import ActorPool -from scipy.integrate import quad as integrate - - -NO_SIMILARITY = -1 -REQUEST_LEN = 4096 -LONG_BUCKET = 5000 -LONG_BUCKET_PRINT = 1000 - - -def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, -) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive 
probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt - - -class MurmurMH: - def __init__(self, num_perm: int, seed: int = RANDOM_SEED): - self.seed = seed - self.num_perm = num_perm - self.permutations = self._init_permutations(seed, num_perm) - - def minhash(self, shingle_count: int, shingles: Iterator[str]) -> np.array: - def generator(): - for shingle in shingles: - yield TransformUtils.str_to_int(shingle) - - hash_values = np.fromiter(generator(), dtype=np.uint64, count=shingle_count) - - result = np.zeros(self.permutations.shape, dtype=np.uint32) - for i, perm in enumerate(self.permutations): - result[i] = np.right_shift((perm * hash_values).T, 32).astype(np.uint32).min(axis=0, keepdims=False) - return result - - @staticmethod - def _init_permutations(seed: int, num_perm: int) -> np.array: - # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic - max_int = np.uint64((1 << 64) - 1) - gen = np.random.RandomState(seed) - # get self.num_perm pseudo random numbers between 2 and max_int (excl) - 
permutations = np.array([gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], dtype=np.uint64).T - # make all even pseudo random numbers odd by adding 1 - permutations[permutations % 2 == 0] += 1 - return permutations - - @staticmethod - def jaccard(mh1: np.array, mh2: np.array) -> float: - return np.count_nonzero(mh1 == mh2) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocCollector: - """ - An actor collecting de duped document IDs - """ - - def __init__(self, params: dict[str, Any]): - """ - Initializer - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - self.removed = set() - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.ids = {} - else: - try: - bids, _ = self.data_access.get_file(snapshot) - self.ids = pickle.loads(bids) - except Exception as e: - self.logger.warning(f"Failed to load doc collector {self.actor_id} with exception {e}") - raise e - - def add_documents(self, dr: tuple[list[tuple[int, int]], list[int]]) -> None: - """ - Add documents and removed document - :param dr: documents to keep and documents to remove - :return: - """ - docs = dr[0] - rm = dr[1] - # process documents to remove - for did in rm: - self.ids.pop(did, None) - self.removed.update(rm) - # process documents to keep - for key, val in docs: - if key in self.removed: - continue - if key in self.ids and val == NO_SIMILARITY: - # Do not update existing docs with NO_SIMILARITY - continue - else: - self.ids[key] = val - - def filter(self, docs: list[int]) -> dict[int, int]: - """ - Filter documents - :param docs: documents to filter - :return: documents to keep - """ - result = {} - for doc_id in docs: - r = self.ids.get(doc_id, None) - if r is not None: - result[doc_id] = r - return result - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.ids) - 
self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}docs/doc_collector_{self.actor_id}", b_doc - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot doc collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float, int, float]: - """ - get sizes - :return: number of ids, its memory utilization, number of removed, its memory utilization - """ - return ( - len(self.ids), - TransformUtils.deep_get_size(self.ids) / GB, - len(self.removed), - TransformUtils.deep_get_size(self.removed) / GB, - ) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocsMinHash: - """ - An actor storing min hashes for a doc id - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialize - :param params: parameters - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.docs = {} - else: - try: - bdocs, _ = self.data_access.get_file(snapshot) - self.docs = pickle.loads(bdocs) - except Exception as e: - self.logger.warning(f"Failed to load minhash collector {self.actor_id} with exception {e}") - raise e - - def add_minhashes(self, updates: list[tuple[int, int, np.array]]) -> None: - """ - Add minhashes - :param updates: minhash for doc_id a tuple of doc len and array of hashes - :return: None - """ - for doc_id, length, minhash in updates: - self.docs[doc_id] = np.concatenate(([length], minhash)) - - def get_minhashes(self, doc_ids: list[int]) -> list[tuple[int, int, np.array]]: - """ - Get minhashes for a list of documents - :param doc_ids: list of doc ids - :return: doc id, len, minhashes - """ - result = [] - for doc_id in doc_ids: - info = self.docs.get(doc_id) - if info is not None: - result.append((doc_id, info[0], info[1:])) - return result - - def snapshot(self) -> None: - """ - 
Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.docs) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}minhash/minhash_collector_{self.actor_id}", - b_doc, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot minhash collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get size of used min hashes - :return: number of docs, its memory utilization - """ - return len(self.docs), TransformUtils.deep_get_size(self.docs) / GB - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHash: - """ - Actor storing buckets information - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialization - """ - from ray.util.metrics import Counter - - self.submitter = None - self.n_buckets = 0 - self.bucket_memory = 0 - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.buckets = {} - else: - try: - b_buckets, _ = self.data_access.get_file(snapshot) - self.buckets = pickle.loads(b_buckets) - except Exception as e: - self.logger.warning(f"Failed to load buckets collector {self.actor_id} with exception {e}") - raise e - self.bucket_created_counter = Counter("bucket_created", "Amount of buckets created") - self.long_bucket_submit_counter = Counter("long_bucket_submitted", "Amount of long buckets submitted") - self.short_bucket_submit_counter = Counter("short_bucket_submitted", "Amount of short buckets submitted") - - def add_buckets(self, bck: list[tuple[int, list[int]]]) -> None: - """ - Add additional buckets to hash - :param bck: bucket information - :return: None - """ - for bucket in bck: - b_hash = bucket[0] - buckets_for_hash = self.buckets.get(b_hash) - if buckets_for_hash: - if type(buckets_for_hash) == int: - self.buckets[b_hash] = 
[buckets_for_hash] + bucket[1] - else: - buckets_for_hash.extend(bucket[1]) - else: - if len(bucket[1]) == 1: - self.buckets[b_hash] = bucket[1][0] - else: - self.buckets[b_hash] = bucket[1] - self.bucket_created_counter.inc(1) - - def add_processing_submitter(self, submitter: ActorHandle) -> None: - """ - Add process submitter - :param submitter: reference to submitter - :return: - """ - self.submitter = submitter - - def process_buckets(self) -> None: - """ - Process buckets to generate documents - :return: None - """ - - # Remember usage - self.n_buckets = len(self.buckets) - self.bucket_memory = TransformUtils.deep_get_size(self.buckets) / GB - - # split buckets into short and long. Long buckets can take very long to process - long_buckets = [] - short_buckets = [] - while len(self.buckets) > 0: - doc_id, bucket = self.buckets.popitem() - if type(bucket) == list and len(bucket) > LONG_BUCKET: - # Its long - long_buckets.append(bucket) - else: - short_buckets.append(bucket) - self.logger.info(f"processing buckets {len(long_buckets)} long, {len(short_buckets)} short") - - # process long buckets first - we are submitting them one at a time - for bucket in long_buckets: - if len(bucket) > 2 * LONG_BUCKET: - # For very long buckets, split them - self.logger.info(f"Splitting bucket of length len(bucket) into chunks") - smaller_bucket = [ - bucket[i * LONG_BUCKET : (i + 1) * LONG_BUCKET] - for i in range((len(bucket) + LONG_BUCKET - 1) // LONG_BUCKET) - ] - for b in smaller_bucket: - ray.get(self.submitter.submit_for_processing.remote([b])) - self.long_bucket_submit_counter.inc(1) - else: - ray.get(self.submitter.submit_for_processing.remote([bucket])) - self.long_bucket_submit_counter.inc(1) - self.logger.info("Done submitting long buckets") - - # And now the rest of buckets - bucket_chunks = [short_buckets[i * 100 : (i + 1) * 100] for i in range((len(short_buckets) + 99) // 100)] - for b in bucket_chunks: - ray.get(self.submitter.submit_for_processing.remote(b)) - 
self.short_bucket_submit_counter.inc(len(b)) - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_buckets = pickle.dumps(self.buckets) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}buckets/buckets_collector_{self.actor_id}", - b_buckets, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot buckets collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get buckets resource utilization - :return: number of buckets and memory utilization - """ - return self.n_buckets, self.bucket_memory - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHashProcessor: - """ - Actor for processing buckets - """ - - def __init__(self, params: dict[str, Any]): - """ - Init method - :param params - dictionary of parameters containing the following keys - remote_docs - handles to the remote docs - remote_minhashes - handles to the remote minhashes - mn_min_hash - MurmurMH class - threshold - threshold - statistics - statistics actor - """ - from ray.util.metrics import Counter - - self.threshold = params["threshold"] - self.mn_min_hash = params["mn_min_hash"] - self.remote_docs = params["remote_docs"] - self.remote_minhashes = params["remote_minhashes"] - self.stats = params["statistics"] - self.logger = get_logger(__name__) - self.bucket_processed_counter = Counter("bucket_processed", "Amount of buckets processed") - - def _submit_generated_docs(self, docs: dict[int, int], removed: set[int]) -> None: - """ - Submit generated documents - :param docs: docs to submit - :param removed: removed documents - :return: None - """ - # Remove doc ids that are already removed - for did in removed: - docs.pop(did, None) - # Build remote requests - request = [([], []) for _ in range(len(self.remote_docs))] - for key, value in docs.items(): - req_tuple = request[key % len(self.remote_docs)] - req_tuple[0].append((key, value)) - for did in removed: - 
req_tuple = request[did % len(self.remote_docs)] - req_tuple[1].append(did) - # Submit requests and wait for replies - remote_replies = [] - i = 0 - for req in request: - if len(req[0]) > 0 or len(req[1]) > 0: # Only submit if the request has data - remote_replies.append(self.remote_docs[i].add_documents.remote(req)) - i += 1 - # Process replies - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - # get minhashes and length for docs in the bucket - def _get_minhashes_docs(self, doc_ids: list[int]) -> dict[int, tuple[int, list[int]]]: - """ - Get minhashes for documents by submitting requests to an appropriate doc collectors - :param doc_ids: doc ids - :return: doc ids with hashes - """ - request = [[] for _ in range(len(self.remote_minhashes))] - for value in doc_ids: - request[value % len(self.remote_minhashes)].append(value) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.remote_minhashes[i].get_minhashes.remote(req)) - i += 1 - # Process replies - hashes = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - for r in reply: - hashes[r[0]] = (r[1], r[2]) - remote_replies = not_ready - return hashes - - def process_buckets(self, buckets: list[Union[int, list[int]]]) -> None: - """ - process buckets to generate documents - :param buckets: buckets - :return: none - """ - t_start = time.time() - docs = {} - removed = set() - for bucket in buckets: - if type(bucket) == int: - # This hash has a single document - if bucket not in docs: - docs[bucket] = NO_SIMILARITY - self.bucket_processed_counter.inc(1) - continue - # multiple documents - start = time.time() - bucket_len = len(bucket) - very_long = bucket_len > LONG_BUCKET - - hashes = self._get_minhashes_docs(bucket) - set_list = [] - unvisited = set(bucket) - - # combine similar documents - index = 0 - while 
len(unvisited) > 0: - current_doc_id = unvisited.pop() - current_mh = hashes[current_doc_id][1] - current_set = set() - for other_doc_id in bucket: - if other_doc_id in unvisited: - other_mh = hashes[other_doc_id][1] - if self.mn_min_hash.jaccard(current_mh, other_mh) >= self.threshold: - current_set.add(current_doc_id) - current_set.add(other_doc_id) - unvisited.discard(other_doc_id) - if len(current_set) > 0: - set_list.append(current_set) - index += 1 - if index % LONG_BUCKET_PRINT == 0: - self.logger.info(f"processing very long {bucket_len} bucket, {index} documents so far") - if index > LONG_BUCKET_PRINT: - self.logger.info(f"done processing very long {bucket_len}") - - # process created sets - for current_set in set_list: - for d in current_set: - bucket.remove(d) - removed.update(current_set) - for i, doc_id in enumerate(current_set): - if i == 0: - cluster_id = doc_id - remaining = doc_id - min_len = hashes[doc_id][0] - max_len = min_len - continue - c_len = hashes[doc_id][0] - if c_len > max_len: - max_len = c_len - remaining = doc_id - continue - if c_len <= min_len: - min_len = c_len - cluster_id = doc_id - docs[remaining] = cluster_id - removed.discard(remaining) - - # if we did not find docs in connections, submit them as NO_SIMILARITY - for d in bucket: - if d not in docs: - docs[d] = NO_SIMILARITY - if very_long: - self.logger.info( - f"Processed long ({bucket_len}) bucket in {round((time.time() - start) / 60.,3)} " - f"min; " - f"docs chains {len(set_list)}" - ) - self.bucket_processed_counter.inc(1) - # Submit docs - self._submit_generated_docs(docs, removed) - # peg stats - self.stats.add_stats.remote({"generated doc_ids": len(docs), "bucket processing time": time.time() - t_start}) - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHashProcessorInvoker(object): - """ - Bucket hash processing coordinator (singleton) - """ - - def __init__(self, bucket_processors: list[ActorHandle]) -> None: - self.n_processors = len(bucket_processors) - 
self.pool = ActorPool(bucket_processors) - self.submitted = 0 - self.processed = 0 - self.logger = get_logger(__name__) - self.start = time.time() - - def submit_for_processing(self, buckets: list[Union[int, list[int]]]) -> None: - # Get completed results - if self.submitted < self.n_processors: # still have room - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.logger.debug("Submitted bucket processing request") - self.submitted += 1 - return - else: - while True: - # we can have several workers fail here - try: - self.pool.get_next_unordered() - break - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") - self.logger.debug("Completed bucket processing request") - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.submitted += 1 - self.logger.debug("Submitted bucket processing request") - return - - def wait_for_completion(self) -> None: - self.logger.info(f"Waiting bucket processing completion. Submitted requests {self.submitted}") - while self.pool.has_next(): - try: - self.pool.get_next_unordered() - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py deleted file mode 100644 index 6c6c02bb3..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py +++ /dev/null @@ -1,803 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import random -import time -from argparse import ArgumentParser, Namespace -from typing import Any - -import mmh3 -import numpy as np -import pyarrow as pa -import ray -from data_processing.data_access import DataAccessFactoryBase, SnapshotUtils -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import ( - RANDOM_SEED, - CLIArgumentProvider, - TransformUtils, - str2bool, -) -from data_processing_ray.runtime.ray import ( - DefaultRayTransformRuntime, - RayTransformFileProcessor, - RayTransformLauncher, - RayUtils, -) -from data_processing_ray.runtime.ray.runtime_configuration import ( - RayTransformRuntimeConfiguration, -) -from fdedup_support import ( - REQUEST_LEN, - BucketsHash, - BucketsHashProcessor, - BucketsHashProcessorInvoker, - DocCollector, - DocsMinHash, - MurmurMH, - fuzzy_optimal_param, -) -from ray.actor import ActorHandle -from ray.util import ActorPool - - -short_name = "fdedup" -cli_prefix = f"{short_name}_" - - -class FdedupTransform(AbstractTableTransform): - """ - Implements fuzzy dedup data preprocessor (building tables and minhashes). - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. 
- :param config: initialization parameters, with the following keys - doc_column - name of doc column - doc_id_int_column - name of int doc id column - word_shingle_size - word shingle size - mn_min_hash - MurmurMH class - num_bands - number of bands - length_band band length - remote_buckets - bucket actors - remote_minhashes - minhash actors - delimiter - delimiter - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.word_shingle_size = config.get("word_shingle_size", 1) - self.delimiter = config.get("delimiter", " ") - self.mn_min_hash = config.get("mn_min_hash", None) - self.num_bands = config.get("num_bands", 1) - self.length_band = config.get("length_band", 1) - self.buckets = config.get("remote_buckets", []) - self.minhashes = config.get("remote_minhashes", []) - self.random_delay_limit = config.get("random_delay_limit", 10) - - def _generate_minhashes(self, shingles: list[str]) -> np.array: - """ - Generate minhashes - :param shingles: - :return: generated minhashes - """ - min_hashes = self.mn_min_hash.minhash(len(shingles), shingles) - num_min_hashes = len(min_hashes) - assert self.num_bands * self.length_band <= num_min_hashes, ( - f"num_bans*band_len must be <= num min hashes, was num_bands={self.num_bands}, " - f"bands_len={self.length_band}, num_min hashes={num_min_hashes}" - ) - return min_hashes - - def _generate_buckets(self, min_hashes: np.array) -> list[int]: - """ - Generate buckets - :param min_hashes: array of minhashes - :return: - """ - return [ - mmh3.hash64(min_hashes[i * self.length_band : (i + 1) * self.length_band], seed=RANDOM_SEED, signed=False)[ - 0 - ] - for i in range(self.num_bands) - ] - - def _submit_buckets_minhashes( - self, buckets: dict[int, list[int]], minhashes: list[tuple[int, int, np.array]] - ) -> None: - """ - Submit buckets to hash - :param buckets: buckets - :param minhashes: 
minhashes - :return: None - """ - # bucket requests - request = [[] for _ in range(len(self.buckets))] - for key, value in buckets.items(): - request[key % len(self.buckets)].append((key, value)) - # Submit requests to appropriate bucket collectors - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.buckets[i].add_buckets.remote(req)) - i += 1 - # Minhashes - request = [[] for _ in range(len(self.minhashes))] - for minh in minhashes: - request[minh[0] % len(self.minhashes)].append(minh) - # Submit requests to appropriate minhash collectors - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.minhashes[i].add_minhashes.remote(req)) - i += 1 - # wait for completion - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - Preprocessing table content. 
- :param table: table - :param file_name - name of currently processed file - :return: resulting table, statistics - """ - from compute_shingles import compute_shingles - - def flush(limit: int) -> None: - """ - flushing buckets and minhashes to dedicated actors - :param limit: number of buckets to flush - :return: None - """ - if len(buckets) >= limit: # time to submit - nonlocal num_buckets - nonlocal num_minhashes - self._submit_buckets_minhashes(buckets, minhashes) - num_buckets = num_buckets + len(buckets) - num_minhashes = num_minhashes + len(minhashes) - buckets.clear() - minhashes.clear() - - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # Inner variables - buckets = {} - minhashes = [] - num_buckets = 0 - num_minhashes = 0 - docs = table[self.doc_column] - doc_ids = table[self.doc_id_column] - # for every document/its integer id - for n in range(table.num_rows): - doc = docs[n].as_py() - doc_id = doc_ids[n].as_py() - shingles = compute_shingles(txt=doc, word_shingle_size=self.word_shingle_size, delimiter=self.delimiter) - if len(shingles) > 0: - mh = self._generate_minhashes(shingles) - minhashes.append((doc_id, len(doc), mh)) - candidates = self._generate_buckets(mh) - - for b_hash in candidates: - bucket_array = buckets.get(b_hash) - if bucket_array is None: - buckets[b_hash] = [doc_id] - else: - bucket_array.append(doc_id) - flush(REQUEST_LEN) - flush(0) - # peg stats - stats = {"generated buckets": num_buckets, "generated minhashes": num_minhashes} - time.sleep(int(random.random() * self.random_delay_limit)) - return [], stats - - -class FdedupFilter(AbstractTableTransform): - """ - Filtering documents - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. 
- The dictionary should contain the following: - doc_column - name of doc column - doc_id_int_column - name of int doc id column - cluster_column - name of the cluster column - remote_docs - list of remote doc collectors - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.cluster_column = config.get("cluster_column", "") - self.docs = config.get("remote_docs", "") - self.random_delay_limit = config.get("random_delay_limit", 10) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - De duping (filtering) table content. - :param table: table - :param file_name: name of the currently processing file - :return: resulting table, statistics - """ - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # inner variables - ids = table.column(self.doc_id_column) - # Submit requests to an appropriate doc collectors - request = [[] for _ in range(len(self.docs))] - for value in ids: - doc_id = value.as_py() - request[doc_id % len(self.docs)].append(doc_id) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.docs[i].filter.remote(req)) - i += 1 - # Process replies - unique = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - unique.update(reply) - remote_replies = not_ready - # Filter out table - mask = [] - clusters = [] - # Actual filtering - for n in range(table.num_rows): - doc_id = ids[n].as_py() - if doc_id in unique: - mask.append(True) - clusters.append(unique.pop(doc_id)) - else: - mask.append(False) - # build out table - out_table = TransformUtils.add_column(table=table.filter(mask), name=self.cluster_column, content=clusters) - 
# build execution statistics - stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows} - time.sleep(int(random.random() * self.random_delay_limit)) - return [out_table], stats - - -class FdedupRuntime(DefaultRayTransformRuntime): - """ - Fuzzy dedup runtime support. Here we are using set environment to implement first two steps of fuzzy dedup - processing - preprocessing and bucket hash processing - """ - - def __init__(self, params: dict[str, Any]): - """ - Create filter runtime - :param params: parameters, that should include - doc_column - name of the document column - id_column - name of the integer doc id column - cluster_column - name of the cluster column - worker_options - start options for preprocessor - from the orchestrator configuration - bucket_cpu - number of cpus for bucket actor - doc_cpu - number of cpus for doc actor - mhash_cpu - number of cpus for minhash actor - num_doc_actors - number of document actors - num_bucket_actors - number of bucket actors - num_minhash_actors - number of minhash actors - num_preprocessors - number of preprocessors - snapshot_delay - delay (sec) in sending snapshot requests to actors - use_bucket_snapshot - use bucket snapshot - use_doc_snapshot - use doc snapshot - random_delay_limit - random_delay limit - # fuzzy specific parameters - num_permutations - number of permutations - threshold - threshold - world_shingle_size - word shingles size - delimiters - delimiter - """ - from data_processing.utils import get_logger - - super().__init__(params) - self.logger = get_logger(__name__) - self.sum_buckets = 0 - self.sum_buckets_mem = 0 - self.sum_mh = 0 - self.sum_mh_mem = 0 - self.document_collectors = [] - self.snapshot_delay = self.params.get("snapshot_delay", 1) - self.random_delay_limit = self.params.get("random_delay_limit", 10) - - def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> dict[str, Any]: - """ - Set 
environment for filter execution - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: dictionary of filter init params - """ - if self.params.get("use_doc_snapshot", False): - self.logger.info("continuing from the document actors snapshot") - data_access = data_access_factory.create_data_access() - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}docs" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.info(f"Found the following snapshot files {files.keys()}") - self.document_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - self.document_collectors[i] = DocCollector.options( - **{"num_cpus": self.params.get("doc_cpu", 0.5)} - ).remote({"id": i, "data_access": data_access_factory, "snapshot": file}) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(self.document_collectors)} document collectors to continue processing") - else: - self.logger.info("starting run from the beginning") - self._create_doc_actors(data_access_factory=data_access_factory, statistics=statistics, files=files) - return { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "cluster_column": self.params.get("cluster_column", ""), - "remote_docs": self.document_collectors, - "random_delay_limit": self.random_delay_limit, - } - - def _create_doc_actors( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: None - """ - mn_min_hash = MurmurMH(num_perm=self.params.get("num_permutations", 64), seed=RANDOM_SEED) - if 
self.params.get("use_bucket_snapshot", False): - self.logger.info("continuing from the bucket actors snapshot") - data_access = data_access_factory.create_data_access() - # recreate bucket collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}buckets" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following bucket snapshot files {files.keys()}") - bucket_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(bucket_collectors)} bucket collectors to continue processing") - # recreate minhash collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}minhash" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following minhash snapshot files {files.keys()}") - minhash_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - self.logger.info(f"Created {len(minhash_collectors)} minhash collectors to continue processing") - else: - self.logger.info("continuing from the very beginning") - self._create_doc_actors_internal( - data_access_factory=data_access_factory, 
statistics=statistics, mn_min_hash=mn_min_hash, files=files - ) - - def _create_doc_actors_internal( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - mn_min_hash: MurmurMH, - files: list[str], - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param mn_min_hash - MurmurMH class - :param files - list of files to process - :return: None - """ - # compute fuzzy dedup parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=self.params.get("threshold", 0.8), - num_perm=self.params.get("num_permutations", 64), - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - self.logger.info(f"Fuzzy: num buckets {num_buckets}, bucket length {length_bucket}") - # Build bucket and minhash collectors - bucket_collectors = [None] * self.params.get("num_bucket_actors", 1) - for i in range(self.params.get("num_bucket_actors", 1)): - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(bucket_collectors)} bucket actors") - minhash_collectors = [None] * self.params.get("num_minhash_actors", 1) - for i in range(self.params.get("num_minhash_actors", 1)): - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(minhash_collectors)} minhash actors") - self._preprocess_tables( - data_access_factory=data_access_factory, - statistics=statistics, - files=files, - mn_min_hash=mn_min_hash, - num_buckets=num_buckets, - length_bucket=length_bucket, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - random_delay_limit=self.random_delay_limit, - ) - # At this point we can snapshot both bucket and minhash collectors for potential restart - 
self.logger.info("creating minhash snapshots") - minhash_replies = [None] * len(minhash_collectors) - index = 0 - for collector in minhash_collectors: - minhash_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while minhash_replies: - ready, not_ready = ray.wait(minhash_replies) - minhash_replies = not_ready - self.logger.info("minhash snapshots created") - self.logger.info("creating bucket snapshots") - bucket_replies = [None] * len(bucket_collectors) - index = 0 - for collector in bucket_collectors: - bucket_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while bucket_replies: - ready, not_ready = ray.wait(bucket_replies) - bucket_replies = not_ready - self.logger.info("bucket snapshots created") - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - - def _process_buckets( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - mn_min_hash: MurmurMH, - ) -> None: - """ - Process buckets - :param data_access_factory - data access factory - :param statistics - statistics actor - :param bucket_collectors - bucket collectors - :param minhash_collectors - minhash collectors - :param mn_min_hash - MMurmurMH class - :return: None - """ - # Create document collectors - self.document_collectors = [None] * self.params.get("num_doc_actors", 1) - for i in range(self.params.get("num_doc_actors", 1)): - self.document_collectors[i] = DocCollector.options(**{"num_cpus": self.params.get("doc_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(self.document_collectors)} document actors") - # create bucket processors - bucket_processors_list = RayUtils.create_actors( - 
clazz=BucketsHashProcessor, - params={ - "remote_docs": self.document_collectors, - "remote_minhashes": minhash_collectors, - "mn_min_hash": mn_min_hash, - "threshold": self.params.get("threshold", 0.8) * self.params.get("num_permutations", 64), - "statistics": statistics, - }, - actor_options=self.params.get("worker_options", None), - n_actors=self.params.get("num_preprocessors", 1), - ) - self.logger.info(f"created {len(bucket_processors_list)} bucket processor actors") - # create bucket processors invoker - bucket_processor_invoker = BucketsHashProcessorInvoker.options( - num_cpus=self.params.get("bucket_cpu", 0.5) - ).remote(bucket_processors=bucket_processors_list) - self.logger.info(f"created bucket processor invoker") - # Add invoker to the buckets - bucket_replies = [ - collector.add_processing_submitter.remote(submitter=bucket_processor_invoker) - for collector in bucket_collectors - ] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - self.logger.info(f"added invoker to bucket collectors") - # start bucket processing and wait for completion - start = time.time() - bucket_replies = [collector.process_buckets.remote() for collector in bucket_collectors] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - # Wait for pool to complete - ray.get(bucket_processor_invoker.wait_for_completion.remote()) - self.logger.info(f"Done processing buckets in {round((time.time() - start) / 60.,3)} min") - # At this point we can save doc actors, in case we would want to restart here - self.logger.info(f"creating document snapshots") - doc_replies = [None] * len(self.document_collectors) - index = 0 - for collector in self.document_collectors: - doc_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while doc_replies: - ready, not_ready = ray.wait(doc_replies) - doc_replies = not_ready - self.logger.info(f"document snapshots created") - # At this point we do not 
need bucket and minhash actors, remove them - # but first get usage information - # Bucket collector - replies = [collector.get_size.remote() for collector in bucket_collectors] - while replies: - ready, not_ready = ray.wait(replies) - b_amount, b_memory = ray.get(ready)[0] - self.sum_buckets += b_amount - self.sum_buckets_mem += b_memory - replies = not_ready - for collector in bucket_collectors: - ray.kill(actor=collector, no_restart=True) - # minhash collector - replies = [collector.get_size.remote() for collector in minhash_collectors] - while replies: - ready, not_ready = ray.wait(replies) - m_amount, m_memory = ray.get(ready)[0] - self.sum_mh += m_amount - self.sum_mh_mem += m_memory - replies = not_ready - for collector in minhash_collectors: - ray.kill(actor=collector, no_restart=True) - # Clean up processors - for processor in bucket_processors_list: - ray.kill(actor=processor, no_restart=True) - ray.kill(bucket_processor_invoker) - - def _preprocess_tables( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - files: list[str], - mn_min_hash: MurmurMH, - num_buckets: int, - length_bucket: int, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - random_delay_limit: int, - ) -> None: - """ - Preprocess tables - build, run and cleanup - :param data_access_factory - data access factory - :param statistics - statistics actor - :param files - list of files to process - :param mn_min_hash - MurmurMH class - :param num_buckets - number of buckets - :param length_bucket - bucket length - :param bucket_collectors - bucket collector actors - :param minhash_collectors - minhash_collector actors - :param random_delay_limit - max for random dalay limit - :return: None - """ - from ray.util.metrics import Gauge - - worker_options = self.params.get("worker_options", None) - # Here we are limiting the number of readers not to overwhelm COS - n_readers = self.params.get("num_preprocessors", 1) - if n_readers > 
1000: - n_readers = 1000 - self.logger.info(f"Table preprocessing uses {n_readers} readers") - # Create preprocessing actors - processor_params = { - "data_access_factory": data_access_factory, - "transform_class": FdedupTransform, - "statistics": statistics, - "transform_params": { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "word_shingle_size": self.params.get("world_shingle_size", 1), - "mn_min_hash": mn_min_hash, - "num_bands": num_buckets, - "length_band": length_bucket, - "remote_buckets": bucket_collectors, - "remote_minhashes": minhash_collectors, - "delimiter": self.params.get("delimiter", " "), - "random_delay_limit": random_delay_limit, - }, - "base_table_stats": False, - } - processors_list = RayUtils.create_actors( - clazz=RayTransformFileProcessor, - params=processor_params, - actor_options=worker_options, - n_actors=n_readers, - ) - self.logger.info(f"created {len(processors_list)} table processor actors") - # Execute preprocessing - # create gauges - files_in_progress_gauge = Gauge( - "preprocessing_files_in_progress", "Number of files in progress, preprocessing" - ) - files_completed_gauge = Gauge( - "preprocessing_files_processed_total", "Number of files completed, preprocessing" - ) - available_cpus_gauge = Gauge("preprocessing_available_cpus", "Number of available CPUs, preprocessing") - available_gpus_gauge = Gauge("preprocessing_available_gpus", "Number of available GPUs, preprocessing") - available_memory_gauge = Gauge("preprocessing_available_memory", "Available memory, preprocessing") - available_object_memory_gauge = Gauge( - "preprocessing_available_object_store", "Available object store, preprocessing" - ) - print_interval = int(len(files) / 100) - if print_interval == 0: - print_interval = 1 - # process data - processors = ActorPool(processors_list) - failures = RayUtils.process_files( - executors=processors, - files=files, - print_interval=print_interval, - 
files_in_progress_gauge=files_in_progress_gauge, - files_completed_gauge=files_completed_gauge, - available_cpus_gauge=available_cpus_gauge, - available_gpus_gauge=available_gpus_gauge, - available_memory_gauge=available_memory_gauge, - object_memory_gauge=available_object_memory_gauge, - logger=self.logger, - ) - if failures > 0: - statistics.add_stats.remote({"actor failures": failures}) - # Clean up processors - for processor in processors_list: - ray.kill(actor=processor, no_restart=True) - del processors - - def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]: - """ - Compute execution statistics - :param stats: output of statistics - :return: job execution statistics - """ - # Get document collector statistics - sum_docs = 0 - sum_docs_mem = 0 - sum_removed = 0 - sum_removed_mem = 0 - replies = [collector.get_size.remote() for collector in self.document_collectors] - while replies: - ready, not_ready = ray.wait(replies) - d_amount, d_memory, r_amount, r_memory = ray.get(ready)[0] - sum_docs += d_amount - sum_docs_mem += d_memory - sum_removed += r_amount - sum_removed_mem += r_memory - replies = not_ready - overall_hash_memory = self.sum_buckets_mem + self.sum_mh_mem + sum_docs_mem + sum_docs_mem + sum_removed_mem - dedup_prst = 100 * (1.0 - stats.get("result_documents", 1) / stats.get("source_documents", 1)) - return { - "number of buckets": self.sum_buckets, - "number of docs": sum_docs, - "number of removed docs": sum_removed, - "number of min hashes": self.sum_mh, - "overall hash memory GB": overall_hash_memory, - "de duplication %": dedup_prst, - } | stats - - -class FdedupTableTransformConfiguration(TransformConfiguration): - """ - Provides support for configuring and using the associated Transform class include - configuration with CLI args and combining of metadata. 
- """ - - def __init__(self): - super().__init__( - name=short_name, - transform_class=FdedupFilter, - ) - from data_processing.utils import get_logger - - self.logger = get_logger(__name__) - - def add_input_params(self, parser: ArgumentParser) -> None: - """ - Add Transform-specific arguments to the given parser. - """ - parser.add_argument(f"--{cli_prefix}doc_column", type=str, default="contents", help="document column name") - parser.add_argument( - f"--{cli_prefix}id_column", type=str, default="int_document_id", help="integer document id column name" - ) - parser.add_argument(f"--{cli_prefix}cluster_column", type=str, default="cluster", help="cluster column name") - parser.add_argument( - f"--{cli_prefix}bucket_cpu", type=float, default=0.5, help="number of CPUs per bucket hash" - ) - parser.add_argument( - f"--{cli_prefix}mhash_cpu", type=float, default=0.5, help="number of CPUs per minhash hash" - ) - parser.add_argument(f"--{cli_prefix}doc_cpu", type=float, default=0.5, help="number of CPUs per doc hash") - parser.add_argument(f"--{cli_prefix}num_doc_actors", type=int, default=1, help="number of doc actors to use") - parser.add_argument( - f"--{cli_prefix}num_minhash_actors", type=int, default=1, help="number of minhash actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_bucket_actors", type=int, default=1, help="number of bucket actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_preprocessors", type=int, default=1, help="number of preprocessors to use" - ) - parser.add_argument(f"--{cli_prefix}num_permutations", type=int, default=64, help="number of permutations") - parser.add_argument(f"--{cli_prefix}threshold", type=float, default=0.8, help="threshold") - parser.add_argument(f"--{cli_prefix}shingles_size", type=int, default=5, help="number of words in shingle") - parser.add_argument( - f"--{cli_prefix}delimiters", type=str, default=" ", help="delimiter for splitting document" - ) - 
parser.add_argument(f"--{cli_prefix}snapshot_delay", type=int, default=1, help="snapshot delay time") - parser.add_argument( - f"--{cli_prefix}use_bucket_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with bucket snapshot", - ) - parser.add_argument( - f"--{cli_prefix}use_doc_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with doc snapshot", - ) - parser.add_argument( - f"--{cli_prefix}random_delay_limit", type=int, default=10, help="maximum delay between read" - ) - - def apply_input_params(self, args: Namespace) -> bool: - """ - Validate and apply the arguments that have been parsed - :param args: user defined arguments. - :return: True, if validate pass or False otherwise - """ - captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) - self.params = self.params | captured - self.params["worker_options"] = args.runtime_worker_options - if self.params["use_bucket_snapshot"] and self.params["use_doc_snapshot"]: - self.logger.warning("both bucket and doc snapshot are specified. Only one allowed") - return False - - self.logger.info(f"fuzzy dedup params are {self.params}") - return True - - -class FdedupRayTransformConfiguration(RayTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=FdedupTableTransformConfiguration(), runtime_class=FdedupRuntime) - - -if __name__ == "__main__": - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py new file mode 100644 index 000000000..64f492584 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -0,0 +1,54 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # execution info + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py new file mode 100644 index 000000000..bc3c0d991 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + logger.info("Launching transform") + launcher.launch() From 0c31dc07a06942b3b6eb73cc29a62f512f4c7a00 Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 11 Oct 2024 12:25:46 -0400 Subject: [PATCH 13/91] Fixed bug in ray to distribute docs to remove file to all workers Signed-off-by: nelson --- .../python/src/data_cleaning_transform.py | 4 +-- .../ray/src/data_cleaning_transform_ray.py | 26 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index f03b6c1d0..05b18cc8b 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -110,10 +110,10 @@ class DataCleaningTransformConfiguration(TransformConfiguration): configuration with CLI args. 
""" - def __init__(self): + def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningTransform): super().__init__( name=short_name, - transform_class=DataCleaningTransform, + transform_class=transform_class, ) self.logger = get_logger(__name__, level="INFO") diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index 9fdb220f7..831a6c9c2 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -16,9 +16,8 @@ from data_cleaning_transform import ( DataCleaningTransform, DataCleaningTransformConfiguration, - docs2remove_list, - docs2remove_list_key, - get_docs_to_remove, + duplicate_list_location_default, + duplicate_list_location_key, ) from data_processing.data_access import DataAccessFactoryBase from data_processing.utils import CLIArgumentProvider, get_logger @@ -45,16 +44,15 @@ def __init__(self, config: dict): by the companion runtime, LangSelectorTransformRuntime. If running inside the RayMutatingDriver, these will be provided by that class with help from the RayMutatingDriver. """ - docs2remove = config.get(docs2remove_list_key, None) - if docs2remove is not None: + docs2removedf = config.get("df", None) + if docs2removedf is not None: # This is recommended for production approach. 
In this case domain list is build by the # runtime once, loaded to the object store and can be accessed by actors without additional reads try: - - config[docs2remove_list_key] = ray.get(config.get(docs2remove_list_key)) + config["df"] = ray.get(config.get("df")) except Exception as e: - self.logger.warning(f"Exception loading languages list from ray object storage {e}") - raise RuntimeError(f"exception loading from object storage for key {docs2remove}") + self.logger.warning(f"Exception loading docs2remove list from ray object storage {e}") + raise RuntimeError(f"exception loading from object storage for key {docs2removedf}") super().__init__(config) @@ -90,9 +88,13 @@ def get_transform_config( :param files - list of files to remove :return: dictionary of filter init params """ - docs_to_remove = get_docs_to_remove(self.params) - docs_to_remove_list = ray.put(docs_to_remove) - return {docs2remove_list_key: docs_to_remove_list} | self.params + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + data_access = data_access_factory.create_data_access() + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + duplicate_list, retries = data_access.get_file(duplicate_list_location) + docs_to_remove_list = ray.put(duplicate_list) + return {"df": docs_to_remove_list} | self.params class DataCleaningRayTransformConfiguration(RayTransformRuntimeConfiguration): From 6ee6695c1ef5d494935c42207dce0d5e0ccd151f Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 19:05:39 +0100 Subject: [PATCH 14/91] added folder_transform --- .../pure_python/transform_file_processor.py | 15 ++++-- .../pure_python/transform_orchestrator.py | 42 ++++++++++------ .../runtime/transform_file_processor.py | 41 ++++++++------- .../src/data_processing/transform/__init__.py | 2 + .../transform/abstract_transform.py | 16 ++++++ .../transform/binary_transform.py | 5 +- 
.../transform/folder_transform.py | 50 +++++++++++++++++++ .../runtime/ray/transform_file_processor.py | 1 + .../runtime/ray/transform_orchestrator.py | 19 ++++--- .../runtime/spark/transform_file_processor.py | 5 +- .../runtime/spark/transform_orchestrator.py | 25 +++++++--- 11 files changed, 168 insertions(+), 53 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/transform/abstract_transform.py create mode 100644 data-processing-lib/python/src/data_processing/transform/folder_transform.py diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index 143835dd0..fa3e69e4a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -14,7 +14,7 @@ from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime import AbstractTransformFileProcessor -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractTransform, TransformStatistics from data_processing.utils import UnrecoverableException @@ -28,7 +28,8 @@ def __init__( data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ): """ Init method @@ -36,11 +37,13 @@ def __init__( :param statistics - reference to statistics class :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag """ # invoke superclass super().__init__( data_access_factory=data_access_factory, transform_parameters=dict(transform_params), + is_folder=is_folder, ) 
self.transform_params["statistics"] = statistics # Create local processor @@ -52,7 +55,8 @@ def __init__( # Create statistics self.stats = statistics - def _publish_stats(self, stats: dict[str, Any]) -> None: + +def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) @@ -65,17 +69,20 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool ): """ Init method :param data_access_factory - data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder tranform flag """ super().__init__( data_access_factory=data_access_factory, transform_parameters=dict(transform_params), + is_folder=is_folder, ) # Add data access and statistics to the processor parameters self.transform_params["data_access"] = self.data_access diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 8692da29e..153eaaf0a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,7 +24,7 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger @@ -48,8 +48,6 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } - - def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -74,15 +72,21 @@ def orchestrate( return 1 # create 
additional execution parameters runtime = runtime_config.create_transform_runtime() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - if retries > 0: - statistics.add_stats({"data access retries": retries}) - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + if retries > 0: + statistics.add_stats({"data access retries": retries}) + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -99,6 +103,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) else: # using sequential execution @@ -111,6 +116,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) status = "success" return_code = 0 @@ -157,7 +163,8 @@ def _process_transforms( data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ) -> None: """ Process transforms sequentially @@ -167,9 +174,8 @@ def _process_transforms( :param data_access_factory: data access factory :param transform_params - transform parameters :param 
transform_class: transform class + :param is_folder: folder transform flag :return: metadata for the execution - - :return: None """ # create executor executor = PythonTransformFileProcessor( @@ -177,6 +183,7 @@ def _process_transforms( statistics=statistics, transform_params=transform_params, transform_class=transform_class, + is_folder=is_folder, ) # process data t_start = time.time() @@ -203,6 +210,7 @@ def _process_transforms_multiprocessor( data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractBinaryTransform], + is_folder: bool ) -> TransformStatistics: """ Process transforms using multiprocessing pool @@ -212,13 +220,17 @@ def _process_transforms_multiprocessor( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform class :return: metadata for the execution """ # result statistics statistics = TransformStatistics() # create processor processor = PythonPoolTransformFileProcessor( - data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class + data_access_factory=data_access_factory, + transform_params=transform_params, + transform_class=transform_class, + is_folder=is_folder, ) completed = 0 t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index d4ec548d8..1d268875f 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -26,11 +26,13 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_parameters: dict[str, Any], + is_folder: bool = False, ): """ Init method :param data_access_factory: Data Access Factory :param transform_parameters: 
Transform parameters + :param is_folder: folder transform flag """ self.logger = get_logger(__name__) # validate parameters @@ -46,6 +48,7 @@ def __init__( # Add data access and statistics to the processor parameters self.transform_params = transform_parameters self.transform_params["data_access"] = self.data_access + self.is_folder = is_folder def process_file(self, f_name: str) -> None: """ @@ -58,25 +61,29 @@ def process_file(self, f_name: str) -> None: self.logger.warning("No data_access found. Returning.") return t_start = time.time() - # Read source file - filedata, retries = self.data_access.get_file(path=f_name) - if retries > 0: - self._publish_stats({"data access retries": retries}) - if filedata is None: - self.logger.warning(f"File read resulted in None for {f_name}. Returning.") - self._publish_stats({"failed_reads": 1}) - return - self._publish_stats({"source_files": 1, "source_size": len(filedata)}) + if not self.is_folder: + # Read source file only if we are processing file + filedata, retries = self.data_access.get_file(path=f_name) + if retries > 0: + self._publish_stats({"data access retries": retries}) + if filedata is None: + self.logger.warning(f"File read resulted in None for {f_name}. 
Returning.") + self._publish_stats({"failed_reads": 1}) + return + self._publish_stats({"source_files": 1, "source_size": len(filedata)}) # Process input file try: - # execute local processing - name_extension = TransformUtils.get_file_extension(f_name) self.logger.debug(f"Begin transforming file {f_name}") - out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + if not self.is_folder: + # execute local processing + out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + name_extension = TransformUtils.get_file_extension(f_name) + self.last_file_name = name_extension[0] + self.last_file_name_next_index = None + self.last_extension = name_extension[1] + else: + out_files, stats = self.transform.transform(folder_name=f_name) self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") - self.last_file_name = name_extension[0] - self.last_file_name_next_index = None - self.last_extension = name_extension[1] # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) # Process unrecoverable exceptions @@ -95,10 +102,10 @@ def flush(self) -> None: the hook for them to return back locally stored data and their statistics. :return: None """ - if self.last_file_name is None: + if self.last_file_name is None or self.is_folder: # for some reason a given worker never processed anything. 
Happens in testing # when the amount of workers is greater than the amount of files - self.logger.debug("skipping flush, no name for file is defined") + self.logger.debug("skipping flush, no name for file is defined or this is a folder transform") return try: t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 6af43ad60..20254e47b 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -1,3 +1,5 @@ +from data_processing.transform.abstract_transform import AbstractTransform +from data_processing.transform.folder_transform import AbstractFolderTransform from data_processing.transform.binary_transform import AbstractBinaryTransform from data_processing.transform.table_transform import AbstractTableTransform from data_processing.transform.transform_statistics import TransformStatistics diff --git a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py new file mode 100644 index 000000000..89db70f42 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py @@ -0,0 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +class AbstractTransform: + """ + Base class for all transform types + """ \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/binary_transform.py b/data-processing-lib/python/src/data_processing/transform/binary_transform.py index 80dff61ea..b313aff2f 100644 --- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py @@ -10,10 +10,11 @@ # limitations under the License. ################################################################################ -from typing import Any, TypeVar +from typing import Any +from data_processing.transform import AbstractTransform -class AbstractBinaryTransform: +class AbstractBinaryTransform(AbstractTransform): """ Converts input binary file to output file(s) (binary) Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py new file mode 100644 index 000000000..866e3286f --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from typing import Any +from data_processing.data_access import data_access +from data_processing.transform import AbstractTransform + + +class AbstractFolderTransform(AbstractTransform): + """ + Converts input folder to output file(s) (binary) + Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or + more new binary files and metadata. + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This simply stores the given instance in this instance for later use. + """ + self.config = config + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + """ + Converts input folder into o or more output files. + If there is an error, an exception must be raised - exit()ing is not generally allowed. + :param folder_name: the name of the folder containing arbitrary amount of files. + :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated + to metadata. Each element of the return list, is a tuple of the transformed bytes and a string + holding the extension to be used when writing out the new bytes. + """ + raise NotImplemented() + + @staticmethod + def get_folders(data_access:data_access) -> list(str): + """ + Compute the list of folders to use. 
+ :param data_access - data access class + :return: + """ + raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py index e1fabb144..cdad1309f 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py @@ -35,6 +35,7 @@ def __init__(self, params: dict[str, Any]): super().__init__( data_access_factory=params.get("data_access_factory", None), transform_parameters=dict(params.get("transform_params", {})), + is_folder=params.get("is_folder", False) ) # Create statistics self.stats = params.get("statistics", None) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 42eba47a6..8276eb56c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -16,6 +16,7 @@ import ray from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import AbstractFolderTransform from data_processing_ray.runtime.ray import ( RayTransformExecutionConfiguration, RayTransformFileProcessor, @@ -56,13 +57,18 @@ def orchestrate( # create transformer runtime runtime = runtime_config.create_transform_runtime() resources = RayUtils.get_cluster_resources() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: 
+ # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -84,6 +90,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), "statistics": statistics, + "is_folder": is_folder, } logger.debug("Creating actors") processors = RayUtils.create_actors( diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index d63664ac4..a0968ab1d 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -29,12 +29,15 @@ def __init__( data_access_factory: DataAccessFactoryBase, runtime_configuration: SparkTransformRuntimeConfiguration, statistics: TransformStatistics, + is_folder: bool, ): """ Init method """ super().__init__( - data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params() + data_access_factory=data_access_factory, + transform_parameters=runtime_configuration.get_transform_params(), + is_folder=is_folder, ) # Add data access ant statistics to the processor parameters self.runtime_configuration = runtime_configuration diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c279f2b73..c534b685f 100644 --- 
a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -18,7 +18,7 @@ import yaml from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics +from data_processing.transform import TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger from data_processing_spark.runtime.spark import ( SparkTransformExecutionConfiguration, @@ -117,7 +117,10 @@ def process_partition(iterator): runtime = runtime_conf.create_transform_runtime() # create file processor file_processor = SparkTransformFileProcessor( - data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics + data_access_factory=d_access_factory, + runtime_configuration=runtime_conf, + statistics=statistics, + is_folder=is_folder, ) first = True for f in iterator: @@ -144,13 +147,19 @@ def process_partition(iterator): return list(statistics.get_execution_stats().items()) num_partitions = 0 + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # process data logger.debug("Begin processing files") # process files split by partitions 
From e7260ba32d4d3dc1ab7a4e8d23fa302efdc8b18e Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 19:13:01 +0100 Subject: [PATCH 15/91] added folder_transform --- .../runtime/pure_python/transform_orchestrator.py | 2 +- .../python/src/data_processing/transform/folder_transform.py | 4 ++-- .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +- .../runtime/spark/transform_orchestrator.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 153eaaf0a..d51f80a8a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. 
- :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c534b685f..4a0897952 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From 5856f3f54137ae225b8cbdf07add9eaf20ed38b2 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 16/91] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- 
.../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def _publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 
+76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, 
data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. 
- :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From 6519686320fb2e76d03d9079b2b59b24be42b6cd Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 
Oct 2024 08:48:00 +0100 Subject: [PATCH 17/91] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 4a0897952..096fab272 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,8 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index 7b968b1e9..7410d09d1 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, 
partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From c728224a5e3396ebe5d71dddb1b23a7a4b64ae7c Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 15:35:00 +0100 Subject: [PATCH 18/91] added noop testing --- .../runtime/transform_file_processor.py | 44 +++++--- .../test_support/transform/__init__.py | 13 ++- .../transform/noop_folder_transform.py | 105 ++++++++++++++++++ .../test_support/transform/noop_transform.py | 6 +- .../transform/folder_transform.py | 2 +- .../transform/transform_configuration.py | 6 +- .../transform/test_folders_noop.py | 33 ++++++ .../launch/ray/ray_test_noop_launch.py | 6 - .../ededup/ray/src/ededup_transform_ray.py | 9 +- 9 files changed, 187 insertions(+), 37 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index 1d268875f..4075f40be 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -83,6 +83,7 @@ def process_file(self, f_name: str) -> None: self.last_extension = name_extension[1] else: out_files, stats = self.transform.transform(folder_name=f_name) + self.last_file_name = f_name self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) @@ -148,15 +149,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats ) case 1: # we have exactly 1 output file - file_ext = out_files[0] - lfn = self.last_file_name - if 
self.last_file_name_next_index is not None: - lfn = f"{lfn}_{self.last_file_name_next_index}" - output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}") + if self.is_folder: + # its folder + output_name = out_files[0][1] + dt = out_files[0][0] + else: + file_ext = out_files[0] + lfn = self.last_file_name + if self.last_file_name_next_index is not None: + lfn = f"{lfn}_{self.last_file_name_next_index}" + output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}") + dt = file_ext[0] self.logger.debug( f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}" ) - save_res, retries = self.data_access.save_file(path=output_name, data=file_ext[0]) + save_res, retries = self.data_access.save_file(path=output_name, data=dt) if retries > 0: self._publish_stats({"data access retries": retries}) if save_res is None: @@ -166,7 +173,7 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats self._publish_stats( { "result_files": 1, - "result_size": len(file_ext[0]), + "result_size": len(dt), "processing_time": time.time() - t_start, } ) @@ -183,14 +190,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats start_index = 0 count = len(out_files) for index in range(count): - file_ext = out_files[index] - output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}" - file_sizes += len(file_ext[0]) - self.logger.debug( - f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} " - f"of {count} to {output_name_indexed}" - ) - save_res, retries = self.data_access.save_file(path=output_name_indexed, data=file_ext[0]) + if self.is_folder: + # its a folder + output_name_indexed = out_files[index][1] + dt = out_files[index][0] + else: + # files + file_ext = out_files[index] + output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}" + self.logger.debug( + f"Writing transformed file 
{self.last_file_name}{self.last_extension}, {index + 1} " + f"of {count} to {output_name_indexed}" + ) + dt = file_ext[0] + file_sizes += len(dt) + save_res, retries = self.data_access.save_file(path=output_name_indexed, data=dt) if retries > 0: self._publish_stats({"data access retries": retries}) if save_res is None: diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py index 0e90f7ffd..04d6f3b0f 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py @@ -1,6 +1,11 @@ -from .table_transform_test import AbstractTableTransformTest -from .binary_transform_test import AbstractBinaryTransformTest -from .noop_transform import ( +from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest +from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest +from data_processing.test_support.transform.noop_transform import ( NOOPTransform, - NOOPPythonTransformConfiguration, + NOOPTransformConfiguration, + NOOPPythonTransformConfiguration ) +from data_processing.test_support.transform.noop_folder_transform import ( + NOOPFolderTransform, + NOOPFolderPythonTransformConfiguration +) \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py new file mode 100644 index 000000000..5baab7858 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py @@ -0,0 +1,105 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import time +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, + DefaultPythonTransformRuntime) +from data_processing.transform import AbstractFolderTransform +from data_processing.utils import get_logger +from data_processing.test_support.transform import NOOPTransformConfiguration + + +logger = get_logger(__name__) + + +class NOOPFolderTransform(AbstractFolderTransform): + """ + Implements a simple copy of a pyarrow Table. + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, NOOPTransformRuntime. If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. + """ + # Make sure that the param name corresponds to the name used in apply_input_params method + # of NOOPTransformConfiguration class + super().__init__(config) + self.sleep = config.get("sleep_sec", 1) + self.data_access = config.get("data_access") + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + """ + Converts input folder into o or more output files. + If there is an error, an exception must be raised - exit()ing is not generally allowed. 
+ :param folder_name: the name of the folder containing arbitrary amount of files. + :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated + to metadata. Each element of the return list, is a tuple of the transformed bytes and a string + holding the file name to use. + """ + logger.debug(f"Transforming one folder {folder_name}") + metadata = {} + # get folder files + files, retries = self.data_access.get_folder_files(path=folder_name) + if retries > 0: + metadata |= {"data access retries": retries} + result = [()] * len(files) + index = 0 + for name, file in files.items(): + result[index] = (file, self.data_access.get_output_location(name)) + if self.sleep is not None: + logger.info(f"Sleep for {self.sleep} seconds") + time.sleep(self.sleep) + logger.info("Sleep completed - continue") + index += 1 + # Add some sample metadata. + metadata |= {"nfiles": len(files)} + return result, metadata + + +class NOOPFolderPythonRuntime(DefaultPythonTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py index 0dee013a4..2fea35506 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python.runtime_configuration import ( PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform from data_processing.utils import CLIArgumentProvider, get_logger @@ -75,10 +75,10 @@ class NOOPTransformConfiguration(TransformConfiguration): configuration with CLI args. 
""" - def __init__(self): + def __init__(self, clazz: type[AbstractTransform] = NOOPTransform): super().__init__( name=short_name, - transform_class=NOOPTransform, + transform_class=clazz, remove_from_metadata=[pwd_key], ) diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 9a2fb3713..caa3bfa52 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -35,6 +35,6 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str :param folder_name: the name of the folder containing arbitrary amount of files. :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated to metadata. Each element of the return list, is a tuple of the transformed bytes and a string - holding the extension to be used when writing out the new bytes. + holding the file name to use. 
""" raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py index 033e92f2a..a5c9ec9ad 100644 --- a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py @@ -13,7 +13,7 @@ from argparse import ArgumentParser from typing import Any -from data_processing.transform import AbstractBinaryTransform +from data_processing.transform import AbstractTransform from data_processing.utils import CLIArgumentProvider @@ -23,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider): """ def __init__( - self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = [] + self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = [] ): """ Initialization @@ -36,7 +36,7 @@ def __init__( self.remove_from_metadata = remove_from_metadata self.params = {} - def get_transform_class(self) -> type[AbstractBinaryTransform]: + def get_transform_class(self) -> type[AbstractTransform]: """ Get the class extending AbstractBinaryTransform which implements a specific transformation. The class will generally be instantiated with a dictionary of configuration produced by diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py new file mode 100644 index 000000000..e0fdd86c8 --- /dev/null +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.transform import NOOPFolderPythonTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../test-data/data_processing/python/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")] + return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py index d4cc874f0..e706a4dfa 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py @@ -12,7 +12,6 @@ import os -import pyarrow as pa from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -20,11 +19,6 @@ from data_processing_ray.test_support.transform import NOOPRayTransformConfiguration -table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) -expected_table = table # We're a noop after all. -expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result - - class TestRayNOOPTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. 
diff --git a/transforms/universal/ededup/ray/src/ededup_transform_ray.py b/transforms/universal/ededup/ray/src/ededup_transform_ray.py index c0823a22e..d90dfa780 100644 --- a/transforms/universal/ededup/ray/src/ededup_transform_ray.py +++ b/transforms/universal/ededup/ray/src/ededup_transform_ray.py @@ -149,13 +149,12 @@ def _load_snapshots(self, data_access_factory: DataAccessFactoryBase, statistics statistics.add_stats.remote({"data access retries": retries}) self.logger.info(f"Found the following snapshot files {files.keys()}") # process snapshot files - for file in files.keys(): - # load the file + for file in files.values(): + # convert the file try: - b_hashes, _ = data_access.get_file(file) - snaps = pickle.loads(b_hashes) + snaps = pickle.loads(file) except Exception as e: - self.logger.warning(f"Failed to load hashes from file {file} with exception {e}") + self.logger.warning(f"Failed to load hashes with exception {e}") raise UnrecoverableException("failed to load hashes") request = [[] for _ in range(len(self.filters))] for h in snaps: From 6e2863a319716c513aa5f1bafa00a363089d2685 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 08:53:49 +0100 Subject: [PATCH 19/91] added noop Ray testing --- .../runtime/ray/transform_orchestrator.py | 6 +- .../test_support/transform/__init__.py | 1 + .../transform/noop_folder_transform.py | 57 +++++++++++++++++++ .../test_support/transform/noop_transform.py | 4 +- .../launch/ray/ray_test_noop_folder_launch.py | 33 +++++++++++ 5 files changed, 95 insertions(+), 6 deletions(-) create mode 100644 data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 
b29682997..da39cbcf7 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -68,6 +68,9 @@ def orchestrate( if len(files) == 0: logger.error("No input files to process - exiting") return 0 + # log retries + if retries > 0: + statistics.add_stats.remote({"data access retries": retries}) logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) @@ -79,9 +82,6 @@ def orchestrate( logger.info( f"Number of workers - {preprocessing_params.n_workers} " f"with {preprocessing_params.worker_options} each" ) - # log retries - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) # create executors processor_params = { "data_access_factory": data_access_factory, diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py index a6cd700f7..dd095c961 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py @@ -1 +1,2 @@ from data_processing_ray.test_support.transform.noop_transform import NOOPRayTransformConfiguration +from data_processing_ray.test_support.transform.noop_folder_transform import NOOPFolderRayTransformConfiguration diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py new file mode 100644 index 000000000..9919600c4 --- /dev/null +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py @@ -0,0 +1,57 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + + +from data_processing.test_support.transform import NOOPTransformConfiguration +from data_processing.test_support.transform import NOOPFolderTransform +from data_processing.utils import get_logger +from data_processing_ray.runtime.ray import ( + RayTransformLauncher, + RayTransformRuntimeConfiguration, + DefaultRayTransformRuntime +) +from data_processing.data_access import DataAccess + + +logger = get_logger(__name__) + + +class NOOPFolderPythonRuntime(DefaultRayTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(NOOPFolderRayTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py index 67cf20253..a2082c48c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py @@ -11,9 +11,7 @@ ################################################################################ -from data_processing.test_support.transform.noop_transform import ( - NOOPTransformConfiguration, -) +from data_processing.test_support.transform import NOOPTransformConfiguration from data_processing.utils import get_logger from data_processing_ray.runtime.ray import ( RayTransformLauncher, diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py new file mode 100644 index 000000000..cd61c6745 --- /dev/null +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher +from data_processing_ray.test_support.transform import NOOPFolderRayTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../../test-data/data_processing/ray/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = RayTransformLauncher(NOOPFolderRayTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0, "run_locally": True}, basedir + "/input", basedir + "/expected")] + return fixtures From 3c9be57d656eee4fbda6b1d41849894249e167d8 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 09:07:48 +0100 Subject: [PATCH 20/91] added noop Spark testing --- .../transform/noop_folder_transform.py | 7 ++- .../test_support/transform/__init__.py | 1 + .../transform/noop_folder_transform.py | 53 +++++++++++++++++++ .../launch/spark/test_noop_folder_launch.py | 34 ++++++++++++ 4 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py index 9919600c4..1d084b58a 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py @@ -11,8 +11,7 @@ ################################################################################ -from data_processing.test_support.transform import NOOPTransformConfiguration -from data_processing.test_support.transform import NOOPFolderTransform +from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration from data_processing.utils import get_logger from data_processing_ray.runtime.ray import ( RayTransformLauncher, 
@@ -25,7 +24,7 @@ logger = get_logger(__name__) -class NOOPFolderPythonRuntime(DefaultRayTransformRuntime): +class NOOPFolderRayRuntime(DefaultRayTransformRuntime): def get_folders(self, data_access: DataAccess) -> list[str]: """ Get folders to process @@ -47,7 +46,7 @@ def __init__(self): Initialization """ super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), - runtime_class=NOOPFolderPythonRuntime) + runtime_class=NOOPFolderRayRuntime) if __name__ == "__main__": diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py index 83516f9ae..041cb43d6 100644 --- a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py +++ b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py @@ -11,3 +11,4 @@ ################################################################################ from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration +from data_processing_spark.test_support.transform.noop_folder_transform import NOOPFolderSparkTransformConfiguration diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py new file mode 100644 index 000000000..9972e0f79 --- /dev/null +++ b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py @@ -0,0 +1,53 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import SparkTransformLauncher +from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime +from data_processing.data_access import DataAccess + + +logger = get_logger(__name__) + + +class NOOPFolderSparkRuntime(DefaultSparkTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderSparkRuntime) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration()) + logger.info("Launching noop transform") + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py b/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py new file mode 100644 index 000000000..c8e3ce40b --- /dev/null +++ b/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py @@ -0,0 +1,34 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher +from data_processing_spark.test_support.transform import NOOPFolderSparkTransformConfiguration + + +class TestSparkNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. 
+ The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../../test-data" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + fixtures = [] + launcher = SparkTransformLauncher(NOOPFolderSparkTransformConfiguration()) + fixtures.append((launcher, {"noop_sleep_sec": 1}, basedir + "/input", basedir + "/expected")) + return fixtures From 371a7124c1570270fd692249dd2e601c4b3476c8 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 10:03:21 +0100 Subject: [PATCH 21/91] more data access simplifications --- .../src/data_processing/data_access/data_access.py | 5 ++++- .../data_processing/data_access/data_access_local.py | 11 ----------- .../src/data_processing/data_access/data_access_s3.py | 11 ----------- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access.py b/data-processing-lib/python/src/data_processing/data_access/data_access.py index bba5afd2b..51d7b54b8 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access.py @@ -358,7 +358,10 @@ def get_output_location(self, path: str) -> str: :param path: input file location :return: output file location """ - raise NotImplementedError("Subclasses should implement this!") + if self.get_output_folder() is None: + self.logger.error("Get out put location. 
S3 configuration is not provided, returning None") + return None + return path.replace(self.get_input_folder(), self.get_output_folder()) def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]: """ diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_local.py b/data-processing-lib/python/src/data_processing/data_access/data_access_local.py index 224e30ce8..d37e571a3 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_local.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_local.py @@ -130,17 +130,6 @@ def get_table(self, path: str) -> tuple[pa.table, int]: logger.error(f"Error reading table from {path}: {e}") return None, 0 - def get_output_location(self, path: str) -> str: - """ - Get output location based on input - :param path: input file location - :return: output file location - """ - if self.output_folder is None: - logger.error("Get output location. local configuration is not defined, returning None") - return None - return path.replace(self.input_folder, self.output_folder) - def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]: """ Saves a pyarrow table to a file and returns information about the operation. 
diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py b/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py index 43e13bcb1..8ddc772c5 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py @@ -126,17 +126,6 @@ def get_table(self, path: str) -> tuple[pyarrow.table, int]: self.logger.error(f"Exception reading table {path} from S3 - {e}") return None, 0 - def get_output_location(self, path: str) -> str: - """ - Get output location based on input - :param path: input file location - :return: output file location - """ - if self.output_folder is None: - self.logger.error("Get out put location. S3 configuration is not provided, returning None") - return None - return path.replace(self.input_folder, self.output_folder) - def save_table(self, path: str, table: pyarrow.Table) -> tuple[int, dict[str, Any], int]: """ Save table to a given location From 680f3138d1e183a814f6c9230ab1eee33ad759c0 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:40:59 -0400 Subject: [PATCH 22/91] Renamed/refactored fuzzy dedup python orchestrator Signed-off-by: Constantin M Adam --- .../fdedup/python/src/fuzzy_dedup_python.py | 180 ++++++++++++ .../fdedup/python/src/service_orchestrator.py | 265 ------------------ 2 files changed, 180 insertions(+), 265 deletions(-) create mode 100644 transforms/universal/fdedup/python/src/fuzzy_dedup_python.py delete mode 100644 transforms/universal/fdedup/python/src/service_orchestrator.py diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py new file mode 100644 index 000000000..ca64f336f --- /dev/null +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -0,0 +1,180 @@ +import argparse +import os +import sys + +import cluster_analysis_transform +import 
data_cleaning_transform +import get_duplicate_list_transform +import signature_calc_transform +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils, get_logger +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +SERVICE_DICT = { + "SignatureCalculation": "minhash", + "ClusterAnalysis": "cluster", + "GetDuplicateList": "fdlist", + "DataCleaning": "fdclean", +} + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +ARGS_MAP = { + "minhash": [ + signature_calc_transform.contents_column_key, + signature_calc_transform.document_id_column_key, + signature_calc_transform.seed_key, + signature_calc_transform.num_permutations_key, + signature_calc_transform.num_bands_key, + signature_calc_transform.num_minhashes_per_band_key, + signature_calc_transform.jaccard_similarity_threshold_key, + signature_calc_transform.word_shingle_size_key, + signature_calc_transform.num_segments_key, + ], + "cluster": [ + cluster_analysis_transform.jaccard_similarity_threshold_key, + cluster_analysis_transform.num_bands_key, + cluster_analysis_transform.num_segments_key, + ], + "fdlist": [ + get_duplicate_list_transform.subfolder_key, + get_duplicate_list_transform.consolidated_filename_key, + ], + "fdclean": [ + data_cleaning_transform.document_id_column_key, + data_cleaning_transform.duplicate_list_location_key, + ], +} + + +class ServiceOrchestrator: + def __init__(self, global_params: argparse.Namespace = None): + self.global_params = global_params + self.logger = get_logger(__name__) + 
+ def execute_service(self, service_logic, service_params): + # Call the generic service logic + service_logic(service_params) + + def orchestrate(self): + service_list = self.global_params.services.split(",") + for service in service_list: + self.logger.info(f"Starting {service} step") + if service not in SERVICE_DICT: + err_msg = f"Unknown service {service} specified. Must be one of {SERVICE_DICT.keys()}" + self.logger.error(err_msg) + raise ValueError(err_msg) + service_short_name = SERVICE_DICT[service] + service_params = self.get_arguments(args, service_short_name) + self.logger.info(f"Got parameters for {service}") + status = self.execute_service(service_short_name, service_params) + if status == 0: + self.logger.info(f"{service} completed successfully") + else: + self.logger.error(f"{service} failed with status {status}, aborting ...") + break + + def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: + sys_argv = ["python"] + in_args_dict = vars(in_args) + all_module_arguments = ARGS_MAP.get(service_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + for k, v in passed_args.items(): + sys_argv.append(f"--{service_name}_{k}") + sys_argv.append(str(v)) + if service_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif service_name == "fdlist": + input_folder = in_args_dict["output_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "fdclean": + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + else: + self.logger.error(f"Unknown service name: {service_name}") + data_io = { + "input_folder": input_folder, + "output_folder": 
output_folder, + } + if in_args.use_s3: + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + sys_argv.append("--data_s3_config") + else: + sys_argv.append("--data_local_config") + sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + return sys_argv + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + elif service_short_name == "cluster": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + status = launcher.launch() + return status + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Service Orchestrator") + + # Define command line arguments + parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") + parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") + + parser.add_argument( + "--contents_column", type=str, default="text", help="Name of the column that holds document text" + ) + parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") + parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") + parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") + parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") + + # Single argument for service execution + parser.add_argument( + "--services", + type=str, + required=False, + 
default="SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning", + help="Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning)", + ) + + parser.add_argument( + "--use_s3", + action="store_true", + help="use s3", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = ServiceOrchestrator(global_params=args) + # Launch python fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/python/src/service_orchestrator.py b/transforms/universal/fdedup/python/src/service_orchestrator.py deleted file mode 100644 index 897a3210c..000000000 --- a/transforms/universal/fdedup/python/src/service_orchestrator.py +++ /dev/null @@ -1,265 +0,0 @@ -import argparse -import os -import sys - -from cluster_analysis_transform_python import ( - ClusterAnalysisPythonTransformConfiguration, -) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils -from file_copy_util import FileCopyUtil -from signature_calc_transform_python import ( - SignatureCalculationPythonTransformConfiguration, -) - - -class ServiceOrchestrator: - def __init__(self, global_params=None): - self.global_params = global_params or {} - - def execute_service(self, service_logic, service_params): - # Call the generic service logic - service_logic(service_params) - - def orchestrate(self, service_logic): - service_list = self.global_params["services"].split(",") - - for service in service_list: - if service == "SignatureCalculation": - params = create_transform_args_payload(args, service) - params["service_type"] = "SignatureCalculation" - self.execute_service(service_logic, 
params) - elif service == "ClusterAnalysis": - params = create_transform_args_payload(args, service) - params["service_type"] = "ClusterAnalysis" - self.execute_service(service_logic, params) - elif service == "DataCleaning": - params = create_transform_args_payload(args, service) - params["service_type"] = "DataCleaning" - self.execute_service(service_logic, params) - elif service == "BandsFileCopy": - params = args - params["service_type"] = "BandsFileCopy" - self.execute_service(service_logic, params) - elif service == "DocsToRemoveFileCopy": - params = args - params["service_type"] = "DocsToRemoveFileCopy" - self.execute_service(service_logic, params) - else: - print(f"Warning: {service} is not a recognized service.") - - -def generic_service_logic(params): - print("Service executed with parameters:", params) - service_type = params["service_type"] - use_s3 = params["use_s3"] - # Remove the 'service_type' key - params.pop("service_type", None) # Using pop() method - - if service_type == "SignatureCalculation" or service_type == "ClusterAnalysis" or service_type == "DataCleaning": - # Set the simulated command line args - params.pop("num_permutations", None) # Using pop() method - params.pop("num_bands", None) # Using pop() method - params.pop("num_segments", None) # Using pop() method - params.pop("use_s3", None) # Using pop() method - # Set the simulated command line args - sys.argv = ParamsUtils.dict_to_req(d=params) - if use_s3: - sys.argv.append("--data_s3_cred") - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) - - if service_type == "SignatureCalculation": - runtime_config = SignatureCalculationPythonTransformConfiguration() - launch_transform_service(runtime_config) - elif service_type == "ClusterAnalysis": - runtime_config = ClusterAnalysisPythonTransformConfiguration() - launch_transform_service(runtime_config) - elif service_type == "DataCleaning": - runtime_config = DataCleaningPythonTransformConfiguration() - 
launch_transform_service(runtime_config) - elif service_type == "BandsFileCopy": - launch_file_copy_service(params, service_type) - elif service_type == "DocsToRemoveFileCopy": - launch_file_copy_service(params, service_type) - - -def launch_transform_service(params): - # create launcher - launcher = PythonTransformLauncher(runtime_config=params) - # Launch the ray actor(s) to process the input - launcher.launch() - - -def launch_file_copy_service(args, service_type): - root_folder = os.path.join(args["root_folder"], args["output_folder"]) - data_type = None - if service_type == "BandsFileCopy": - data_type = "bands" - # Get files to process - files = [ - f"band={band}/segment={segment}" - for band in range(args["num_bands"]) - for segment in range(args["num_segments"]) - ] - elif service_type == "DocsToRemoveFileCopy": - files = ["docs_to_remove"] - data_type = "docs_to_remove" - config = {"root_folder": root_folder} - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - - if args["use_s3"]: - - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - - # Construct folders - local_config = { - "input_folder": root_folder, - "output_folder": os.path.abspath(os.path.join(args["root_folder"], args["output_folder"])), - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - stats = {} - fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) - for file in files: - fcu.copy_data(file, data_type) - - -def 
create_transform_args_payload(args, service): - print(args) - # Construct folders - input_folder = os.path.join(args["root_folder"], args["input_folder"]) - output_folder = os.path.join(args["root_folder"], args["output_folder"]) - if service == "ClusterAnalysis": - input_folder = os.path.join(args["root_folder"], args["output_folder"], "bands_consolidated") - output_folder = os.path.join(args["root_folder"], args["output_folder"], "docs_to_remove") - elif service == "DataCleaning": - output_folder = os.path.join(args["root_folder"], args["output_folder"], "cleaned") - duplicate_location = os.path.join( - args["root_folder"], - args["output_folder"], - "docs_to_remove_consolidated", - "docs_to_remove_consolidated.parquet", - ) - - # Create a local configuration - local_conf = {"input_folder": input_folder, "output_folder": output_folder} - - # Create parameters - params = { - "num_permutations": args["num_permutations"], - "num_bands": args["num_bands"], - "num_segments": args["num_segments"], - "use_s3": args["use_s3"], - } - - if args["use_s3"]: - params["data_s3_config"] = ParamsUtils.convert_to_ast(local_conf) - else: - params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) - - # add extra - if service == "DataCleaning": - short_name = "fdclean" - cli_prefix = f"{short_name}_" - - # configuration keys - document_id_column_key = "document_id_column" - """ This key holds the name of the column storing the unique ID assigned to each document""" - duplicate_list_location_key = "duplicate_list_location" - """ This key holds the location of the list of duplicate documents marked for removal""" - - # command line arguments - document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" - """ Name of the column storing the unique ID assigned to each document""" - duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" - """ Location of the list of duplicate documents marked for removal""" - - 
params[document_id_column_cli_param] = "int_id_column" - params[duplicate_list_location_cli_param] = duplicate_location - - return params - - -def create_file_copy_args_payload(args): - daf_args = [] - local_config = { - "input_folder": args.root_folder, - "output_folder": args.root_folder, - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - return data_access_factory - - -def parse_args(): - parser = argparse.ArgumentParser(description="Service Orchestrator") - - # Define command line arguments - parser.add_argument("--root_folder", type=str, required=True, help="Root folder path") - parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") - parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") - - parser.add_argument( - "--contents_column", type=str, default="text", help="Name of the column that holds document text" - ) - parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") - parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") - parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") - parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") - - # Single argument for service execution - parser.add_argument( - "--services", - type=str, - required=True, - help="Comma-separated list of services to run (e.g., SignatureCalculation,BandsFileCopy,ClusterAnalysis,DocsToRemoveFileCopy,DataCleaning)", - ) - - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - - args = 
parser.parse_args() - return vars(args) # Convert Namespace to dictionary - - -if __name__ == "__main__": - - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - - # Parse command line arguments - args = parse_args() - - # Initialize the orchestrator - orchestrator = ServiceOrchestrator(global_params=args) - - # Example service execution (if you had defined services) - orchestrator.orchestrate(generic_service_logic) From c29d3bf78eb24045e7f6d3f110a8323432636290 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:45:50 -0400 Subject: [PATCH 23/91] Rewrote cluster_analysis_transform as a folder_transform Signed-off-by: Constantin M Adam --- .../src/cluster_analysis_local_python.py | 11 +- .../python/src/cluster_analysis_transform.py | 180 +++++++++++++----- .../src/cluster_analysis_transform_python.py | 49 ++++- 3 files changed, 183 insertions(+), 57 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py index dcfc9a7e4..7c162b1b1 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -21,7 +21,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, @@ -35,12 +35,15 @@ "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + 
"cluster_jaccard_similarity_threshold": 0.0, } if __name__ == "__main__": # Set the simulated command line args - # sys.argv = ParamsUtils.dict_to_req(d=params) - # print(sys.argv) + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) - # Launch the ray actor(s) to process the input + # Launch python to process the input launcher.launch() diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 5ad18362a..221b50512 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -9,15 +9,17 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +import io import os +import re from argparse import ArgumentParser, Namespace from typing import Any, List, Tuple import numpy as np import polars as pl import pyarrow as pa -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger from Murmur_MH import Murmur_MH @@ -25,23 +27,37 @@ cli_prefix = f"{short_name}_" # configuration keys +num_bands_key = "num_bands" +""" This key holds the number of bands used in the banding technique""" +num_segments_key = "num_segments" +""" This key holds the number of segments dividing the hashing space for each band""" jaccard_similarity_threshold_key = "jaccard_similarity_threshold" """ This key holds the Jaccard similarity threshold above which two documents are duplicates""" # 
command line arguments +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands used in the banding technique""" jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" """ Jaccard similarity threshold above which two documents are duplicates""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments dividing the hashing space for each band""" captured_arg_keys = [ + num_bands_key, + num_segments_key, jaccard_similarity_threshold_key, ] # defaults -jaccard_similarity_threshold_default = 0.8 -""" Default Jaccard similarity threshold above which two documents are duplicates""" +num_bands_default = 14 +""" Default number of bands used in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments dividing the hashing space for each band""" -class ClusterAnalysisTransform(AbstractTableTransform): +class ClusterAnalysisTransform(AbstractFolderTransform): """ This is the second transform of the fuzzy dedup pipeline. It runs in parallel: for each band, the hashing interval is divided into segments. A cluster analysis @@ -65,7 +81,9 @@ class ClusterAnalysisTransform(AbstractTableTransform): duplicates. The resulting clusters are saved in a file for further analysis. Args: + num_bands: number of bands used in the banding technique jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + num_segments: the number of segments dividing the hashing space for each band """ def __init__(self, config: dict[str, Any]): @@ -75,58 +93,102 @@ def __init__(self, config: dict[str, Any]): defined by the companion runtime, ClusterAnalysisTransformRuntime. 
""" super().__init__(config) + self.num_bands = config.get(num_bands_key, num_bands_default) + self.num_segments = config.get(num_segments_key, num_segments_default) self.jaccard_similarity_threshold = config.get( jaccard_similarity_threshold_key, jaccard_similarity_threshold_default ) + self.data_access = config.get("data_access") self.logger = get_logger(__name__) - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - bands_dataframe = pl.from_arrow(table) - docs2remove_list = [] - # clustering - bands_dataframe_groups = bands_dataframe.group_by("band_hash").agg("document_data") - bands_dataframe_cluster = bands_dataframe_groups.with_columns( - cluster_length=pl.col("document_data").list.len() - ).filter(pl.col("cluster_length") > 1) - self.logger.info(f"file_name = {file_name}") - num_clusters = len(bands_dataframe_cluster) - if num_clusters > 0: - sum_cdocs = bands_dataframe_cluster.select(pl.sum("cluster_length")).item() - max_cdocs = bands_dataframe_cluster.select(pl.max("cluster_length")).item() - min_cdocs = bands_dataframe_cluster.select(pl.min("cluster_length")).item() - avg_cdocs = bands_dataframe_cluster.select(pl.mean("cluster_length")).item() + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Cluster analysis for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) else: - sum_cdocs = 0 - max_cdocs = 0 - min_cdocs = 0 - avg_cdocs = 0 - self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") - 
self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") - bands_dataframe_response = self.process_bands(bands_dataframe_cluster) + raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") + + # consolidate into a single data frame band hashes computed by workers + band_segment_dataframe, consolidation_stats = self.consolidate_band_segment_files(files) + metadata |= consolidation_stats + # cluster grouping by band hashes + cluster_dataframe, cluster_stats = self.get_clusters(band_segment_dataframe) + metadata |= cluster_stats + # cluster analysis using jaccard similarity + jaccard_cluster_dataframe, jaccard_stats = self.analyze_clusters(cluster_dataframe) + metadata |= jaccard_stats + # Generate the docs_to_remove dataframe + docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") + output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) + self.logger.info(f"{len(docs_to_remove_dataframe)} documents marked to remove") + metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + band_segment_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + band_segment_dataframe = band_segment_dataframe.vstack(df) - filtered_doc2remove_dataframe = 
bands_dataframe_response.filter(pl.col("docs_to_remove_length") > 0) - num_clusters = len(filtered_doc2remove_dataframe) + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": band_segment_dataframe.to_arrow().nbytes, + "consolidated_rows": len(band_segment_dataframe), + } + return band_segment_dataframe, consolidation_stats + + def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") + cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( + pl.col("cluster_length") > 1 + ) + # self.logger.info(f"file_name = {file_name}") + num_clusters = len(cluster_dataframe) if num_clusters > 0: - sum_cdocs = filtered_doc2remove_dataframe.select(pl.sum("docs_to_remove_length")).item() - max_cdocs = filtered_doc2remove_dataframe.select(pl.max("docs_to_remove_length")).item() - min_cdocs = filtered_doc2remove_dataframe.select(pl.min("docs_to_remove_length")).item() - avg_cdocs = filtered_doc2remove_dataframe.select(pl.mean("docs_to_remove_length")).item() + sum_cdocs = cluster_dataframe.select(pl.sum("cluster_length")).item() + max_cdocs = cluster_dataframe.select(pl.max("cluster_length")).item() + min_cdocs = cluster_dataframe.select(pl.min("cluster_length")).item() + avg_cdocs = cluster_dataframe.select(pl.mean("cluster_length")).item() else: sum_cdocs = 0 max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + cluster_stats = { + "groupby_clusters": num_clusters, + "cluster_duplicate_docs": sum_cdocs, 
+ } + return cluster_dataframe, cluster_stats - # Explode the 'docs_to_remove' column - doc2remove_exploded_dataframe = filtered_doc2remove_dataframe.explode("docs_to_remove") - table = doc2remove_exploded_dataframe.to_arrow() - self.logger.info(f"{len(doc2remove_exploded_dataframe)} documents marked to remove") - metadata = {"nrows": len(table)} - return [table], metadata - - def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: + def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: # Define the schema with specific data types schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} doc_ids_lists = [] @@ -137,7 +199,7 @@ def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: doc_ids_lists += doc_ids_list docs_to_remove_lists += docs_to_remove_list len_of_docs2remove_lists += len_of_docs2remove_list - processed_rows = pl.DataFrame( + jaccard_cluster_dataframe = pl.DataFrame( { "first_doc": doc_ids_lists, "docs_to_remove": docs_to_remove_lists, @@ -145,7 +207,25 @@ def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: }, schema=schema, ) - return processed_rows + filtered_jaccard_dataframe = jaccard_cluster_dataframe.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_jaccard_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_jaccard_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_jaccard_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_jaccard_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_jaccard_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + jaccard_stats = { + "jaccard_clusters": 
num_clusters, + "jaccard_duplicate_docs": sum_cdocs, + } + return filtered_jaccard_dataframe, jaccard_stats def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: # Process row and return a new list of Series or a new row @@ -216,6 +296,18 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=jaccard_similarity_threshold_default, help="Jaccard similarity threshold above which two documents are duplicates", ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="The number of bands used in the banding technique", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="The number of segments dividing the hashing space for each band", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py index 28d96f428..8ff6dbf2b 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -10,11 +10,19 @@ # limitations under the License. 
################################################################################ +import os import time +from typing import Any -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.runtime.pure_python.runtime_configuration import ( +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger @@ -23,11 +31,31 @@ logger = get_logger(__name__) +class ClusterAnalysisPythonRuntime(DefaultPythonTransformRuntime): + """ + Cluster analysis runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): """ - Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. - NOOP does not use a RayRuntime class so the superclass only needs the base - python-only configuration. + Implements the PythonTransformConfiguration for Fuzzy Dedup ClusterAnalysis + as required by the PythonTransformLauncher. 
""" def __init__(self): @@ -35,10 +63,13 @@ def __init__(self): Initialization :param base_configuration - base configuration class """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisPythonRuntime, + ) if __name__ == "__main__": - launcher = PythonTransformLauncher(ClusterAnalysisTransformConfiguration()) - logger.info("Launching noop transform") + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis python transform") launcher.launch() From aada59eccbf6b8df6e1c5b332fa19a21a99b125c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:48:21 -0400 Subject: [PATCH 24/91] Wrote get_duplicate_list_transform as a folder_transform Signed-off-by: Constantin M Adam --- .../src/get_duplicate_list_transform.py | 168 ++++++++++++++++++ .../get_duplicate_list_transform_python.py | 71 ++++++++ 2 files changed, 239 insertions(+) create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform.py create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py new file mode 100644 index 000000000..c7b4cbddf --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py @@ -0,0 +1,168 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import io +import os +import re +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger +from Murmur_MH import Murmur_MH + + +short_name = "fdlist" +cli_prefix = f"{short_name}_" + +# configuration keys +subfolder_key = "docs_to_remove" +""" This key holds the name of the subfolder with the duplicate records""" +consolidated_filename_key = "consolidated_filename" +""" This key holds the name of the file with the consolidated list of duplicates""" + +# command line arguments +subfolder_cli_param = f"{cli_prefix}{subfolder_key}" +""" The name of the subfolder with the duplicate records""" +consolidated_filename_cli_param = f"{cli_prefix}{consolidated_filename_key}" +""" The name of the file with the consolidated list of duplicates""" + +captured_arg_keys = [ + subfolder_key, + consolidated_filename_key, +] + +# defaults +subfolder_default = "docs_to_remove" +""" Default name of the subfolder with the duplicate records""" +consolidated_filename_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default name of the file with the consolidated list of duplicates""" + + +class GetDuplicateListTransform(AbstractFolderTransform): + """ + This is an intermediate step of the fuzzy dedup 
pipeline. It runs in a single + location and consolidates in a single file all the duplicates found for each + band segment. + Args: + subfolder: name of the subfolder with the duplicate records + consolidated_filename: name of the file with the consolidated list of duplicates + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. + """ + super().__init__(config) + self.subfolder = config.get(subfolder_key, subfolder_default) + self.consolidated_filename = config.get(consolidated_filename_key, consolidated_filename_default) + self.data_access = config.get("data_access") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Get Duplicate List for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, self.consolidated_filename) + + # consolidate into a single data frame band hashes computed by workers + consolidated_dataframe, consolidation_stats = self.consolidate_docs_to_remove_files(files) + self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") + metadata |= consolidation_stats + output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = 
folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + consolidated_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + consolidated_dataframe = consolidated_dataframe.vstack(df) + consolidated_dataframe = consolidated_dataframe.select("docs_to_remove").unique() + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, + "consolidated_rows": len(consolidated_dataframe), + } + return consolidated_dataframe, consolidation_stats + + +class GetDuplicateListTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=GetDuplicateListTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the GetDuplicateListTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{subfolder_cli_param}", + type=str, + default=subfolder_default, + help="The name of the subfolder with the duplicate records", + ) + parser.add_argument( + f"--{consolidated_filename_cli_param}", + type=str, + default=consolidated_filename_default, + help="The name of the file with the consolidated list of duplicates", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py new file mode 100644 index 000000000..703ef630e --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py @@ -0,0 +1,71 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import time +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListPythonRuntime(DefaultPythonTransformRuntime): + """ + Get duplicate list runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup GetDuplicateList + as required by the PythonTransformLauncher. 
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list python transform") + launcher.launch() From 2019d56565ea52c5474632a822e67ac7e66fdac8 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:50:13 -0400 Subject: [PATCH 25/91] Added text preprocessing Signed-off-by: Constantin M Adam --- .../python/src/signature_calc_local_python.py | 39 +++++---- .../python/src/signature_calc_transform.py | 81 +++++++------------ 2 files changed, 48 insertions(+), 72 deletions(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py index eb958ee3d..062580f22 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -20,31 +20,28 @@ ) -# # create parameters -# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -# output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output_second")) -# local_conf = { -# "input_folder": input_folder, -# "output_folder": output_folder -# } -# code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -# params = { -# # Data access. 
Only required parameters are specified -# "data_local_config": ParamsUtils.convert_to_ast(local_conf), -# # execution info -# "runtime_pipeline_id": "pipeline_id", -# "runtime_job_id": "job_id", -# "runtime_code_location": ParamsUtils.convert_to_ast(code_location), -# "minhash_num_permutations":112, -# "minhash_num_bands":14, -# "minhash_num_segments":2 -# } +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} if __name__ == "__main__": # Set the simulated command line args - # sys.argv = ParamsUtils.dict_to_req(d=params) - # print(sys.argv) + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) sys.argv.append("--data_s3_cred") s3_creds = { diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 7ac8eb057..7c4dd391c 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -10,6 +10,8 @@ # limitations under the License. 
################################################################################ import os +import re +import unicodedata from argparse import ArgumentParser, Namespace from pathlib import Path from typing import Any, List @@ -100,44 +102,16 @@ """ Default number of segments across which we divide the hashing space for each band""" -def _optimal_minhashlsh_param( - threshold: float = jaccard_similarity_threshold_default, - num_perm: int = num_permutations_default, - false_positive_weight: float = 0.5, - false_negative_weight: float = 0.5, -): - """ - Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum - of probabilities of false positive and false negative. - :param threshold: desired similarity threshold - :param num_perm: number of permutations - :param false_positive_weight: importance of avoiding false positive results - :param false_negative_weight: importance of avoiding false negative results - :return: a tuple (optimal number of bands, optimal number of rows) - """ - - def _false_positive_probability(threshold, b, r): - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, threshold) - return a - - def _false_negative_probability(threshold, b, r): - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, threshold, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for b in range(1, num_perm + 1): - max_r = int(num_perm / b) - for r in range(1, max_r + 1): - fp = _false_positive_probability(threshold, b, r) - fn = _false_negative_probability(threshold, b, r) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (b, r) - return opt +NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") +WHITESPACE_PATTERN = re.compile(r"\s+") +PUNCTUATION = "!/—”:%1〈&(、━\\【#%「」,】;+^]~“《„';’{|∶´[=-`*.(–?!:$~«〉,><》)?)。…@_.\"}►»" + "".join( + map( + chr, + (x for a, b in ((0, 9), (11, 13), (13, 32), 
(127, 160)) for x in range(a, b)), + ) +) +PUNCTUATION_SET = set(PUNCTUATION) +PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) class SignatureCalculationTransform(AbstractTableTransform): @@ -184,13 +158,6 @@ def __init__(self, config: dict[str, Any]): self.num_segments = config.get(num_segments_key, num_segments_default) self.num_bands = config.get(num_bands_key, num_bands_default) self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) - # Calculate optimal parameters for bands calculation - # self.num_bands, self.num_rows = _optimal_minhashlsh_param( - # threshold=self.jaccard_similarity_threshold, - # num_perm=self.num_permutations, - # false_positive_weight=0.5, - # false_negative_weight=0.5, - # ) # use this dataframe to store the minhashes and size for each document self.all_minhashes: pl.DataFrame = None # use this dataframe to store the band hashes for each document @@ -224,8 +191,8 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab # generate minhash values minhashes = df.map_rows( - lambda text: mm_min_hash.minhash2_nosalt( - *self._generate_word_shingles(text, window_size=self.word_shingle_size) + lambda row: mm_min_hash.minhash2_nosalt( + *self._generate_word_shingles(row, window_size=self.word_shingle_size) ) ) # rename columns, cast minhashes to list(uint32) @@ -374,10 +341,22 @@ def write_band_signatures(self): return [], metadata # define shingles generation function - def _generate_word_shingles(self, text: str, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]: - words = text[0].split() - document_id = text[1] - doc_len = len(text[0]) + def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]: + text = row[0] + # lower case + text = text.lower() + # replace numbers with '0' + text = NUMBERS_PATTERN.sub("0", text) + # convert punctuation to spaces + text = 
text.translate(PUNCTUATION_TRANS) + # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end + text = WHITESPACE_PATTERN.sub(" ", text.strip()) + # diacritics/unicode normalization + text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn") + text = text.strip() + words = text.split() + document_id = row[1] + doc_len = len(row[0]) word_count = len(words) k_shingles = [] for i in range(0, max(1, word_count - window_size + 1)): From 9362803f99fa422437031263474e97365d61d9f3 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:51:22 -0400 Subject: [PATCH 26/91] Added python test data Signed-off-by: Constantin M Adam --- .../python/test-data/input/data_1/df1.parquet | Bin 0 -> 3093 bytes .../python/test-data/input/data_2/df2.parquet | Bin 0 -> 1397 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c9220bf39c8dd2127707be44ba210363df6aa1a3 GIT binary patch literal 3093 zcmZ`+2{csw`@b_;CTom)Lt|$wk+qV%-ma!GQW$#@8C$YuMnpxHk{H<5Kl{6hl*K_LhY?SX9Y7y!wX zqz`sV|1Q^a13;RF!EUz>NE6Y~w@A$P@=28JxvaxV$RH5*U-e2eP~2H6d^C%(D-cMcCRY7e=$)E(wYu}4&pB|!0${2ryymMznu=7KKYyAPeJ%s(6 zjYf$9gPJ44HcP>o%aY7TbEU0rFDl{7XQsW{aZeq6A3qCOAZ8zac1yXeW`e#rvh;jH z-Pbaja}xB9GW2QzkE;uzc=)!~DctzR>oiy8=F!LjpKZ?`D;g8s}-C zynvH}r;MZT>YH1acMpeCmB;+DnfAjrWs<$+BI!bp>iSlUZeJn32u$20c@za7C7+o8 zt7SYA8{qx6POX{-zJ7u-FDhs)U}CQ&1|-?kJJXDNPxUc_YPFg^<-5k-ezH?9`IajH z$nQx|8xu&@uJ$j!rA%}Sko3QA=@&Q|zr8==&L{Ebn?u9ZsZ&`ZVSb}U6~j<>>Zr}O6<5XEPNPzFj4I^Y9U^v;mi2V&B_aX`>TW6DS3e_b z@7JAjcS7ee2hTbknYUDPO!uN2{Yj^6)W`mbgk_CcSghREFN4!wV|ktSmzPh*Tt0h6 
zeP)-+=W6X&olfqKNz(2sU_W(>K)&Evxt3auIoCQT~6$dmT7e%EPq zenO!yhTMx43Rx^3dsnU9YJa5XPDq#f28EWZQ9AYMv%$u?TKfxMITt?=`;kjO;5T$q zmXBbn_gS`BFfVPXVn?Jk59<+Gd~5x*1?5xenyg7(pYHvFPdsuvN`lp+oV#;AH>}}A zFIpHYdeVZM`hz@=p6(}jl}rBl(1Z+XUyt$xs8C3&&2^~mb+gU$ipuQ#5R+(!PKZfQ=77%m^N zFQleBR(*C~2o{|VcC4$2$?0bYW@|TB>@0VRy;Yo-R;KEzPCOdj_PjKyW>*f4WM74lqf9=$*}%X1uEW4L{C@`O8=s(~8%xHrU;!x)Aw_*9 z1C~KDO2^a`L-`OWCQlN{Ojj^Z*Ypc_oWSb#)^M6PjHBi^nGvqnnRM7=FCTV2Xmi{i z8+mq7N{(2HmM9ZDj(;a}Y}VXBAg8#m<6P3fv#1`6FxyH#C18l9|4!2XA%&9~O>K0D zH;f9TK2h4vHnl9&oOtLRFVr^L)b{5^x%wz8ycZ*nwW8ICjdtkX6vs;0bBMj*a_W-V zmQx=I-{rz~3Htam_{aLfJBCx}j2LzpKQ{JNw6dcI|HFDK8qDxu5t1`pqc z`^k#xUvw23GEaOl?@<`qbz zt^-?>KC~i@&qZ;ujR5X(0Wkkl>iXD0pJ060lr^gxxo&NVCZil69WZ9 z{`dhLH}ChY;4uI1GJ~SP3k5gr;GGBgzx_Z+?gG#8lU;>Z+g%sD4hwF7u_rmsYbxc? z(Wl+ve*tyE!D?%VA6KKufE8d1d^MwLdG-YFYE@IX5H4q;(3(^Neb=AT zr$2o6&QP3d9O`&w08bhnVXD1Cg~U6C3ceg9&9J9IUgDH&?{~mne`M%wte@7~hy!{= zbZ%Rm&tS;jGqLYI_4W9LN4Gt`cbXOM;{GA9pJFhFR~oF3@skxbtUD9_v@ZA$=5qdv zf~xqG`HO{8aZXqA0ZPYX{e4(#+__MWMORM>gXZ#8^lb}9R~Rn*T1;6u(c|%A{=6;R zD{%vD_Bt?pE3rq_{q4x?i6a-3&BZsy@?HwD)r53-3rp2(2dPrkJ{z^UyobzFnKPZ|84d#&}~g95rdDD1*C8g2DE{Y z$wH)D6u2V&OCG?1{=dXHSuhd9m64G%Ch>#>1<^nrL{UNi&wC93g^;sO}sPiw-ft+k36Uw4@v;|D>&T0b&ZDpxiQciJSYi3-}E6g9s_l6kwDtl#5c?Q zzQjM50$Rd@5P+xS|Fy>d`!Wz+m0KfU3&J3m3HpMOKqKEsQV6!Ov2rw`dK~i#aF@(L zYjer)5RA}1?q+xr(Zk8V^z|bP7^cE|VHn|Lc&jyH4-4bk&0*N*mmea+rrZrS2ZqQb{%KO?K&Ro}hD_jBtb+>(3hj^uaJ^Y?So lvph*5InXFR@E?FApLLMGdk9rZ6Bz*R3>cvSz#91>_%FJi=e_^{ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..23fac4c726f017c26b7fe64153fe5e0b459d60b1 GIT binary patch literal 1397 zcmWG=3^EjD6D<+-i4o-!WdLIqQ3gQ<2IhK=t@?i%BJMIUEZ}C~Vc=!pW8h~H07-!X zTL=>aGovVrq$&fOqy%qfUP*jrN_=vDPHAqQD32JQ*aAke3N;P}py~V6MbB~LGXJR< z!?(>nMg^IZ_)Ya<;Z#*wq zV)nzM(mQUw+G*c04GwO?b)C 
zb6Q{Yp4w_=$aYip$lHU`{jqzUo;;g7>&eqwGB@8A{5;GN8GGc#2iJ2KPVQ^$XlD{k zugvRvUHCfKf9}NZZx_ZrZPq<Xv3;kv_fPPC&yY29``!Wvw-csk z4{FP_GFx3b80p>0!?3_{TXnkgIe zvEzXj-}qJFm@7;4$`e zp(5e^1yhf6oy*mK_rY|BRp)0@UZ!T<8lm(riIYB@FtcD{VwREm;bH3(bamn9bKjSq z*!{NX`L^;ml8Z0xG;?3BHuDs_wSs|DQcs`<|NoO0b}{@<^0{lp{jB`W%W{t_!xLMJ zw!X=EFzNm55{m?bk0no+tWJn#cAjNb`qbgYIY*9f(|Jm!MMs&s;<$ zy#06^$Nz0RA{Osmcd*vE{W|GcAY42cTGmwl+Z$kG&2+Jq$gnuW@>RN z1TwtvW8`?|vS(6o%y%AHHnpgxog6X3LX%WQ*bLYMyN{VQcuJoW<$+`nj^zBjlGMDC zVsPGgr!Mx7TkQxVgQScYhiHWuFh>DdM;Jx_af8^vTxH|IQk0)xBFX~fs4}7DF9uN_ zpcvdpRR$j!pe`{!Nf}8UwFX8R!nsncflpL~LG2K)3;_p z0&=MYk7H1ff4Eg~az<)yqQn#?eGssSbOZud5MfaUe&f47Sdt7$olxQVMho5P%)+7znfhq$wS!4@3t-taJe@ zadfl Date: Mon, 14 Oct 2024 00:52:07 -0400 Subject: [PATCH 27/91] Added project admin tools Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/.dockerignore | 1 + transforms/universal/fdedup/python/Makefile | 64 +++++++++++++++++++ transforms/universal/fdedup/transform.config | 5 +- 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 transforms/universal/fdedup/python/.dockerignore create mode 100644 transforms/universal/fdedup/python/Makefile diff --git a/transforms/universal/fdedup/python/.dockerignore b/transforms/universal/fdedup/python/.dockerignore new file mode 100644 index 000000000..f7275bbbd --- /dev/null +++ b/transforms/universal/fdedup/python/.dockerignore @@ -0,0 +1 @@ +venv/ diff --git a/transforms/universal/fdedup/python/Makefile b/transforms/universal/fdedup/python/Makefile new file mode 100644 index 000000000..05f6bf5ca --- /dev/null +++ b/transforms/universal/fdedup/python/Makefile @@ -0,0 +1,64 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. 
+# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. +include $(REPOROOT)/transforms/.make.transforms + +# Include the common configuration for this transform +include ../transform.config + +venv:: .transforms.python-venv + +test:: .transforms.python-test + +clean:: .transforms.clean + +image:: .transforms.python-image + +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-python + +setup:: .transforms.setup + +# distribution versions is the same as image version. +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +test-image:: .transforms.python-test-image + +run-cli-sample: .transforms.run-cli-python-sample + +run-local-sample: .transforms.run-local-sample + +run-local-python-sample: .transforms.run-local-python-sample + +#run-s3-ray-sample: .transforms.run-s3-ray-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/transform.config b/transforms/universal/fdedup/transform.config index 774716e15..ffaeb9f45 100644 --- a/transforms/universal/fdedup/transform.config +++ b/transforms/universal/fdedup/transform.config @@ -14,5 +14,6 @@ TRANSFORM_NAME=fdedup # # If you change the versions numbers, be sure to run "make set-versions" to # update version numbers across the transform (e.g., pyproject.toml). 
-FDEDUP_RAY_VERSION=$(DPK_VERSION) - +FDEDUP_PYTHON_VERSION=$(DPK_VERSION) +FDEDUP_RAY_VERSION=$(FDEDUP_PYTHON_VERSION) +FDEDUP_SPARK_VERSION=$(FDEDUP_PYTHON_VERSION) From 4dac838b2d941117f40bce371574aec268d09206 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 02:40:11 -0400 Subject: [PATCH 28/91] Bug fix Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/cluster_analysis_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 221b50512..2a5ec3e6b 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -240,7 +240,7 @@ def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"])) # Extracting int_id_column values into a list - doc_list = list(set([item["int_id_column"] for item in sorted_document_data])) + doc_list = [item["int_id_column"] for item in sorted_document_data] # Creating a dictionary with int_id_column as key and minhashes as value doc_minhashes = {item["int_id_column"]: item["minhashes"] for item in sorted_document_data} From fbc2b58e255edc758a9d4016d49dd57715c3db93 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 02:41:49 -0400 Subject: [PATCH 29/91] Add op modes for data cleaning: filter (non)dupl and annotate Signed-off-by: Constantin M Adam --- .../python/src/data_cleaning_transform.py | 38 +++++++++--- .../src/data_cleaning_transform_python.py | 5 +- .../fdedup/python/src/fuzzy_dedup_python.py | 60 +++++++++++++++---- 3 files changed, 83 insertions(+), 20 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py 
b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 05b18cc8b..8e17b757f 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -29,12 +29,16 @@ """ This key holds the name of the column storing the unique ID assigned to each document""" duplicate_list_location_key = "duplicate_list_location" """ This key holds the location of the list of duplicate documents marked for removal""" +operation_mode_key = "operation_mode" +""" This key holds the operation mode: 'filter_duplicates', 'filter_non_duplicates', or 'annotate'""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" """ Name of the column storing the unique ID assigned to each document""" duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" """ Location of the list of duplicate documents marked for removal""" +operation_mode_cli_param = f"{cli_prefix}{operation_mode_key}" +""" Operation mode, can be one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate'""" captured_arg_keys = [ document_id_column_key, @@ -44,8 +48,10 @@ # defaults document_id_column_default = "int_id_column" """ Default name of the column storing the unique ID assigned to each document""" -duplicate_list_location_default = None +duplicate_list_location_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") """ Default location of the list of duplicate documents marked for removal""" +operation_mode_default = "filter_duplicates" +""" Default value for operation mode, will filter out all the duplicate documents""" class DataCleaningTransform(AbstractTableTransform): @@ -72,6 +78,7 @@ def __init__(self, config: dict[str, Any]): self.logger = get_logger(__name__) self.document_id_column = config.get(document_id_column_key, document_id_column_default) self.duplicate_list_location = 
config.get(duplicate_list_location_key, duplicate_list_location_default) + self.operation_mode = config.get(operation_mode_key, operation_mode_default) contents = config.get("df") self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") @@ -88,19 +95,27 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab self.docs_to_remove_df = self.docs_to_remove_df.select( pl.col(self.document_id_column).cast(input_doc_id_type) ) - filtered_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti") - filtered_table = filtered_df.to_arrow() + if self.operation_mode == "filter_duplicates": + result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti") + elif self.operation_mode == "filter_non_duplicates": + result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="inner") + else: # self.operation_mode == "annotation" + duplicates_df = self.docs_to_remove_df.with_columns(pl.lit("d").alias("duplicate")) + result_df = input_df.join(duplicates_df, on=self.document_id_column, how="left").with_columns( + pl.col("duplicate").fill_null("") + ) + result_table = result_df.to_arrow() metadata = { "input_files": 1, "input_docs": table.num_rows, "input_bytes": table.nbytes, "output_files": 1, - "output_docs": filtered_table.num_rows, - "output_bytes": filtered_table.nbytes, - "filtered_docs": (table.num_rows - filtered_table.num_rows), - "filtered_bytes": (table.nbytes - filtered_table.nbytes), + "output_docs": result_table.num_rows, + "output_bytes": result_table.nbytes, + "filtered_docs": (table.num_rows - result_table.num_rows), + "filtered_bytes": (table.nbytes - result_table.nbytes), } - return [filtered_table], metadata + return [result_table], metadata class DataCleaningTransformConfiguration(TransformConfiguration): @@ -133,10 +148,15 @@ def add_input_params(self, parser: ArgumentParser) -> 
None: parser.add_argument( f"--{duplicate_list_location_cli_param}", type=str, - required=True, default=duplicate_list_location_default, help="location of duplicate document list that are marked for removal", ) + parser.add_argument( + f"--{operation_mode_cli_param}", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + default=operation_mode_default, + help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index c0b5fefd6..e5c1e5025 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -10,6 +10,7 @@ # limitations under the License. ################################################################################ +import os from typing import Any from data_cleaning_transform import DataCleaningTransformConfiguration @@ -51,8 +52,10 @@ def get_transform_config( :param files - list of files to process :return: dictionary of transform init params """ - duplicate_list_location = self.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index ca64f336f..c05fe326e 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ 
b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -57,6 +57,7 @@ "fdclean": [ data_cleaning_transform.document_id_column_key, data_cleaning_transform.duplicate_list_location_key, + data_cleaning_transform.operation_mode_key, ], } @@ -66,10 +67,6 @@ def __init__(self, global_params: argparse.Namespace = None): self.global_params = global_params self.logger = get_logger(__name__) - def execute_service(self, service_logic, service_params): - # Call the generic service logic - service_logic(service_params) - def orchestrate(self): service_list = self.global_params.services.split(",") for service in service_list: @@ -107,7 +104,14 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: output_folder = in_args_dict["output_folder"] elif service_name == "fdclean": input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + operation_mode = in_args_dict.get("operation_mode", "filter_duplicates") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + output_folder = os.path.join(in_args_dict["output_folder"], output_subfolder) else: self.logger.error(f"Unknown service name: {service_name}") data_io = { @@ -145,12 +149,48 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") parser.add_argument( - "--contents_column", type=str, default="text", help="Name of the column that holds document text" + "--operation_mode", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + required=False, + help="operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) + parser.add_argument( + "--contents_column", type=str, required=False, help="name of the column that 
stores document text" + ) + parser.add_argument( + "--document_id_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument( + "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation" + ) + parser.add_argument( + "--num_bands", type=int, required=False, help="number of bands to use for band hash calculation" + ) + parser.add_argument( + "--num_minhashes_per_band", type=int, required=False, help="number of minhashes to use in each band" + ) + parser.add_argument( + "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" + ) + parser.add_argument( + "--jaccard_similarity_threshold", + type=float, + required=False, + help="jaccard similarity threshold above which two documents are similar", + ) + parser.add_argument( + "--num_segments", + type=int, + required=False, + help="the number of segments dividing the hashing space for each band (for scalability)", + ) + parser.add_argument( + "--duplicate_list_location", + type=str, + required=False, + help="path to the file with all the duplicate document ids", ) - parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") - parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") - parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") - parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") # Single argument for service execution parser.add_argument( From 828ec41b4a0727f008566a3ebf7a0c400ee5c5ac Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 08:07:06 -0400 Subject: [PATCH 30/91] Python and spark transforms for cluster analysis Signed-off-by: Constantin M Adam --- 
.../src/cluster_analysis_transform_python.py | 1 + .../src/cluster_analysis_transform_spark.py | 38 +++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py index 8ff6dbf2b..c35c5a711 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -72,4 +72,5 @@ def __init__(self): if __name__ == "__main__": launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) logger.info("Launching fuzzy dedup cluster analysis python transform") + # Launch python to process the input launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py index afb8c51b7..30f9dd317 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -10,9 +10,17 @@ # limitations under the License. 
################################################################################ -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +import os + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess from data_processing.utils import get_logger from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, SparkTransformLauncher, SparkTransformRuntimeConfiguration, ) @@ -21,6 +29,27 @@ logger = get_logger(__name__) +class ClusterAnalysisSparkRuntime(DefaultSparkTransformRuntime): + """ + Cluster analysis runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params["num_bands"] + segments = self.params["num_segments"] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration): """ Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis @@ -31,12 +60,15 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisSparkRuntime, + ) if __name__ == "__main__": # create launcher launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) - logger.info("Launching fuzzy dedup signature calculation transform") + logger.info("Launching fuzzy dedup cluster analysis spark transform") # Launch the spark worker(s) to 
process the input launcher.launch() From bc6b81cd231a328f3fe32bfe26b0d40529d2ee57 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 11:00:28 -0400 Subject: [PATCH 31/91] Sync spark Makefile with dpk Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Makefile | 84 ++++++++++++---------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile index d30013da8..7eb132fbd 100644 --- a/transforms/universal/fdedup/spark/Makefile +++ b/transforms/universal/fdedup/spark/Makefile @@ -1,45 +1,57 @@ -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free -# to override/redefine the rules below. +# to override/redefine the rules below. 
include $(REPOROOT)/transforms/.make.transforms -# This is included in the image name, if defined -TRANSFORM_NAME=fd-sig-calc - -DOCKER_IMAGE_NAME=pyspark-base -DOCKER_IMAGE_VERSION=latest -DOCKER_FILE=Dockerfile -REGISTRY_HOST=docker.io -REGISTRY_PATH= -DOCKER=docker -PYTHON=python - -venv: requirements.txt - @# Help: Create the virtual environment using requirements.txt - $(PYTHON) -m venv venv - @source venv/bin/activate; \ - pip install --upgrade pip; \ - pip install wheel; \ - pip install -r requirements.txt; +# Include the common configuration for this transform +include ../transform.config + +venv:: .transforms.spark-venv + +test:: .transforms.spark-test + +clean:: .transforms.clean image:: .transforms.spark-image -image-direct: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. - @# Help: Create the docker image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - $(DOCKER) build -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -f $(DOCKER_FILE) . - -publish-docker: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. 
- @# Help: Publish image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) to $(REGISTRY_HOST) container registry - $(DOCKER) logout $(REGISTRY_HOST) - $(DOCKER) login $(REGISTRY_HOST) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)' - $(DOCKER) push $(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - -publish-ibm: - ibmcloud login -q -u "$(IBM_CLOUD_USER)" -apikey "$(IBM_CLOUD_API_KEY)" - ibmcloud cr login --client docker - $(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - $(DOCKER) push $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - # ibmcloud cr image-list | grep $(DOCKER_IMAGE_NAME) +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-spark + +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +test-image:: .transforms.spark-test-image + +run-cli-sample: .transforms.run-cli-spark-sample + +run-local-sample: .transforms.run-local-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image From 4d486d35a36039783df84ce666ab03cd21c0cf59 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 11:01:59 -0400 Subject: [PATCH 32/91] Spark orchestration for fuzzy dedup Signed-off-by: Constantin M Adam --- .../src/cluster_analysis_transform_spark.py | 1 + .../src/data_cleaning_transform_spark.py | 9 +- .../fdedup/spark/src/fuzzy_dedup_spark.py | 207 +++--------------- 3 files changed, 34 insertions(+), 183 deletions(-) diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py 
b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py index 30f9dd317..5522d67de 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -11,6 +11,7 @@ ################################################################################ import os +from typing import Any from cluster_analysis_transform import ( ClusterAnalysisTransformConfiguration, diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 03976bac8..29890d05f 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -10,6 +10,7 @@ # limitations under the License. ################################################################################ +import os from typing import Any from data_cleaning_transform import DataCleaningTransformConfiguration @@ -51,8 +52,10 @@ def get_transform_config( :param files - list of files to process :return: dictionary of transform init params """ - duplicate_list_location = self.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) @@ -86,8 +89,10 @@ def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[s :param data_access_factory - data access factory class being used by the RayOrchestrator. 
:return: dictionary of parameters to be broadcast """ - duplicate_list_location = self.transform_config.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.transform_config.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py index 6d0e090e4..5217f2f7b 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -1,28 +1,15 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - import argparse -import logging import os import sys -from typing import Union -import polars as pl from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration -from data_processing.utils import ParamsUtils +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing_spark.runtime.spark import SparkTransformLauncher -from file_copy_util import FileCopyUtil -from file_copy_util_spark import FileCopySpark +from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) from signature_calc_transform_spark import ( SignatureCalculationSparkTransformConfiguration, ) @@ -34,172 +21,30 @@ "url": os.getenv("AWS_ENDPOINT_URL"), } -args_map = { - "minhash": [ - "document_id_column", - "contents_column", - "seed", - "num_permutations", - "num_bands", - "num_minhashes_per_band", - "jaccard_similarity_threshold", - "word_shingle_size", - "num_segments", - ], - "copyutil": [ - "subfolder_name", - "data_type", - "num_bands", - "num_segments", - "parallelization", - "use_s3", - ], - "cluster": [ - "jaccard_similarity_threshold", - ], - "fdclean": [ - "document_id_column", - "duplicate_list_location", - ], -} - - -def get_arguments(in_args: argparse.Namespace, module_name: str) -> Union[list, dict]: - sys_argv = ["python"] - in_args_dict = vars(in_args) - if in_args.use_s3: - sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) - all_module_arguments = args_map.get(module_name, []) - passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} - if module_name == "copyutil": - copy_util_config = {k: v for k, v in passed_args.items()} - copy_util_config["root_folder"] = 
in_args_dict["output_folder"] - return copy_util_config - else: - for k, v in passed_args.items(): - sys_argv.append(f"--{module_name}_{k}") - sys_argv.append(str(v)) - if module_name == "minhash": - input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"]) - elif module_name == "cluster": - input_folder = os.path.join(in_args_dict["output_folder"], "bands_consolidated") - output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") - elif module_name == "fdclean": - if f"--{module_name}_duplicate_list_location" not in sys_argv: - sys_argv.append(f"--{module_name}_duplicate_list_location") - sys_argv.append( - os.path.join( - in_args_dict["output_folder"], - "docs_to_remove_consolidated", - "docs_to_remove_consolidated.parquet", - ) - ) - input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") - else: - logging.error(f"Unknown module name: {module_name}") - data_io = { - "input_folder": input_folder, - "output_folder": output_folder, - } - if in_args.use_s3: - sys_argv.append("--data_s3_config") - else: - sys_argv.append("--data_local_config") - sys_argv.append(ParamsUtils.convert_to_ast(data_io)) - return sys_argv +class SparkServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_folder", type=str, required=True, help="path to read the input files") - parser.add_argument("--output_folder", type=str, required=True, help="path to write the output files") - parser.add_argument( - "--use_s3", type=bool, required=False, default=False, help="if true, use S3, if false use local FS" - ) - parser.add_argument( - "--contents_column", type=str, required=False, help="name of the column that stores document text" - ) - parser.add_argument( - "--document_id_column", 
type=str, required=False, help="name of the column that stores document text" - ) - parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") - parser.add_argument( - "--num_permutations", type=int, required=True, help="number of permutations to use for minhash calculation" - ) - parser.add_argument( - "--num_bands", type=int, required=True, help="number of bands to use for band hash calculation" - ) - parser.add_argument( - "--num_minhashes_per_band", type=int, required=True, help="number of minhashes to use in each band" - ) - parser.add_argument( - "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" - ) - parser.add_argument( - "--jaccard_similarity_threshold", - type=float, - required=False, - help="jaccard similarity threshold above which two documents are similar", - ) - parser.add_argument( - "--num_segments", - type=int, - required=True, - help="number of segments to divide each band hash interval (to improve scalability)", - ) - parser.add_argument("--parallelization", type=int, required=False, default=-1, help="spark parallelization") - parser.add_argument( - "--duplicate_list_location", - type=str, - required=False, - help="path to the file with all the duplicate document ids", - ) - return parser.parse_args() + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + elif service_short_name == "cluster": + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + status = 
launcher.launch() + return status if __name__ == "__main__": - # configure logging - logging.basicConfig( - format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=logging.INFO, - ) - args = parse_arguments() - sys.argv = get_arguments(args, "minhash") - # create launcher - launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) - # Launch the spark worker(s) to process the input - status = launcher.launch() - logging.info(f"Signature calculation concluded with status {status}") - - fcs_config = get_arguments(args, "copyutil") - - root_folder = fcs_config["root_folder"] - parallelization = fcs_config["parallelization"] - fcs = FileCopySpark(root_folder, fcs_config["num_bands"], fcs_config["num_segments"], args.use_s3) - data_access_factory = fcs.create_data_access_factory(root_folder, args.use_s3) - app_config = {"root_folder": root_folder} - execution_config = {"parallelization": parallelization} if parallelization > 0 else {} - status = fcs.orchestrate(app_config, execution_config, data_access_factory, data_type="bands") - logging.info(f"Consolidate bands concluded with status {status}") - - sys.argv = get_arguments(args, "cluster") - launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) - # Launch the spark worker(s) to process the input - status = launcher.launch() - logging.info(f"Cluster analysis concluded with status {status}") - - stats = {} - fcu_config = get_arguments(args, "copyutil") - fcu = FileCopyUtil(data_access_factory=data_access_factory, config=fcu_config, stats=stats) - fcu.copy_data(subfolder_name="docs_to_remove", data_type="docs_to_remove") - sys.argv = get_arguments(args, "fdclean") - # create launcher - launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) - # Launch the spark worker(s) to process the input - status = launcher.launch() - logging.info(f"Data 
cleanup concluded with status {status}") + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = SparkServiceOrchestrator(global_params=args) + # Launch spark fuzzy dedup execution + orchestrator.orchestrate() From 19e0844bd93f52b9e02277a70065221d981bf477 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 11:03:02 -0400 Subject: [PATCH 33/91] Bug fix Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/src/fuzzy_dedup_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index c05fe326e..acb1be3bb 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -76,7 +76,7 @@ def orchestrate(self): self.logger.error(err_msg) raise ValueError(err_msg) service_short_name = SERVICE_DICT[service] - service_params = self.get_arguments(args, service_short_name) + service_params = self.get_arguments(self.global_params, service_short_name) self.logger.info(f"Got parameters for {service}") status = self.execute_service(service_short_name, service_params) if status == 0: From 2ce3d8c440351723373edefdbcaf20c8d3730647 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 11:03:36 -0400 Subject: [PATCH 34/91] Added spark test data Signed-off-by: Constantin M Adam --- .../fdedup/spark/test-data/input/df1.parquet | Bin 0 -> 4111 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 transforms/universal/fdedup/spark/test-data/input/df1.parquet diff --git a/transforms/universal/fdedup/spark/test-data/input/df1.parquet b/transforms/universal/fdedup/spark/test-data/input/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2584725bbf8e96cb852fc2702f8014fe689c7116 GIT binary patch literal 4111 
zcmaJ_2{@Ep`+sI^6N4Btp|LY4vJGjHjK&x#jD5{cb}=F3A`$5n2di6a=70 zidTVm3H~Y85E>99L13yzJ}F#^D*ADmuI_RCy|)!dA_WNf9rXh@nJqoNeXlTBn09pulc~MaCszeqR%*qCM}ZB{?MuDm;H z?K@$HPYB7Ej()n*E}5|0UgR8Aq)lOAtw1(@HZ_Wy`ucqdkIGDfuIoC!1)ZWV6V4D74nOQ2^`Yu zRk;JPSg?2%`5}4JdNzWXd42tu54OA7NR~DI@1n%YB+f8zr&}%QNkh=|N7~Xy2FrQ_ z|Jss_5iA`zOC6@nHmyX|thyFb%nk)!R-q@J+))aZxPDYYZH{vDb9J$PXZgpXic)<& ziGi1|vhX{&+>QA=>^c@4MV&R7CcM%plLb(+BBnyL!#wmTVM|$w%oplASXUxhD|vk6cr~TE``v1b(LGk~(q@`(kSJMxkLt{+wo|D{;z?ed^*@l@ zH}y5dT}b{-{eCVEm-}(9<@_g`F@#|=gDcjw`W^TXzw zSZipt`{sGp+UI9M!&Uwx#IJV@^hlwj<>O|B zfp;t(e|22-=U(u)sH=#~?jP{YRA?&eD!?@u>^NtV-h`2a2)4BOI8d@i7eVcbbj@n5ioLk*9s8-%KjN}tw?>haERm-h8gV?Gn+kHi!GiZq zjMJ}(Ijpp)fIxHE=5veqo-lY!lm9mj!_MvBZUfcdrWI{z> zWmSDs=W)Nvm?S6KH_g|19cCn_x#kD;Y{#FN4}M4eYmnT~0*+{SIGKTE7jTA(t4N?} z=me0Wwl>7+lP`}bfh+BfnBE;ZuMmq_m=d*m>;=;lXl1`HuAok{(+qao=G<9hPcvB9 z)inVTd?}c(jK>o-BXoXA56zxk+}C+6e(+UzuYRyeC94E5LQ|RH_ilC?OO6O^Fpp6W z_YG{3P#w@VD3qIRc8h_wzinzic~gWOZisSC&!uh4H^3syd-g}ulC-P_R&6ic(mi)>`c^eUZ?n`G`)#CoQN-rZgY!0y89tn(=Y@LKQtlqKe~M?5N{giBF}Z%NU$=Zx5xR)!*+07M7n^)4?he61>nlz&|mBZarW~ix%!d< zcZsb@1+guN$^kLB4F}hlFLoDZb(~vuwc${Gt(6e%I-g|a z(AE?39;oB=m-%7yyb44F3;`2hLpM;~rPu#htBlx<5D`tW)-<3$*-C2!a#W_?6>UE;?FkV$0SNCOPj`p_XHTryel#c7c; zrh6ABAs|4KBRW7xqJB*=bne^Wns4K{QkQ03^HcW(x3!@hb3^6cqhs?Ax5wQ33fdAE zKK*n{6<=-`>6}1g=P-5@nF&xpj71Q~-@}B(fdx1VeqNaR6EIU5ADNEwlAjJeql^dV zv`4!S2b{VR^}$6&nT_-9k>^h@(?V<=Kl%1Mp_fq-!{kUWVQ%%hD#2Y&W>4UqX~Pv4tDOPOBGsonNn9H^EZ* z?t^vT_=X(D_ewiXk1t)YyCJE^yDy66R!~oa+uwDjb89FuQPD^jzBJ)6dFgmP9hM(z z`zTswEJwn@lLvX8qG$)F>C2f53fx&c6524wZN1tSuvXIbO}${? 
zDi8^Dm;UQrfbq`x64ckZoCOh%l2bkTqnn+?9$6FLvWof5a>CZ%Oww(yLZfdLRXEN66=Um z+hWVXB)ewqnc3W|nfIf}kwyQlR*02d+x)T)wXgHpwb&bA9`{SWi61X~G%?Ig-dHR+ z_dZI#txXu4=3>-{lGJNm#iyV}z4@#yf8-9Y;9ZN%gO8nvDi|0XtnPh1q8Wxfa`BA& z#?k4r2#f68@v|p#Lp=qiLz!=>gqR#PqaVzw&8Uqjo(lCMs|8tK{mEE9l#~15i5lSt zy5>O#nyz^!Q41TV>=L`{l5XpN20SzCn*Fu#9jd)8hmkQ%7+Q;!ehd`I*UqQ&Ei^%Q z!i?LFm0LR9TrQuZZgF+!+mp*r7BsCNFjJf$ml@lB#f0@^P%`s)U zLronFW99v4;6f&Zrtx^Tnw124gMrhM5|ZREkR{=u^ksx)OLy|61aCd`x<~9 zr!x~6j@jT(AA$*Lt%EAP_*P5qdA?xW-(p?*C zwR^y`yDg~2u1C0u{;}y=$WATUvZ%RWbd@vSO>%fcj;2OKG{ba~_fJ%uznE%v0dP}8 zA9k8C&(A!xPm~DE8-O6QxTX{n+YP7?E6O^u9%2jQO_AbbRb(UHqNqja9i;y5@NprQ zq!B+kP;oMV01^R#8zbmPimv5=;M;_=m+1qi8rkfcUP00HYv?o`Zb!r;(Z&Ci^wIFJDQ4(blT=Y8(u@2>9roSM5 zDAw56&;k?aeBRZ^kv|ozzz}sSEM0UMo%%)kJcPJ6-(AwjVgs?Kuvq#?#2z-H?*|7n z_Vut>_dPqhiq&RxSZzjU7|IA5$WVVR?%HYN>2rES6Gj~RXuRitmSKm*;?(I*3Uq(^ z7Q=V^0A5tVVv`jsbxJa`ZA|R9Ze50F3B= G1^gQ$Q;6pP literal 0 HcmV?d00001 From 5e4022cd8289baa46ac09036f91387f67d01f16a Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 12:46:47 -0400 Subject: [PATCH 35/91] Setting input test data for ray Signed-off-by: Constantin M Adam --- .../fdedup/ray/test-data/input/df1.parquet | Bin 0 -> 4111 bytes .../fdedup/ray/test-data/input/sample1.parquet | Bin 36563 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 transforms/universal/fdedup/ray/test-data/input/df1.parquet delete mode 100644 transforms/universal/fdedup/ray/test-data/input/sample1.parquet diff --git a/transforms/universal/fdedup/ray/test-data/input/df1.parquet b/transforms/universal/fdedup/ray/test-data/input/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2584725bbf8e96cb852fc2702f8014fe689c7116 GIT binary patch literal 4111 zcmaJ_2{@Ep`+sI^6N4Btp|LY4vJGjHjK&x#jD5{cb}=F3A`$5n2di6a=70 zidTVm3H~Y85E>99L13yzJ}F#^D*ADmuI_RCy|)!dA_WNf9rXh@nJqoNeXlTBn09pulc~MaCszeqR%*qCM}ZB{?MuDm;H z?K@$HPYB7Ej()n*E}5|0UgR8Aq)lOAtw1(@HZ_Wy`ucqdkIGDfuIoC!1)ZWV6V4D74nOQ2^`Yu 
zRk;JPSg?2%`5}4JdNzWXd42tu54OA7NR~DI@1n%YB+f8zr&}%QNkh=|N7~Xy2FrQ_ z|Jss_5iA`zOC6@nHmyX|thyFb%nk)!R-q@J+))aZxPDYYZH{vDb9J$PXZgpXic)<& ziGi1|vhX{&+>QA=>^c@4MV&R7CcM%plLb(+BBnyL!#wmTVM|$w%oplASXUxhD|vk6cr~TE``v1b(LGk~(q@`(kSJMxkLt{+wo|D{;z?ed^*@l@ zH}y5dT}b{-{eCVEm-}(9<@_g`F@#|=gDcjw`W^TXzw zSZipt`{sGp+UI9M!&Uwx#IJV@^hlwj<>O|B zfp;t(e|22-=U(u)sH=#~?jP{YRA?&eD!?@u>^NtV-h`2a2)4BOI8d@i7eVcbbj@n5ioLk*9s8-%KjN}tw?>haERm-h8gV?Gn+kHi!GiZq zjMJ}(Ijpp)fIxHE=5veqo-lY!lm9mj!_MvBZUfcdrWI{z> zWmSDs=W)Nvm?S6KH_g|19cCn_x#kD;Y{#FN4}M4eYmnT~0*+{SIGKTE7jTA(t4N?} z=me0Wwl>7+lP`}bfh+BfnBE;ZuMmq_m=d*m>;=;lXl1`HuAok{(+qao=G<9hPcvB9 z)inVTd?}c(jK>o-BXoXA56zxk+}C+6e(+UzuYRyeC94E5LQ|RH_ilC?OO6O^Fpp6W z_YG{3P#w@VD3qIRc8h_wzinzic~gWOZisSC&!uh4H^3syd-g}ulC-P_R&6ic(mi)>`c^eUZ?n`G`)#CoQN-rZgY!0y89tn(=Y@LKQtlqKe~M?5N{giBF}Z%NU$=Zx5xR)!*+07M7n^)4?he61>nlz&|mBZarW~ix%!d< zcZsb@1+guN$^kLB4F}hlFLoDZb(~vuwc${Gt(6e%I-g|a z(AE?39;oB=m-%7yyb44F3;`2hLpM;~rPu#htBlx<5D`tW)-<3$*-C2!a#W_?6>UE;?FkV$0SNCOPj`p_XHTryel#c7c; zrh6ABAs|4KBRW7xqJB*=bne^Wns4K{QkQ03^HcW(x3!@hb3^6cqhs?Ax5wQ33fdAE zKK*n{6<=-`>6}1g=P-5@nF&xpj71Q~-@}B(fdx1VeqNaR6EIU5ADNEwlAjJeql^dV zv`4!S2b{VR^}$6&nT_-9k>^h@(?V<=Kl%1Mp_fq-!{kUWVQ%%hD#2Y&W>4UqX~Pv4tDOPOBGsonNn9H^EZ* z?t^vT_=X(D_ewiXk1t)YyCJE^yDy66R!~oa+uwDjb89FuQPD^jzBJ)6dFgmP9hM(z z`zTswEJwn@lLvX8qG$)F>C2f53fx&c6524wZN1tSuvXIbO}${? 
zDi8^Dm;UQrfbq`x64ckZoCOh%l2bkTqnn+?9$6FLvWof5a>CZ%Oww(yLZfdLRXEN66=Um z+hWVXB)ewqnc3W|nfIf}kwyQlR*02d+x)T)wXgHpwb&bA9`{SWi61X~G%?Ig-dHR+ z_dZI#txXu4=3>-{lGJNm#iyV}z4@#yf8-9Y;9ZN%gO8nvDi|0XtnPh1q8Wxfa`BA& z#?k4r2#f68@v|p#Lp=qiLz!=>gqR#PqaVzw&8Uqjo(lCMs|8tK{mEE9l#~15i5lSt zy5>O#nyz^!Q41TV>=L`{l5XpN20SzCn*Fu#9jd)8hmkQ%7+Q;!ehd`I*UqQ&Ei^%Q z!i?LFm0LR9TrQuZZgF+!+mp*r7BsCNFjJf$ml@lB#f0@^P%`s)U zLronFW99v4;6f&Zrtx^Tnw124gMrhM5|ZREkR{=u^ksx)OLy|61aCd`x<~9 zr!x~6j@jT(AA$*Lt%EAP_*P5qdA?xW-(p?*C zwR^y`yDg~2u1C0u{;}y=$WATUvZ%RWbd@vSO>%fcj;2OKG{ba~_fJ%uznE%v0dP}8 zA9k8C&(A!xPm~DE8-O6QxTX{n+YP7?E6O^u9%2jQO_AbbRb(UHqNqja9i;y5@NprQ zq!B+kP;oMV01^R#8zbmPimv5=;M;_=m+1qi8rkfcUP00HYv?o`Zb!r;(Z&Ci^wIFJDQ4(blT=Y8(u@2>9roSM5 zDAw56&;k?aeBRZ^kv|ozzz}sSEM0UMo%%)kJcPJ6-(AwjVgs?Kuvq#?#2z-H?*|7n z_Vut>_dPqhiq&RxSZzjU7|IA5$WVVR?%HYN>2rES6Gj~RXuRitmSKm*;?(I*3Uq(^ z7Q=V^0A5tVVv`jsbxJa`ZA|R9Ze50F3B= G1^gQ$Q;6pP literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet b/transforms/universal/fdedup/ray/test-data/input/sample1.parquet deleted file mode 100644 index 58387d07daf4381a020444fc5dee676b1360ebb2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36563 zcmeHw3v?sZm0*=y?v~tkWA(IjWa1{}-tMBiY1^vO-%CiBq_QoyEUEReYz@#Ul}fg3 zNfwsmUm!~-Oag>W20{qQFf42c8!{QrOnwLEFca7r2q7V1b~0JQvCx8~h)Iq>UFZ>~h zH;J~-cGhX@s?(;T>`op1=&mi(cIPhBJ-dP+2_(6%lP`bmB+#6(y4;LQ%2*v)m&d`l zT7O+fbJZ`7m>2!J=P9~didql>`x;dH(#4VH3wsjn+CxFoC$JI>_Ng)_Ng zMOrEf`C)74MJL6@!tgRPWF3}@!x^C>43me)q3n!R{_!~ML-XauB1m(;$=o2#R3nT! 
zG)+(#atknojs`T#c^n&EGCCepN?*%ShkYi==-ZQ97^W zYHG|qyG&o`P%O_g)PAJYwR}jQ$efQN~zPUW8P`-+KZK?(qOS%60BCnE@{}1;C5nZ8QFBS8um<$>PFm} zEy*31)D@|-lB?t;tVOr&Hh-p*e)MhhOBSR>ZyoeC_2SV}rdy6Fdhz`uPyQIFoWv5Q z1S+P;nDSv6Q?Y8Pv{;s?wZ$s6A`~lBWswpMZ+rO<7mE_LvbdB%1wwJLGAAuji^cpJ zRbDJe)MA!`5~`9bNJG>N^c967wIr1!p(16Vk+dp_)e1C#M!sO!$HjT7u5o0XD3dJd zKbi%@c7g(PEiD=D@AZvyp)q!t=v?hynUjj_1B4z_(;yIH=jToi-PlVIp;P<}HAsa* zz8FPzuY!HEC#hH<MYE=XDqMETPRK4!X{ zu=i2D&?&)>dI_C_@45JJC>*Dff%teJL>jwcu;>xW@XuE=PM%7HIB$@r;$bSv$D;v$ zilOVcvfiWus-T?do*Rv}N_rD|0 ze?iCljWL2X#;l+k(3*+_6ETW4>>)4L1HTLp9N?#YJnttDbdUSuCwlY;r+X6-KNs(t z7U%Z4C3wdpX~!o7e*kHR+{B&l3N&LNEh=bKRvk zh0e};UO5>GPND^y2~fdF$`_uT9Qc!7&gbJJN8WC@gs`*5U8b?iy*x;d@Jj4Bf2TsA+7us(^Sr6^ z?z6u=_{hOUqhmS{iX`GxXfhlE1lR!O<;Mac&{bnU#`fWVrPeg)&oRf?n`K#U-#ZL#$UTL5S$K7Gl zFsyS12F^S`!npK-(3s)lh5^D&MtWo6MAT(?2Bt==neI1r&VI!NS>BpJq%cf+r7k$6$gz_QT#NAC8U* zADmn;-1SG_*GG8YS05pw#;-sANg__ZR~Jkej}x}g6U69k1mQQ`rnB?!xsA~K!Xa|O z2}RE1`}MjQ)%T~pV3j}yU;6WBiTK%n(Ze((|A4SQ?z-lt-V@Z=XpEd0xY>CXp>y-k zU!<4E8SsCLVQ@3I8}38~4FrzG&F^ zF~VU8B(jFUtrOiRC~xBN;{(?M%VK=I^Q8mPaFFr_2Cfy^|G1vunHO~~=KU;Tyv1aiK5#h&f+5ZzmcYG$oo320h1-;9b z{K+%NzTG`Tc_YW~?XrL^=Dgvk`FIx*I(E1ltagk~^ly%_55&XQ-pz~4NQCG~E5z#AjU<$?7?K>7oMI`%Qb z?U*NHrvG#x1`COA+5gO14gPh{&l1m(JmB}~BkvNevUdlIfWgs$u`GB6WS&!YS3xv$2i}u}T z;Qsf4qEyLchRlccV2on>PVS`>UHks+qC<%;Frm!jpXKfPKzv5{QwykkG8{c+d~o2& z?l`Oq`>BftytDKey^#EVZF_ZyA6T+#61ca%;NLC{;G_x?I^gOX!Cve%}Me&STDx5Ql|XAy*utqEdd}uKgA&SIjOh6%60d z$0Vs|sykegmO2j{E>loS6&9By_BhcC=u|7CR}sX}NP#?xh(o?sJQ~sP;eXg66VKQi z91-kXAty{;r_jV-j-XK)7t#PW43S%m)3YD zvpiP@#;Cvm1CcMPfl{5F$iuC%Qx-OGl#)k9T*YN|zBaVP3~n8Kh{BqNd~d92D$ZMe zOrE3fF1z*T(|bodf->^=k#n!|;6XQGhk>1VwX-iv6FKw^n+)*m?broPgo%AY6Sq+lHMiB>rd}mHxBCE!JxN*t zwGOFhfnTUr2dJJnqtazqBxb7+LBYCQF<(pn@@o32tC1!voi*VvnUY(S5Ap#od2VaW{}m6pD?wg$#%h`IhWU8l$Z0j4wcXvv)&=6gEm?+Wx&??Obq zzTq`Vrc8$wmiJ!X4JfE1pBmB3Zgst-;R!mc1@fX)M}Kd&n$MrfEEZtB#4`B{()WXLT<3I zn33}3K^!ASAA^E;O*L1}p_OzP(GRi0m12HT$bc;$t_60LzrZ57Zm6^d(X<7r_zsTq z&hTt7;~~tRk8w49u0YiAiPz8>7P 
zgJtvTCZ7H1x;oFga>b0adS=7$-qB&cyMzABwe*wkM(o;r+}8rD$%+xv6$&d3me5KL z0a)9h)ji0%DXU||YK4E7;ZF_q>TBEB(!Q#R{1?tb;$O*_*62r(e5@|NApn5@%d zh1|F3vS+iR&26{ZSkd7Y8Obg(Znu*`c(|;RB-m{(A(Isxo^(dCvI5{?7qTv!;I^?2 zd)jJu3K^#dEZO`4L5`9NQj{J{5 zK+?~=k3=#*xmBTT1LB$*bRp-7xA)tb^Zd=NDvj7xUCLuCAg4S({b7>+)-6~i-r`Bm zbMU#p|4|@Vak}?F$f+b0*D%q@=f3ZwB>jG9(a!5W2SA_uml#Z!$NIVjeqpMsw>S@eTqRc^vkIwinM}0j{N9};{kc1_WR>ok6YkTb z=~`vm8TVTN2ADl&(8><7t~jhwox6I6AZ3*5>9@cV#tbs}q88zSLpHclP2q3;D}~?>v^h_O+tm zr?2J@@oX(WFc~BkxsRg!z&p=Fen1a_ZdOhZa9_mvfrw^6d(->?N>@}NNIfS(*3nW6 zS_^-6d2`HKW2Rq-(O!trp4V)_&0kcl1A8+PY{GLT6xPCP?Q?zR2hH?-e@4>Zy&EqE z7xeLhK5nl*YPt#!lBV}4o>0{dHASv1OEIQ3THhNo0S?t?fd zq*~G8SM(R#&u@|D?~?Q@Pm=VfpTvY}jHwAnleBZ#LOt|#z)abz>ETExI*}@8fzf8| z)vrO6>T7^en-s}8K)U4{DoA?@b7gT+wNZQKY0&?t+XHG7(sRKsiL$yz?Cuep{k1}R z8jY0*n?_ZW2ygy-l?ZzvqAG%-NSUUn-uJ#q(l>l_LsenNrZMtnKsDugvFQ>;ReP)` z+gOE&Y8KAQWHdC?uu*BLe{CY>zHh0B*}WvqD(m>&r1^f5e&`=a`ZxazQ&As6J(?z# zaA``xa_`YS01@;Yos8GOaZjzvqI_w>?}cxx@awE=%59|i6C{28w@Lb`f5hl1{p$m1 zqIM2y`rNZBHPuyw+4^L-`&obu)!Q%`?te}NWj8*RQit=??~wG@ztbL$W|Y@$XEH7) zBifx}MzVXDw8v>@A#y4TtV4wRO?HUOvKfK3!;L4qB)P11!R~N-SclW;_IT1x6y=rD z*|dvggtQ&Pyl_Fu20>p>+RZxM8IchLcP8Uxt?1@bT4EiN!($hroy{f)lFbHpuv{lW zj4qGpv8U5^M>;Fn8~}!e&@&@AoPy+XWStJXBWr~-wWc}bplK$+Am;tRrX)?iL>R05_eFEJ(RW77yV5e8=EF0gN@O%GDD*Se5BY1lWg+h*wDCfacrn+qUph(sc7m(KI0h?PG#g_ zRYbRIuPm#JGzp{nsR^@si+(5&31?g5pD8>nz#BVR4ZF$lO${W3}a`WK+izaZh-aqEz6 z0BOeke>`sy=I_ly?DI8)u(nY_w0jC>; zuC@*-4P?&YV|ePK2GgmoXtpa#9hwS_^N%oh_4W6?dg;!*@POWtR}cIi+*@1rR+8WJ7V_Ejo0d$bw?FesZ)>uaexvzU`i-Yl>EDxV9=(le zIQore*63|chSA%dyrSQ%%n|)2rfujQOrMy)^%c_GMeEJa+(RB9VAE#V4Oc)iLfP`o zBkw*WUk8*EH|0Yk)E~b45WF2R1K$oUp|>I)Izb&mmj$7q2p7lU30_dLRDnk;{=Qg( zuMYfGLSLEdu7U@7Ja}y<+%=SH!#0kR%}MzTzJ^#ISw`y&x-$ZWhpNRKx&e$as1{0V z8x@P`c{sL*$_r9OKyh_w{_aCk5t_qIZqzpsA02cbx=w{mBNm71@5miu&`1sVwA40X z2I(%Ry%n#GVij75Y1f?q>}uqzhTaS;s@q1O%3DGqN!JRssj>;|xk;ggTxIa~U^PhH zB20XmuNB}%@%B=vq=Z@ls%y3vrjmkeg{iLDUYHbKQf zfj8Ej54b3K-VUx(x6N=h?6YjMw-*ki{aXpE6>cx2+KtmzXsU`Ph^n~)uiH~C{Hlu0 
zCQ;4qVn3o~X>Y3BcF_3ZsTQ>x6-^+wI!Ey(&{lx*Y(Zlid6p`nBOt-zGhCRkh59r^se1~~M#NsPue z16166eBitlZcB9&_$_Kq?R^R&jW15)0Z^-PvEk&@ft9>!PYUrQKBXI{}UFMaPTIdOD_pa zYbbV8S`(I*7FQ4-6di_A2HkkD9)ehk!h2OEJH!mx4qZoBx;c22d2&QPDQtOkY#9Bx zX9bSqPQx$yImK~y&O6JEa@;KEo5f{bZ$teGDuj3QEA^-2@-uwmQE+{L<5u7u-5mVI zgTPcTXu0e5T^ za35WtwnZJ{7{ZCmWs;5(jK7eI1rTh&ky(+ZJXUGii^mNVJUO8-HJ|a}`bug#ItOD| zXJUZwcsZ9!I_A@Cw3IH0c)Y2p=vXj5o69!gF0MiQ(F*WS!HdgHB^?<5lpkpI*;i$L zHRIo=UC|3X20k)sT=Uh8f5uwlJCmOFI@Gj+e5|4Q(>cV`baKi%lZ?(~#`q1{WMeC{ zAjb`!2J%zYbYZlTN}dMUaJXNpkk6;bqFG#yc;K6uN@JR)7LYtK?47`^5M7?JO#yBW zE9k{SJKm+nJo6yae7YFT3Q3uEal7?k4Bl=ww(1G`(EO;&w+>r|tHHa4G#g?-7S)=5 z)#h6$n;FlqfKDR*GU-Ad>6kK~TI8B$C$32)Lp8ZG`OMfT%-QJN#=ID}mXWTiWM&nn zqve{;7lhDDRcp+L`mRZovzQrZdZ8p}kKpuv3l<>sd=};?t*)A2EHJh5w_h%^tnU z|9U-p+XAyFLJR}N0otsm6=6IypSD39U}x>#idgW>!#rJ2C8PDVT*Ll};}gp>?9{r* zPA!0pD?2<-)>6sLX11oGey8ljlJFGHv&?uvafaY^WVRP5TCj zGc?6?n&w7he5y@cA~WV$f&Kd?IH~Hje5LcX-q(B*h-s}0WBgh>Kdv4J2xUcfZf0Dc!S2;*-*bzwgUS_5L*X1X37wEZ6{03c3LvsQhd9O{lGri z6vTCu_{7*$+x=hLw&zREQxNO)O@NO(1$yUMLorZ@X_tlM1n?C05;#BMb-?$vv`xhP zRn1q3w+2=xWA>HcJeNc9a8O^MNX~TMrbi+yqLcjGi8c&q*(QXHo9t)a)4K zOWcs7f&T>+p1mvZ>@RyE_p;0-AmefxayQ-pdLBj3;nR?-@mV1k!_A`v%@}&-@H0-> zgiiaRei70&oWwyshDXoP5uWWReRFEXhv1e_zOUmG#shn8}Tdn`nCCk^$Ghv7?Z$~Tjpng8;>l@`ccyd z%oMdeP}iF8jrwL$du!aaxjb;bQt>%qKA`rVa*e;<&G^&eMEDJwXwGx# zff-}1+*uTe5X)<)V$q%Mf@3?Dx^?ebS%%pJ_W@? 
zpwQ<={pXtcVqBFYMf2dRuL;v$*yDJrTo~bd*-A~n6nP<`w$7ii5MKYF>NMgP z+XSygpD?`8GOUGGvADp6BQ~JV@2Tli8~KzGjfYy6e1Bw}jq2jM$DwU*|5c0}LoYqdw9$MOqcPejzd)YQI(|8jdd zk1xlRVSst0>X@JKpzt3VIW_;`+yTd-JU`elIR^}Aha#Zr!|Mt^+vqEz-COq=a8APk zewWClQ2({{8h>i~Dj#a7&lfM!ujc;IyecOG^*#~F2akvKcM>zTdA-3?X{?^tfsUXu zv*$ZUXTGuYYak57v*y+|{h#jo_~$dN6!>y=&n=oW0ZiZ2p+U7uxp+EUQBD{Y zT!Y%%{P3|Ee=xjyejAVEPe7;PViUX;`5;_iJyz%U(RvN*8rFyPjs5|QWD9>0=+TC= z3mg|j{6+Y3m>e`1%sT)*1FjNssPNzl(Ozy2m2k*Ns`VeYYd*Ju+93S_hYZFO!8j;C v^2{bIR7qI6rYcni#ZqYy9}J>u2Y)E}ND#c~4BqRv^Z$rDBLorQqRjsXsl2cq From c14bdaa471f2338bbf88390f6c0d94176ea792b8 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 12:48:43 -0400 Subject: [PATCH 36/91] Bug fix Signed-off-by: Constantin M Adam --- .../fdedup/spark/src/cluster_analysis_transform_spark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py index 5522d67de..feeb3241e 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -45,8 +45,8 @@ def get_folders(self, data_access: DataAccess) -> list[str]: :param data_access - data access object :return: list of folder paths """ - bands = self.params["num_bands"] - segments = self.params["num_segments"] + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] return folders From 1215ac5ab9f1c8c04e55252bc25aee305707d620 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 12:49:15 -0400 Subject: [PATCH 37/91] Ray orchestration for fuzzy dedup Signed-off-by: Constantin M Adam --- .../ray/src/cluster_analysis_transform_ray.py | 48 ++++++++++++--- .../ray/src/data_cleaning_transform_ray.py | 10 +++- 
.../fdedup/ray/src/fuzzy_dedup_ray.py | 60 +++++++++++++++++++ 3 files changed, 107 insertions(+), 11 deletions(-) create mode 100644 transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py index 970686e13..a0e8e7de2 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py @@ -10,9 +10,19 @@ # limitations under the License. ################################################################################ -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess from data_processing.utils import CLIArgumentProvider, get_logger -from data_processing_ray.runtime.ray.runtime_configuration import ( +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, RayTransformRuntimeConfiguration, ) @@ -20,11 +30,31 @@ logger = get_logger(__name__) +class ClusterAnalysisRayRuntime(DefaultRayTransformRuntime): + """ + Cluster analysis runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class 
ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration): """ - Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. - NOOP does not use a RayRuntime class so the superclass only needs the base - python-only configuration. + Implements the RayTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the RayTransformLauncher. """ def __init__(self): @@ -32,11 +62,13 @@ def __init__(self): Initialization :param base_configuration - base configuration class """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisRayRuntime, + ) if __name__ == "__main__": - # launcher = NOOPRayLauncher() launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) - logger.info("Launching transform") + logger.info("Launching fuzzy dedup cluster analysis ray transform") launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index 831a6c9c2..e83960c24 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -10,6 +10,7 @@ # limitations under the License. 
################################################################################ +import os from typing import Any import ray @@ -88,8 +89,11 @@ def get_transform_config( :param files - list of files to remove :return: dictionary of filter init params """ - duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) data_access = data_access_factory.create_data_access() + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", duplicate_list_location) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") duplicate_list, retries = data_access.get_file(duplicate_list_location) @@ -117,6 +121,6 @@ def __init__(self): if __name__ == "__main__": # launcher = NOOPRayLauncher() - launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) - logger.info("Launching transform") + launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) + logger.info("Launching transform") launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py new file mode 100644 index 000000000..0b9be33ca --- /dev/null +++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py @@ -0,0 +1,60 @@ +import argparse +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + 
GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + + +ray_worker_options = {"num_cpus": 0.8} +ray_params = { + # where to run + "run_locally": True, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(ray_worker_options), + "runtime_num_workers": 3, +} + +ray_params_argv = ParamsUtils.dict_to_req(ray_params) + + +class RayServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params if service_short_name == "fdlist" else ray_params_argv + params[1:] + if service_short_name == "minhash": + launcher = RayTransformLauncher(runtime_config=SignatureCalculationRayTransformConfiguration()) + elif service_short_name == "cluster": + launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) + status = launcher.launch() + return status + + +if __name__ == "__main__": + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = RayServiceOrchestrator(global_params=args) + # Launch ray fuzzy dedup execution + orchestrator.orchestrate() From caf79a30b1c24892e1262009d57b29a271993c73 Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 18 Oct 2024 09:41:01 -0400 Subject: [PATCH 38/91] Added python test with expected data files Signed-off-by: nelson --- .../docs_to_remove/band_0_segment_0.parquet | 
Bin 0 -> 1497 bytes .../docs_to_remove/band_0_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_10_segment_0.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_10_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_11_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_11_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_1.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_13_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_13_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_1_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_1_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_2_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_3_segment_0.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_3_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_5_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_5_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_7_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_7_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_8_segment_0.parquet | Bin 0 -> 1510 bytes .../docs_to_remove/band_8_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_9_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_9_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/metadata.json | 58 ++++++++++++ .../data_cleaning/cleaned/df1.parquet | Bin 0 -> 14986 bytes .../data_cleaning/cleaned/metadata.json | 59 +++++++++++++ 
.../bands/band=0/segment=0/df1.parquet | Bin 0 -> 2753 bytes .../bands/band=0/segment=1/df1.parquet | Bin 0 -> 3122 bytes .../bands/band=1/segment=0/df1.parquet | Bin 0 -> 2862 bytes .../bands/band=1/segment=1/df1.parquet | Bin 0 -> 2537 bytes .../bands/band=10/segment=0/df1.parquet | Bin 0 -> 3305 bytes .../bands/band=10/segment=1/df1.parquet | Bin 0 -> 2537 bytes .../bands/band=11/segment=0/df1.parquet | Bin 0 -> 3450 bytes .../bands/band=11/segment=1/df1.parquet | Bin 0 -> 1354 bytes .../bands/band=12/segment=0/df1.parquet | Bin 0 -> 1354 bytes .../bands/band=12/segment=1/df1.parquet | Bin 0 -> 3442 bytes .../bands/band=13/segment=0/df1.parquet | Bin 0 -> 2537 bytes .../bands/band=13/segment=1/df1.parquet | Bin 0 -> 3413 bytes .../bands/band=2/segment=0/df1.parquet | Bin 0 -> 3177 bytes .../bands/band=2/segment=1/df1.parquet | Bin 0 -> 2758 bytes .../bands/band=3/segment=0/df1.parquet | Bin 0 -> 2745 bytes .../bands/band=3/segment=1/df1.parquet | Bin 0 -> 3122 bytes .../bands/band=4/segment=0/df1.parquet | Bin 0 -> 2537 bytes .../bands/band=4/segment=1/df1.parquet | Bin 0 -> 3413 bytes .../bands/band=5/segment=0/df1.parquet | Bin 0 -> 2753 bytes .../bands/band=5/segment=1/df1.parquet | Bin 0 -> 3122 bytes .../bands/band=6/segment=0/df1.parquet | Bin 0 -> 1354 bytes .../bands/band=6/segment=1/df1.parquet | Bin 0 -> 3450 bytes .../bands/band=7/segment=0/df1.parquet | Bin 0 -> 2667 bytes .../bands/band=7/segment=1/df1.parquet | Bin 0 -> 3289 bytes .../bands/band=8/segment=0/df1.parquet | Bin 0 -> 2845 bytes .../bands/band=8/segment=1/df1.parquet | Bin 0 -> 2537 bytes .../bands/band=9/segment=0/df1.parquet | Bin 0 -> 2537 bytes .../bands/band=9/segment=1/df1.parquet | Bin 0 -> 3314 bytes .../expected/signature_calc/metadata.json | 62 +++++++++++++ .../test_cluster_analysis_transform_python.py | 46 ++++++++++ .../test_data_cleaning_transform_python.py | 49 +++++++++++ .../test_signature_calc_transform_python.py | 83 ++++++++++++++++++ 63 files changed, 357 
insertions(+) create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet create mode 100644 
transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/df1.parquet create mode 100644 
transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet create mode 100644 
transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json create mode 100644 transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py create mode 100644 transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py create mode 100644 transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py diff --git 
a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{AT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f5da05a106242414df178b29c0ac05f21de73c73 GIT binary patch literal 1505 zcmcgsL2nXK5T0F@?KWu)X?&aAgaa|U>A|*aNsX9Z9<<${SOFtdE(DeZY6^=Bh4kp3 z@E3S6-c9-oj6cGI2T#Ty;Kjt5x2#m42QN%u=FRusoB2NGg%YpWOkoLjr^sXmn*dv% zc7LZd4xm724A>^SdtaPVnVK}TXlJ(4VQUGB_eHh!0)UK(B1sBZ-=!o|%zaQ3rm@~W z3x=b5%Xis^uIN{i{#CM$E$x4rm_!Yj4CoRV05DIny&}Pppu}5%c&G=|X&!&yT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{AT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f5da05a106242414df178b29c0ac05f21de73c73 GIT binary patch literal 1505 zcmcgsL2nXK5T0F@?KWu)X?&aAgaa|U>A|*aNsX9Z9<<${SOFtdE(DeZY6^=Bh4kp3 
z@E3S6-c9-oj6cGI2T#Ty;Kjt5x2#m42QN%u=FRusoB2NGg%YpWOkoLjr^sXmn*dv% zc7LZd4xm724A>^SdtaPVnVK}TXlJ(4VQUGB_eHh!0)UK(B1sBZ-=!o|%zaQ3rm@~W z3x=b5%Xis^uIN{i{#CM$E$x4rm_!Yj4CoRV05DIny&}Pppu}5%c&G=|X&!&yT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a811ad8780065241630c479f45bc6aac12b04e09 GIT binary patch literal 1497 zcmcgsUvJV-6hAGjcFAJM;%(Z52V%6@!~W2bnPK*FU|WOE2^g93LZB4bGAI@X*%$B= z_~1t{zH9s}#s?pK@JpC@PD=$hNc4r1_WZr~oZqkarbR4UA}$i*{)vzUtOIO(+WD1I z5kP@Z1c-Ir%5jXZgNf`QS>)am< zYfaA)YnrTmk+g4;d2FixGsM^=0w81{l^_GKfKV*t!|QGkiYmGqc}ytsr`h?0-mXju z-6bl6+R&@{Zr2;RGpv5b%#ShRb-U|=S^vySTF5J^=xXHgtjM2c=kxlkPI-yHdDSl6 zUVGS?q4)ld`IYa+?dA(c&ZB3X?18QfoQ~VA3x&?{p*(X%!uZA)4wLiYfhHSC<6sFl z&Rp6ZFqeHQ+K0F%o2vG1P5-x?yFnt7q1z z*{!r1cIB#?DF^F2#Q-0zBse73QCunncmoiBpd%hKUb{ScojGka?QF~GR)&pC^^)-I zPw*fHNNH29#C&W-2M@V9fjeT$GU@*$3^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{AT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{AT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e303e5ea14abd679e479c2fe54ba994402d60350 GIT binary patch literal 1505 zcmcgsL2nXK5T0EYx=k8G8sBC&;XsUTdax~9QX{692W>YfR=`MgF9enaY6^=Bh4kp3 z@E3S6-c9-oj6cGI2T#Ty;Kjt5x2#m42QN%^XWq=4H}icvFI(nSizzJ0?v$9!U=v{L z)9&xI$^jGzg#p`Sckhd=1WTlJHQJf2G}u}a#q*-tdJ#ZIN0B52%zS%=xVg0UeS6M#l!QwaA2R+887xPull*y?~J5a zK>UTy`61!8tK(PMlXlC_wcTEA)XX~Pi0|PP59b0Y9?DmUkIlKu52-kUJ0eRmt^YW5 zgeFy>uC_C9`zM)~d)Q|}(GHzQC@$5TN{AQq^q>*Ex$s8Y?qIOp2m*hS9(%#?%T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a053b47e2e4eeecbc7b1d7675075940736665f6 GIT binary patch literal 1497 zcmcgsUvJV-6hAFcyJRtB@iuM312Nj{VSnhzjF`O~*w&zP0!F615GVy^2F1c4`vQIf zAN&Z$ca5LL_~3&NehCxLX{q1_iN0{sp1=2=^ZWJQw1gE)#6?ovJrS~i4S>y0yT8&Z z0wf?50b)bk+oEz(C_+i;YE+qd8f-4e(*B^^S`k2|K$avK%pX!p#EpGWl7_z4KKF;C zddqXfx+ZI1B<-7I9-Hd_3^BHd00$LeRy{1_u%H@hyG_0PPdrM#kwu0|fuiu_r2F|W_+jFAJm!Q0N@*%X3#GjIVv+Fu52WXtI$q4pwmE z%$3~%bJeG!eTZwascP>McL0o(zK)g3v$IFxA?e@cU6u*v5wT1!%M(OObnqClgN06z z%hQ6V>=h7;Y$=W`MO-Sp;QVt|L90TMTT_t7Zjkg?^Ps6z+1BjbYsboEnpWcNfL7-U`c&c3+IQ`Si%RQPi;cSQZ6BLL0O(nz& udV0|C-(0$*ZD%mpZuq`8NsnEBc<%Mn)`J~uCu1C>0Kx~3VgM!jss9FCMDwr! 
literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{AT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..43cda6a0d1f01c2bf370612d58cded5f1e901344 GIT binary patch literal 1497 zcmcgsUvJV-6hAGjcFAJM;%(Z52V%6@!~W2bnPK*FU|WOE2^g93LZB4bGAI@X*%$B= z_~1t{zH9s}#s?pK@JpC@PD=$hNc4r1_WZr~oZqkarbR4UA}$i*{)vzUtOIO(+WD1I z5g-ns2oUSy!6uaxLJ>+*SEI_*(_mvxmi7nTRto?!1+pZ`VE&MjB5v$~k}&ku*110z z)|#Fp)-+lBB5B_w^Vn4XXNa*$1VG3@DnSNd0U;&i!|QGkiYmGqc}ytsr`h?0-mXju z-6bl6+R&@{Zr2;RGpv5b%#ShRb-U|=S^vySTF5J^=xXHgtjM2c=kxlkPI-yHdDSl6 zUVGS?q4)ld`IYa+?dA(c&ZB3X?18QfoQ~VA3x&?{p*(X%!uZA)4wLiYfhHSC<6sFl z&Rp6ZFqeHQ+K0F%o2vG1P5-x?yFnt7q1z z*{!r1cIB#?DF^F2#Q-0zBse73QCunncmoiBpd%hKUb{ScojGka?QF~GR)&pC^^)-I zPw*fHNNH29#C&W-2M@V9fjeT$GU@*$3T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a4ad5fbf82bf959f34c9a25974d34cb91aeff037 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{AUQX@?-18p~Gt$>j#7k(@Y)D#vM3hB*1 z#fujY9!yO98$5Y79(wRkFy2gjGs_lm@!*9QcyHe~Z{B?0zJUTOnKVk{^zI3jDXal( ze%k$=R2V=Me4{|u=*AW$SVbsFXiBg$eNtg_0gC%Yx0O7Aj0uq>0>%$1L8JN}khrd` zw9dW3u-bHOx~dZOv!s5PjAKLjpF{L56aZ8Pba4y-n5XFe39`Krq`d>EMIZ*m7lnLk z*HTTwJ?qq%6&Dm}N@P;X#f4E^6yk0d195U>1B|z^sk%F;Yfv4!RnO_VBS%R5CaJ$9 z<4x3fA3>4fw*NS6p~yv33rL5j7Ocz*3!{Os`Kf(Xrn>07uIh!;YY#g!6a09~_{lFy zxH*>>Mq!aW{bUa`GO#;Nw@wKT^`1O)U;O;9H|&Fp>48f0gnqDu>$ki+U@Uto>X)cW 
z3`PAATL;h++A1rRo}TUdL(;D0U2bERWsDW^%k42^2(;NTV{Mje3oMHTPxw=gu{@8M z2N5&AlzYzcXWWTaxjb);NtSg4=yJ~wnpl~~ij{d~o0(L@%v@E@%u%yjZZ)j(WhGq_ z^__y?hm}YN?JCHNdBInZxS`J2A?ItAMlaK+t)`V}+1>K6k*-`IzlRe(j0vW=DO==v zEXHhh$eSa%BObX;`akj$;knE4Q>{wh?wzJy+{bgq9BuRcNX6#;Cg$@6E!nSouP>e9 mw%zY<*FDc2Cr6GqICpzV^TCd}lhO|o0R9J#@TXS5fA~LmVfT;# literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A@pGgY7zRwa_Si{MXNcoT`>dcsJ_ z2Go3=qxSoo<3(RgTkanlKYcEL@Vm{*$G#~a|3J=&M&(;&-QKB~dwlc&?~28qnj1&z z->%C$@4m}thZni8ZTR$W-|aeUn!;BS?>5{m-EFAb?0(OyR-Q=xqt|c!rk`CEdA+*b z=c(QW`9B4=uPq$<_Nr)Oc5v*O_0y(YTD`|>j%M~-Q+|<6?y}^TEg^&5$A8un;P z>S$*EA!Dp9*&e-p^W5LKy#?m&=<*xkV|N@KJ58JXbZcFg+ZU>iytOSn=eN2wTbJo> zJ^l3P=T{@9_FTVnK~<-HyW%>Z@CulEo9jRSao>UW3!lwczpJ;!oLJM!$H%9?_Q1wX zB@?<1Ui+|PiS|;L)Qe+pK6uQR&aV2!^p~n->o!$eH`kS`cGa|--nvs?(~gD}6Iyt! z>Df9nF=LE+_>S;ad2@DFo7>zNHg)Mhx20Z_-Ig?LYMn_ft_kb>*H- znWI{g*Qwh(V>4GgvXx|C9I|_DXH}Wc;g5WLypvJd*i$Rk(@$k;-=&l7S87spUk&=I zero2SsKkgkS(N{cJ;sB~wtjmaE*UZAN6sxTXJdVLS>_1a-afWx>{6 zmp4@F=O|Z29<4flZzk@VtNh(6!3}_ZLs*7meNZu0d9y zzF6KNrds;G(N?uGhpzm~vLNZS`4bnk$~63G=d{bUz17C-!NVSQ zON+ZN?R?;mkn^RxQ*WK`evk04np3R)q^g8Wn;>eRxq!XJM%|n-;`Z8_6tA*5$0i<{ z)IHUJsw@3J4`hqqhn0a?(j6tsl36meMf?K>LtTR z9SiF)Jb6mrS>+w(nYLH+_PdcGtLWI-6Y9{^_wRi_?qt`Pb;oMYmTYCx>OJ}d4~UBz zdGX!~?$0HD`=`7Snsapa{09b`W#8cg``b@%A?L3&B<`zoKh!Qgt$N^n&COeH4SurY z_Y~CvLo%Is+7v+VElD346rC)wo$fiQlW)>QFVUzuJv8$pN3JmjpNX4WQ#)y}^x5Ia z^>x#qFVqa|S+)8~m8NdWhl5t_mM?4l=AC}4DlVKWioF`>-)q(#TUhelqLKqs@^^S< zZcV(h``m}Ih2prG^@YB%7kbRvTHsMXr`zWjrQi3CBHdi zOlQ3}y`TR$sgy^5UdUQ~z;y?27^J~{Jqsyih9e+OU)hAoG zAbRGxBOdQoYqqzVEPuQ27hQK|x~oo4Y}ujex5?@owvdVIrnG)f5j5DZ_SlFq@Auo@ zvd4_o-`{Vzx!K;cMVF$S0mX~vJuA7iD6C?^)dg#_H(uDhFs!U$?fr%7f$g+&DgrA4 z2b|#*YIQkryu`zAl(l}zeD9tK5h2-G+0WFnEG`NMRNHGJr!=r{3jkJO8XiZ!E_4*F^3Uixe*uIVRhBX=TY-glDF4qoVwUsKjEJV1)*1mFx>d5S?O`6i1L6FGb(=A 
zJpV8otIo?>uF#bHFn&bcgTC>t=XgYiSVHBUvPbWOYdKaU9CZ5!1iKyNl;koVn?UU6RKP=ChdBQp|e}sHldvmf$-o+UgQg`f0fA>W6-fhLDAs=~&C${{hLX#>w z@IWTR+QQz(58`Zs1pFciY(>80|>D2BmrYHJu z7&z*~vamrZy2U@)5SiRD-k^aPmN2+x)U$-ISXDDnB-&rJj znAPId@+L~uqb4wyv7mIq#-jeDm0)=@Y2lHEXIK(tT1g9sa!eGfM`?_X;3?=zL2pWD zqKt$^oRy6-NIPezaAzIQXh{|+!=)%(p^O$Qdq>JjNrIIzv>s)XJVDWLTV>Ii7?wkM z45OzBiyo$?X*AHr@hH>AbCjM$$t-TqCajd6ixtBJFD6f5EIehgksMga5f@^6f&;JMzEyJz%aaX7>YwY z+)rT4V0&qIe1*MDzsB2gajJ!}TabZa5ygX1;4U6KMOlo9BWY46goiO8t>8ZkVI~oF zBxfac6hZS~trY@H(o)2k7?P7BLZ>4+j$tYAC+=ya3_LVrgB*)AkOXgI;jWcs^fnz# zYG4^NqDexSRhWoFoK0s!1cwqKz@!BwQJjfnr6`xOkfj7H27iEIlUR~OStMhnNtha@ z14rpu0%>#%&qIrufrXgC3y1*=SWf;TBH$&0<``rLc+jOXmNde+5RgV!a5gK+Qs5UI z;D|8;%s7%QrgSh7Wzo?#!CC@imnUI4yv%@Y(U}O=2yZZ!;0wlVhM)qzjEE&+-4>Xrh!HKvBUVzZdnJ7n#b4pGw&cWD#SYp`VzPcvV@(j}2DBy$53NXcv zHiNkq-eyJ+IDBm&%oHpU0uAeH1`BK|4^5Uvv>RE*&YK`m0uM@?dCQ4bj1dFH0NJEp(V_OU_@piO_&%nAxA2`p2AQDT>#f(pE%th zOm4;0M&L)hN?|^y@vmQGIv|yEs8$jdns7z9QZ5e3qri38Sd!p?7Xl9`Ek#qn5*Qm# zN@I_+FNqNX_BgC1OLU~wVI)MCZbm+!mH-DS0?;)A1tLtcAb=J@Tp8h;z=)SM5Q7}T zZ{Z1!2U%+h6$ZIYSaKtdgb3iV8BlEp+ncGfGz4)3rAGrPGb)CqV{MlLgGnr|pzQ#( zMyZpdkR(j#7eK_)U18@Ry&pqhyhu`gFZB_ zk}()ySc?%z^^(8=*MY{+k8_G0Wdw$RX8jK-XpC#~?BWP0cCes3I9oX#Z6@U*023G# zh5|285M#kCJ4x$+vP~|OqQWLs1~*bHg4Ae|fzS(Jdey$>z-iJb;1rGkA*9AeApmmG zrAQCHq5ujHGiSTmRm5E0~FP5f))YtarsdU8V*=y z@j@*Q-E5>DWMN2iS_%UFQf4ySI3T>R9Km{AU_bzjLLRq5cLs6_nDkPFoHpA5nV3+J zL$Ie1X|~c#DdZQ7cV(mk0?09t|3Rc<9XuySL&gW4b0i!y01HOYg4H&fmhluI|Ak)! 
z+JP=>&Aw zv=mb;Bgmi}rGR`1e8vHH2n1$ZIwTS(t-yQ@W-@%1A|tip2s~06J$PG6CH&bzorZUZ}Hhy#{eYjD?1x3Tg>R>d=IB0i@mxN-qa>l8Y0- zwLFwxNMnXM#X9F@JWE<_U=lxwb)yfr2)%x<^K ztt@aysPl3EFnj@37}S9r$%VnpfDIcf(>HA(WJ3KY!@S^RLP_hOFib3?pto}aA&XEb zq9gh!`$ox4yqOjcwdo2;7IqPk+AJuIg4_lUV5~x|8-uzJ;DHtdVkxe$6r*c26usR8 zerzGIBN>V<$dhT5ffT0`B`D2igy^xPIG?dWL@nKM^#be!U;)~-4AdtCa(+o@gO!mm zFW^53QaWQtCKAeKh!~)Z(>Ze5Xz(QXh~fYd8O}lmVGheKdMr)@m!a-=9H z7Xh}R0C8jkJd(x61O31ej#SA*etuyqmRqb}R+1l^$y-Z z`p1OD`~;BOF0r_XTt-1T4(%C@E^-)#eefDz%U*Ui{-VR&bOJymED4eWm@l+S z5u|C5MJVuvWD0S`&e+Reh{wObQ~r;TE&f+ODtx62lK47* zRPe@CA<&f*eov^Xa`V4DGW64~q31k;T-$qxX1TWa0;pZvd)M~fwY`T_?+||1_8uJJ z+TOdi_fQ78w)gmvk!ySJ+TJ%k_<;wAuI;^Rdk+t-T-*DmJ-=&v|Khi3c(Ty+L->EV zy?@<7zQDLhQGgexGm&OO;t41Fy1Bn{a8M-ig|?=Be>YKD5IiXG7Y7CfO8=kE8-}`` zK)4PVxDFV+{4wrPC<7=)12Cf4J!r97K&eP#1kO@s^{#*wPTn7wX z2Mk;X4Dg{j*8zj3hYhX+2KaQ9=6`n-|Nra>g#RbY-TcL&-u@Sf4kA%tP=K#0J3BKk z#_>cVpv*lCzxo(e@EU~wz$;ayva8faRgy|&R3!+vRBE+Km5y&Fs8j>+J6t!u!xjFh zesP`A9|r9UZHa>u;V=FYyo8U!Cm0v{kJg4qDu(7oDs*nw2GM&{bBjBi^9l=2(;XeQ1x(U`!70?k? 
z0WAOsj>Tl;5n_158%QUx+BD!Jv|@0b&1J7`w0Ro^tl>2sNV5|e@(ikndD14(Tm%ec zh#(UH;f+F)g@mUmiYx=c$ixx>CC)!lZM7iY%~MnbB%+GBt)n5i-@i+QA#qCX7&CET z=w$CCA0K9S*S9n~T)tYn&1Va1B8w(_ZN1ys((F>o`5b93Aj3w+a0cuCMaTyhxl<+V zQ(1&(uu|uG_F5FqSyKd_J=5_1#=Y#SU4O!Fd&Z)-6F2r#4J(8lLwB&VqPId^jh-#u zb`5pqpN8`pYuV1)v;9UTpJATO9|w%1A^oy9Aw2sQd|kT-7?Y843zI2=*Tj9EY(#6S(;;|6 z6mao1ctqwtuKsv6SI{HEx_uRBT>=LqKZ)jco%{>EHt-~q@Y-|Cg7dj>&{K}P3v&^< z>E}72i%AeVxVw*1Id+ky2Sn%+jtwUh^XG>M4d#AnC+iN5Rh0w649oFBs^E}*p<(^Q zBL+l9sRhkIQ5!u-H#lZU?9gGu^aV6mm6e3R?Q9d za5C%lWy+v#nMZuL3UWkS`}}bW%rd}}oD5G1a>7F5U=eY!xPFNvOopYKiAl+=-Hi0EDtH<}CwO(i09W{vn zE~H_xZr)d zDS1TT{=eMVM|P7myQ4I`L8a*&pa#(D#89cUbZX@U|HTqJ*knp+l39|Z67r=~C`ql7 zxk1X2qy{OgLD@4Cg(-BEB}w^oCVXHfoU-&%57dz~b^hq&o{$e!6}t4woStBTy!RZ8h3UWvcH zf@yv8ADjX4#%w;iCd6-$L~jrPe_Fl6z&#?P)x zZ6iGyOO;5ITWSAEivB|(duGLwWKOCha;s!+^`DcHEVUv!lF~`Pky508Un$+%oYO+$ zD0NbM;-8q(qw#Bk7%SU6b6Xs~#Clmyw0S)4*4TLtkGIL~icLvRN=Y(mEjoZZALt?f K{))+;*S`VMIFrHv literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dd4f930793d5e38efb9536223e4aa1ec1aefe431 GIT binary patch literal 3122 zcmc&%c~nzZ8vot*USd!X1YSrmDzZdWkY$7_Rq6xT5fQ5hu3-^Dk{}6!wHB9BQK?p{ z6|JJyRt1Gx#EuKp759Cm&dhOY$HC>)>e18oXb0!sWMZZ2A9K!}>35R5efxKBe&0#5 zloBKgqKD|6NGCW0PD5t3$9EcZvDe29#~XK4EIfHdgy<|P{V3l_l#6mrD0~$*o=Rbp z24LsG(NQ3T=d?SBMikT`kUObWb{RTD*5qkA(=?G&3d#*cxs_;RwTjnSQZru=Incq0 zPQdR&B=n4t+&VGA4k4-(HjYYRlU35NUp}P<{u4IG!mbvchwbZ-Wy~znXB*6uvvg*i zLoQw9W=+mC<`?M=mPC#d<$0oA%4^SZn*YM%E7SYCgLu~=QQXa2F5!Xvv5}8nI0Jbh z3sBc8p|yD`{5JN-_gC!kc&QKWwDd)0KUcDOixEpExx!~v3yk0S00Ay|*p)-Y54lL+ z`7c~%w8jr(n5PKPaTGXI|6ZhZ45m1{&+aDotfJ%*u^lkEn za4!G~V*-ZF6>zcM8+Ls>;Nm(A7xJ92EIky{7e``5YaS+)^n%?=7hFrK!=|pq=o!#J z&eUm$S92{~7stY@`3wqn4TRyo5=~tvV8v7omi+oPqN=hnWxFTNPLyMP>sRDvXM)Dl 
zU2wgd3l1IZ4Po7I1U5cGS)3P=>Ni4N8-xvx?zXl4Sks%D*a?=*|`&bbE1*_M2=k*eW9(8V|$ejCvG_6*d-Y*RR@yz znvL)_ox;=E(eSz*fXf+`xY2Gvi8&QjWt;G9=25JQ*^2e?;kdbmBTq})(7(I~JnMq- z;Ng61zSalc250hk!%xIL>lQKl5)>G`5WOr3qkmmQJ~5T^6*cYnxqcw|#=okI!ZQJz zdv6w|xUYxX#BcfJO^-?Ipyfha!eLV4c!PWx*B3!oZ$U4d;+j|PCT3?lqR;&sS90Mj zTIT(goZNVi1iKY*)y6E$xc>oX=Nm4oU3~%d2c8g5BywR#0!dAr0~vLw3C-FlQNiCmxX23h8@blKWNKVsL2WTI#r!r2ex_#$y1V!VFAiQs>9zJ2EnnJXMH=-vJJ zr8W)@X;H}U>CgQ*X)hL?+|OSb9ggyYQ@Ka`WaQF_W2kHxOLp~ogaXGsq^@i@dJa!O z)43kJ&w`zBjCzXiCf9=>6383HVNev63wx%lkUv>&!arN)!%c7GKHW5)t8NS;`}*wV zYfd~P6Hd4B;rn}c@+zCjZF4)pzc(>d=C|!5K0h#xKMoVZR%fF0POQv%?kxE2ZjyJq zM#!U|ctH0B0^b_SyU(&`Aab7_b9QkRm2N7vdv{Hbp1nMJdwO~MX#4o~?bqMWe?UNB zQ1G7yhJ?N`XmHq@LxzUGHB5?#93B-N6B`$wkT@b~Wb&w#(Ql6#n>sEnef)%plO|8m zWn^Y$PtBQ@n>RgQZ!i|jC^VUiW)_#sdZ)|mcjvrUI(Odu1@D)Au&{j5;w2RyE?xG~ z^2(1_Xjgu+s$bRWHJ`3sSG|72#?Ll={>A1kTWe~!ZLh2Ua>rK{pa0#_U>y+ zZvJNffrEz*A89#y?D&b3r%syr^&Wh);9f8qKpKD;) zo^2BHb4}(>+4DMFH@QS{3wL6VB4h#VdElG)Nh)82w3ttLk`{^0MNkJK@V~et)2TxBM zmokm<^~p58S$TA@*o(9np+Qix+OsSfuY(tGUP$aZf$MtY3-Sw&DyQqy)Q;(>K{92 zXZ;@d-uu1ZANTwH-upf`PN_aeUiVD$yWn+TGor zwk~(LOTcc16lwNN+W>&S$Qm;mX(HeBl%9fwprKW@Vpc9<7`SIe+*V z0_^Mf_^ubx*E8P0xxtI-zPHZ8$xVMoul@XUl$$K$JwXqAJ-m<6WE67;o_Y%ozH|*u z0|zeptrhLBj7N1RM&P(5gkBjqhc^u~u)S{?KG#qNUld%%;`0^E-j}sdaHI(h?K=q{ z4DN#OwuP9q?2Xt~IG6M7n4-FP>M+Xjjk0^Ts8kIb7Q{U%Pl@xVzR&GGaS3!;h}(Ou z3TyK)(<2;%is@J3OzuZ5>ablbl{=7)|zLfJR)VcS7qY*r>H zlmOo|$R1s>Q+4f$Fnn@yGdz9j5ckt(mf<}^Iq2AoVdlPH{)<`u#s#K$G)+By%UXOW z`(5>h)&lM&^_QIA>S9hq7>b_ifQ_Fm;*tjXA@ju%)ueY3y8L%?*WAAV>w@`AO1~Bw z{<#&Ie{vEoRKJ2ENgCuCwQ+ySwxY*c3P7`KGH@BKct-JIuH)4fu4+Vt9fupyj!X__ zzMaDDKl2@WF8K_T`P}2o1LrH?XlN-uR$YO993EtnTmQ)PC47JazW4B&FCW35uiVJK zP~$}W1sj$Vj-up1klZ5Sgj)C&n<)&aZ+^nc^$=Nlkc2*DOgb_!-@b`ZF$Z7u6JbEj z@@s&6%`B%<^P0E`+W3hHiIb8h>jZtWXh@l2oSJ%%DQ(*HbhBkf#>`pwW@gRKw&vvC zmzQ5Kr*Ll3Jlp)@1tpTbbYWR}#iGipYDZ1&VrN}_!;+756MLR<0uaD&Lk_7!t@D=%(M&x6* z6wYp6B|$VM8imJ*nP~Y-^4-uZ;Rd$(z1^LWC`O+~`Ztl~oZhlSPw?#a=y%!kP6Zma 
z<%6gScXW9JZh<9GZpOE~2xg<>`N<`ucF4!#_$~{osPTBXrFJ*_FJM9Cuu*STRL+9F z8kp=5I}>vXiEpCum5u+iSkUtZqrve%ey+Z!x8whEW52hXq#xAlc?++9crt-1gHhB< z_4SLJilP@w{27}~DPOWnl2k|jDOF2SuVn9($|b2&D(zI(?8Kmk&PpYzl8!`pM;d_JUEgl@@)w$d(pDjFkHAyqo4y<*fG z+rO?-={IRMp;+rBX(63f_-PO1mh!o)+H?CzPx7b{X>u>^KS?oqC}hp9c#_OXbwuxy z?7h)*Qj(=!#70s&8Ff;MjIJxCy8`7sWFMtY>QCYmS9&y2OAt-6JG7?98_DXBL_!=OaxVWfOw=gx%$J9->e`FXlozv)aOQsv{oO@Ru3jZzh1kU}=_xyd| zIp3ES3PtI>jF04FO+3dV8VJ?=zUAHf@xQIy%!7~Sl_``T$;)}UW-^jP12R<~1wawU z(VZiNgSMGyc$qo_@<_ExQEqlx7F3x%RXnHUl{ks1I&(xW!db=de1x3oK@@IlP(@)s_VnE_*yk+ z>==)0--Y49+9cfU9E*x^v1He77uL*=Mbf5DxPD3kbTPOW!J+APHOdbBjmy1zA+OBD z?h-XVyc>ySn+39>HjJ~WGjXCl4X3sm5$~9Qdj-AZd|N6uXjEkEkwnOD=@9CgiL3@0 zF195?F(wW%v03=m7KwFblToue8?(A>m{@2OX(Vs&4kUWc;L$Pz z5^m^lxqJhz_Bv7TEr!2wCmt{P8k>xJusv@YdNy<9(Tblje$^<%w@$>}dn>W)$`~X% zqsfEjKgdYSb>dAXa61#&qr5@*J0ykOYG2F97;Ke{B}|w1t|yp&f_g6y5=4n^A`2PS z(6BHoSrxJB?eY^5@ej^J zTb(Cw{rTqcW_M1?$aSYasy%Jb+>u#?%rEWQYqhsk==i#f` zb0ai7C$(kH;CHTYR6ALVT0Js*wZ|J`wOd&a&zFPD@DPZOrsUw1lS<)&Ad zwM-@60h@pvjS#qtq>DbB;^nxJFrvGfs6a=53uM`Y|P$^_riab44n8m1ULC9ra zdJcT)NhCeY19K!4EFG`YO`*h${2X@9_OX2i+o)?nFtJTv%8sI3NMq-k@`?HB<;8RB zEQYByr6omGOztS8_};M5P4t&7I-wxwlKhDzlMS->loe+|fV;tW@5NxrMBL`=>=grBaL=e19zEWs;zutJ*~+krbLVEWA&#HrAwhay=Y01 z)9k3cRM7jRdlJDiE2<-im6VLcBqhF>da1)uN8^ZjGJEQunzE~rG@*)fn`;->S-le~ n-0lhH+FF+{Zi%(lQ|)rbrB6;xPfb&sG(g`Uj>v#Q`Xlcjk_>gx literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..aca2026d8ffa41af6bda14a140d8fbca62eee401 GIT binary patch literal 3305 zcmc&%c~nzZ8vpKlFCi$KkQWk+iYyTnOkji}BISV)mWYT|7DWvNStN@j2x=|3(4tbU zRO?a&6^o)kUC>b$sao9kHMMp`-1XS%(bMT@N9VqmqEhvbIcLuFJIP(Y<@bH}p8HD@ zm_$FCqFreZ74ED}h@9LZqE+8+iZ^?B za0w6uc@t!f)T<1N`NJGY-T800@S+y@S;4R=Sppj#Il!#d3@XmIA#x?b 
zuz#BZj;v9Fo7NW|k9vVlHw=afsTg^;cmVZ44g!4~gq2WmuE7JW2e^WhFbvM-I>3_D zK$yBP0!Es1VM0+quwL#2m*eZ9+O81#%6FiX^-|>Cx&?#_qrtuPBp7xL0o~6cXtA3B z%ce+S(eJwM!ZSYXD)$CiIUnj)s^RE0dpLZa2j?plD7JPzco>hv^I1{gepe0`(ktNF3mp`h zl3-tTgc>v zz@T%7s3q|*=Jy4tF?%~zUi$+6*)Rm{^Ih3f;1&lp{cEVnE>+->=1C<~KSk!CrPQ6c zgJ_`q6#03KH~3w;1zPGj(YkyOGC5i!OO{QHM?Aitb8sN=Xs?O144K3G7hKm)*Sc+Vq3&e z^(>aK8A6Z+bTcU3e}kj`H+$T@_W@4`wh;7j8~nB<25eIzA+N75@yo;}Sa7V3yf`Kl z%Jxqoo;35&`H_dAV#heNtKSo76iz4VOGiN85pmFRx)13&ZztGCK8Nd*8h{K@ka{`{ z1O;W(-pR}O&z2hDcKclLzTm?9x_UfO+31Iw2Q-njM_-}|C+?7;ZT+p>OQ#cCosW{A zqy_SPwthk8DN@KIV1(dR8BlyLn&((D1AO+h@OxcG=*h1n%rYxM@!@dNWrhtbE*ekl z%^yn$eG#uYPKmxUmXXeBKafTC*N`^nCt}*qpArwI6jQrKCLqC`5W--;7ddP71gO7Z zU!Vh4^ckKHU1Ih6#iZEd7E|oy#&Vr>72=M3aR#AK9z1%8CnZ*tU>YETqfT+YcQ+a`f2o z6DLodK6Ccm`3n~>T{i!4j2_v#SdA7%S`OKHMFL($zI zacgakj-{kbBbAq9G+AZlWwH-Ee$7j)MDv|RYiL;wE!)7+-J6H^&L_JYe>U5&M+{(Q z$pj0&5Z(AwEa=h@3%YbzN_u-08~?$n1k0)BzVBls`&aywm#ND#S$Osc5}iS^BYk9L zeK04z?fpG5-<6=GJOIReV@7tSR!viwjy;JtSlv^qs&fj%#0&w1xOrWFjUlqbyQ)2EcSmq~igswC3biw{zS^r$%KPL+!fkY}%{f~cg{6+l#f4Q-L*-a8% z>Mat;1fpjW{QOCzVjCtgaa2;Uy>9l#)T98NMw!K^ zS$i%#P6ta$*F16TSau%EHkQ;m=oUOv@mjb^ofM!?Nsia2CUdFdc%v=(#eUQs4=xs# zH}J5f3^!<$<4o)Z6Qg4FIiHbByCPP>y zT;xKw{urX=Q1DtfXNi&H?C^dH6JBUJCk`xYg>@LNk`^N_Ny|EydWtrx0LS6-WbLtk zY|0*uMH9rX26g_lf=rWdhQZ*g&d=8ux=zo`H|FSdu6}`o{RS%}Dk;FL542DK3HVRW Fe*i)ig~9*; literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a9169d9f5dab747b86df507d1750efb57844174 GIT binary patch literal 2537 zcmc&0YfzL`^xXS>`#{-5;@fs{CDd5ScqfDmq2KacToLdAizzPb%7V-8whut3QL>z< zA;m^#)^vo@Oif9XW)M(N!x!d*rnH$Rsp;V(wHcgJoUD7lZF#8tYw`@-@7(h~=bm$B zbNT#Oj^zTlARUJ}2n3L-dfsXX{m?ak4F_BxCyyZX08Y$_RS6I-_3;#g7y$T%Vsc`H zFsPf5ier`WAP!I}{7MW~W3k!bG;>&vCccsbdDmEHi23a{l%8kXPHjBH`DkRW~ 
z{^+TiUooy^zsQlReWGTh%TLSM}} z3CYP}?B~z+LrYK<-nTXlLegeK$N7m&(7O95?2UskF8x0IP}~ematveRW`WdQ#~xa` zQ9Q803BPo&f%i`w5N+AM2ybhNMeS4Dm={0(i+SbjuS`nkB*~PuRro;kCCRGA38F5^ zEs-Tg&zu4$ByBE*)z`8_fwe0jqP0Uj{t`m{e~|Vvd;~EwUuJ?=_|vAminv=Sg1+kD z$OuoQU}-T4xPEtv$cK4w0U=80L#$+CZSB%o;fkl~-Yhw4P7GnH!wPwkPZe-qK69iGc$yL!YPG0_Po@aVAmI0~r`qod=)hp0q(_^@6M9Bo*~ z4UVqGWG!=f_lO;bcLMl`5s~ zKVH30Xf^&XovMCPoh0v-$+220ACLtBC{+r7K0kkML6Wzz6isY2g?L`W^L!rpr!br6 zt9eZspTYBGd}^6M(-46<6ielKE&YZBH^A%5=47owvR36h?ZMenKA^h~D* zG0pQjdT8|YD)9LjdS6}=mlIo3u+VMHn6X@6m~W>1%3KoPiVAWF_EH5SBnSdtdjd*% zBlMj`1@Ts6W*Kjw`ht6s4gtOIp1)uLy=T+Io4OFXH=gB$rg0ex;%)lE99vPLkUH^~ z=glvB{>XSpu?P%{hn7-fH)bw$Q3kKoQF%eA+UGw{E7Y45n}DzMJfB9rX=TJ_?~}1Wgh-$&T>#yr$awPC~F$i^%XoCA~^Ql3u!yy1p{Q zP5cOXQhj2dsM4qLN`fNPZg9NnHn}28?e<86!(poqtui^B#vP)O@F1xr0QdUrqg$PlC$fZgV5etHqCPkLL!Xg-r*ho-`MxI7g z6p2O!1q+xcMMQ(WL~O*DC~Ayd@!>PJ=$m_2EQ$W{zVCf6^TEuVnbUq}hWi^L*(f)L zVtko^a7N5PR{)zCBi5U?)?Sz=jdsx4FTG>xu(iGU)|nf+8D-zE-+PVRrRojD!=upG(whT|OgoE<3y5(K2gfC9=y>`h}0zKQe$Zzf|b z-()Nockal==#`EPqxkD8-ZXL)P!J?p5Cp0=>NN~UlSFMowk9b8mN$d;vIBr?H+W!5K;hj~h^_tu&b%lG zQL-0o9Ib$F+Rb75N*3j18WU*>FKF7PhC?O6V5@O~2SZ<=lU048NGV57_4YvBb_KCE z418x%&{Aa&l3q5@P3jA$(#&CTtS3xf;15Gu(_n0_HAt3qgYyxUu+B6GdbsXD$19b{ zuAu^?3kHK-!*Sqt_JfRjGN?Bl3ri;{Vd3-dAh0+ICT_8X6XQfs+PWQGF+@;v%m^-Y z=?43ndIGgN0Ms=dkRM_P5mjZNsBnX|gd?o(UIU-Ktb%JW5Y7cN#FcrUz@nj2P;3o` z{99L1T6P!MM%jZVUjjwWG&EONgY#!W5cyODI}4paRVacj#c^=>k{KL4O~dImY7|;t z2KHG;;o0;cu)E<3XX1XU|Z=9 z?GNU_hV#9^KBFsoxb_jUOuUA4jtIC6I|y1F0hv`Jh?=@ew+gpx$YsV4LJ&}{g+S`6IzUVyPhDw3~>UtbJsva9VHr;?Ltp_9V6Ve zw+wPmoq*=qKcgdMw~@OAN378%!jszdq{PvOu2^vjs`frbwh&Hu{h&tWAtq?pzItd- zWupDFT8P=TBrM7uL$nMJgu9*{gnUadS=wX_p4HK)W_>eUDp&?D3p2@{u6cxa{{eJr zQFqa;13QpgY9qBKUnCknbC6*%*WEC^$3?ni-+hp}AiDHvC}G@>AhVe|2)26+huwcQ zym9kmS_&r6e?}ubt_T5>m_SJH;X>RWUkme(G?HhB`#?d{B%-5^MyH1y1aEH#YNvGv z)JUfimH7eCBOnawPg;@EMbD7K+-fijdYGiQAL%sGUP1k@&hd!I|ww78)&qKVel8W|zpBX%oNs?h4mIPVs*n zxSgkL6nG$C{D@E#4efWa&?ctDd4NB`6fH6kOAL*SO-#+qyL9a)Wn>m|g=Kf8RS#>M 
zp0;-O4ys;`PQCj$yYzKcySabV&%^WM{sX)|@gC?i$d~o=4+snj9vl)H7Ct0mXymY{ z;Uh+liXI&kJ7(;-@e?M-#U~^tO-fElO`Dvq$zFuCuV&$rm)oV)EmVL8s{kI!7ZYr$dGX zs%v(B_x-NjwRQCkdm8sP?c4uD^MQkh4j(ys?D&b3Kb~qiedg@B^Q}K!_~*~RT)f1! z@$qQR*l5P(nTV5KxylFY%YpJ&6%NvSRf!2_e(ksG8*lh=05@rlifrn_sc&Itx3RN3 zr|+IUC*k1UO1!%ao56jWgH5#@7`NjH5>EL*59eQ~4LayHcu2R||NAu(E6(^4MnH2) zv)>;J00s^-zghRBVUN{icp7vW#5hG&iB-8;a}Z< zAn5yWDhfS7F3L(sNz%kI6n+jk5O2LI9F*Z(y7}S~fCnLex7dtGulz1s^LlTNfgu=C zxHoT2$%P}~UmgUG(0Mctme{`y?_UV~XRshMR45hU|Klgm-}LnOzg*nE?Iy_ztYwBO zv8+TX1yCsE#%xs7&}c9H#geb3T>0j~`mroK4FB>ijAe6JzhpLuWs_O|WMRz@D-7Xh z{wy29k67*;u_R2hcn`cIcIc;lTwVL)NnpfaUiQ!7`#`?Ku30vm@BCtTQP^kIygWJH zJ;E(MdQ^5|(16J?qoY#zeN80JZ(17e1b_a56PyHYdVj){-+RYCD>m9ABQZFcjpOZw z@E8tmF`aVM$dSAp%6EF`LhAZ_CgU|fU0k$>HfD5$Hg>cC9mgA{#}_){-F$GdaC<8s zzLb7kV(=&(zrlut^ZJ5PPG|gKAws=zv2p7xJ;Q z4qtzqqW)0unx9~aGso!gehllEqdzAhS>6ikutFvEMgmCvx&S>!6O@hP2sn9r>>r!* zN2AvSxeXVWIVC$u=aRs2E^(Qe+8mpyNts!x+6)^v&%SPb)rxQ>!0R8VrvM`He_j6p D2t(7I literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4748c07ab9f5c77e8fc8c141ac748fc7bb24c158 GIT binary patch literal 1354 zcmb_cU2fAr5MEp>P^lFXV9Q1p%1bN|>Q5?yDB^)}XwnpuCJ}AoyhKiHC28UxVh0x< zH~}X>Ja7OGz!^9R$6#hHm$(v@P+8gQot>F)zWF9f3p!;UJ!bC)Wuz~!2PxM7nZj#IL}>w9Itl$ zb1es`ey%X35RE3TXGj`fH73Z{La9DH*~53H0XyBSIh`IR5x}XXhCn z4w{E#R}Mq#*RFBU;keD5xzY2X*jPq)k4yQ--%cJZ(ck+By3i~a_-7eW*O literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4748c07ab9f5c77e8fc8c141ac748fc7bb24c158 GIT binary patch literal 1354 
zcmb_cU2fAr5MEp>P^lFXV9Q1p%1bN|>Q5?yDB^)}XwnpuCJ}AoyhKiHC28UxVh0x< zH~}X>Ja7OGz!^9R$6#hHm$(v@P+8gQot>F)zWF9f3p!;UJ!bC)Wuz~!2PxM7nZj#IL}>w9Itl$ zb1es`ey%X35RE3TXGj`fH73Z{La9DH*~53H0XyBSIh`IR5x}XXhCn z4w{E#R}Mq#*RFBU;keD5xzY2X*jPq)k4yQ--%cJZ(ck+By3i~a_-7eW*O literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3c53b83a00add2b8dc0dbafc83e88f7ca6fca968 GIT binary patch literal 3442 zcmc&%cUV+c7C-luVTK~8^M)B55h)`oI0FHq29XECND&bWMNw%Y$jm$$5R3&I5tV3U zHDcFjEGR?;vq}-gVDA;(L{VdGQE_pNExPx;5tUW{*zenKb3VBDo_p%=+{-(cNH)rY zp%@M070!qlXa^u4v8$k7ufKG<(8hoEp4Z!DEA3An_aE}Kshj~u%E;XqbH)biij+R! zDsGX<#Vu9IF=z~=CU0~R*6(GD)FaW zDco>wzz~238$1DEK9cbd$2su1x>5)v2uO(;1(cQ8m&O`AiA)o3CS&a0Wh@uBNaSMl zMn{HGe7K8+7Onycf+P!qL3N6H3&Yha*^rd2OVMYIN>0d1uu8*=wB%7qhKy{TUI6wI z$(oR=np8Cs%KswfN8vVDGuN%dnJ1)x_;<6%_sHPc<00L9CX_a?&`>i5+}3u78|N+I z@q8D!Q{NSmx=GQ7jRu%EQVK3h>cMc$1wh>%934Sa=8mi2%lr5tswb_VL68pMV$@SjG(nKEaP zba8<8Qhzv|ZUYPAd|=$%Kp4=F4kL0qfn;%exEN6i>)Paiy?O^aUaCY+H6%4Ajbg zps8qv{17LIC|e7P5)W8IxWdW~6)@#>8Qg?Ea6!uu*JphJ^9M>n@q-rf@7_S^*==DP z)`c-1F%H9308;t!S!VXdXaYvx)*c=$5KyddNczzT+(&t7y|Q3ui813QpM zS}nCIUnCklwXbq-|A<;vIE=cW}AuvMkrAd%?r$60p z+C*^MRV`|J5uxTMBtXLVpy}I(w3}!NgI~@;x5D;VZ^sTXgw2i++=2~Axy0l-2 z@ejTt?v2T#b`FR{;#I?+_gq2B{T_QK8QT*<^D_jRT#s6_&x=1-F@I<~k zg-{d?P4_X=D5k`DfIq<$Eiw~J%q=Xf+F0APZP#AP$ZX{byAH~Z_MIF$J32YLsJggz z?bhARy@y)k;rW@Dx6kK2d-eXpw~t?6e>NbnUr?~Ne@JLp_<)Fkk%OWJ4;eZvIwm%5 z_=xzCqedqrCMBngNgbP(J}yJ2H*n)Kjak_fa&jkrIcf5gsd>|;&zLzY|Et*rbLP$~ z{CfU^Zx$B)ZPDUymlQ8uwtU6PRjb#m{chd*?>B7Rw7F!<*3z;cwr$^0Ua|AXzwg>z zSyf%Lr*`kYy8S=ZA2@jE@R6g(j-NPr>hzhj=gwca*l_9cKYsq@%2lqBk4JNs7L%_{ zMV$2d4L(>84wS#DaFA}xS6Fd2H-Ede@pb?QaEIoo$bD@&&0Xy59(MNo+56`&NH}<~ z3`-YdGk8dIu&I&*%O)H_!YLn_;QSLcLNnb6kLgC|f4xQ0k+Xb)5zw5{`j4jqfIfq4 
z?o>Ui+0*eFJP*DGVw|F?uA`D6O#QEa>&79`rRn)zm|~SU@tLZf~Ez(3k1G>qnL2Nzda0*!oA`2K~d;mwS%Qmg(g=k)}Vi(7R0ZusFY zR(jLZ38AOob@)^>w0_$fGg9=aS*AXJ8Og37RYNCLVJF-UKXm_zpdZ7jDD(!o$e1)X zMVG)(_&MN0y!WbbQHF1A?~fk_;7!O|7n>28lv{UeUhmy8Fa#qC_vNiAxo{->%Y(oX zT8_rW4*PfD{R@Hr6D-Kg6-q_;|MN&yr~xg{GF zH88rj>0-&(P->pM*#MSh2jO3y!dNzk4M=5!SvHjoOcnM5u)+X-709w7{EYdQ8FRwY z$a~-&u>-&Cu2%QNi=c@9JRg|Dk3sx^U9)UBKLo_`tRQDKJU=ecGr}V=dRTUHaIbN( zF;QcAxh@jtH!&R#g1>Eqv6_p*$bT z4<_hB>ZW|g;k|&YglKO=Y)ph9E=GWk;|((53mwurA6zU<@8!dnGJs3g4$I;NY)Ck- zFDT`-#2*wQ)EgHYQ%mVtwjaOLX7DxO;(~jnCwk`Mnt1U_xX3wt{c(z>L&19if+fxz zqr>u8HXz4zPC~N071m*eN}7xWkfwbBdaN!u8^;lF^7hz2Hsz1TqzQ5dE+KP#c1o6e p634kGWM&$2944e>8q*AV2M?bf9z8UQa3#RoAE>4PBJqD){|0`F&W->8 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dde573d07cc0c2130eb43113befd5c673b2f0ef0 GIT binary patch literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b@-s716qhMB#il$H&g_iLfI4wt+ZI$t`9)BD0`UT z#sV76FdAXe81XgdSb~6pI;QC6YcIAekuBTQIdutUQ?}^t{krl{`M1oI_TF>P`<#2v z$(P3$#BeMZ#D(ZN%t0`KMD90sxkkqzL%&{#sF z@L*^PiJvoy$w{~mQ;G7w;bI!@J!T}_2${)I?yWN0JtZci$0)NB#A+%jci6mDb|HZd zB=1G)n^=7et495dViS0LWLk9r+}i!zjku!|qrw7KkjnC{`ADvYtM;7Kn4k~^}wCISs*i{fo*yezO|wgHk|2UzFLwD^~YD@p)L{nV$mts zba)y1_?aPS52?ln)~CYs)EwwOKaJTGDT44fKY~eVf58tW2Z2e_G7fGbNWAszkzT+s>Epj`kSTHK&tl>%Wg){_F~q+%qF!*7|C^Z)!iYc6q!ga$gf; z(;Ar5;D*Gl<*@ErrYN{>HAHrFizk1B(9j$}PhG#ewzt)p85!DmdY7x;me_J(k0<*;@dpF@ ztIyt8(=&LeRv*3X{MGg~pWC}GtIthRaoE?End9HNw90CyEoyeNHmlncV78fQ50D2x zh~$@~@*kz@W|{go5pCMBxjgb!kDDU{w1Wve+^l|%LgL40>-h0eDp4NWtOLOlHS462 z*0q`K6&~Nb@wMye$?E3Gq+K7jK7r3iC8lhM1BKXKZZTIGIhKs2a6D$Zg{$=Yrz|A1 z6XLL9e04doPk8(^p>m^jCj(ZjPomnaLbwAu0WtC5zl-otVn2-97aTvBEXc7^rPTk& zqxUh5#{cD|svp-&l3$X^v05S@mW2Q)Rf<5qpkQ%fqVHiTT3HQ+I9|u|d_MV8$l>`K 
zURS}V^Lz!LQX$ZEL|_rcQg}X#?&R#>$w@%n)B-icFJAMkMw390wA@U3PN|`58eNEK zp4ZbwXP`%c&ui&9#?DCLdQ zcNZ7N*-aT0ypie)?nycX4E}S$(xvpAO&4G4Lg>DDRuP)cV=Rnw7>aTo#YIBu#9zKI zzw8&s$3u!mU`#x;lscy=W0{9C_$)n@7ldm3{`0ehdXr)k@Ry$FQz@EZqcw0Ar!S~1 zi>)O!nNNjCk!xuENr=8nAvB$!Ng^lN5x#-f)%fm72$pIQ8D6NQPf1A9M;B5zRHb`~ zA0bbwPwW#_dNn>tP=q;+u2;NfPgJ?n8D(_295rFpW|!OQu!qION5@2Km3kGx$Pc>7 KCr}>w$omJ>jBkek literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cd2748f7d0eb609ee249493c7051b096c5ef985f GIT binary patch literal 3413 zcmc&%cUV-%7C&>#vLFb^-gOsOMaqH-vJfGPh+Gg>U`512QN)!+1lb}Bf+d0)5>%p* zrx6=yFm{Oo<|#!5gS|v-#FyABK6{DLL^JoUs66$L_kHhsIUmf-nK|Wm&Rp*A62(S) zF%+Xw!!k6~@3}b)XQKBOuMq0)fV-oCr)4=h6nPMA&*F5u(>NVqC%d zyZG>R1q4A}1^GVeO$BaN8huihHd&XMs8MICty1wKRg;*cPtVfo_?lXf_&SnTk@Aa# z?7v9)!AcKUGfmbi<{`-!d)4gm6*Bb8Ns#6<8@4pF&|Ehj+*kL6E2k~t@q8D!)z}@9 zdWg{4b$XaLMg%TP8bN=_1;En|9<(Q*@J=ekSN;j7UTy$EiXW_tmBP38ZD86m7UgDG z5@}LDsNbr911pr^sP%vcLtdie<-MRtCPCe6oq@XT2||4Y1m;oDRPGGst`1-)3WO7B zHn1Sx7beaPhQZBgFgm9zm@l@2vyoe%tbI0g@vJ~cx5$uF-9`}29RyBwN5QbYH|Xw) zp|<^KSTtS+^Zxh&LW+}N>}E$emLP!A=56S*1%j$0?ckiP9W>N;18U_!P*gpF{4gho zEME=Mjb5;daD$bds^GI%iWt%pW2G>Gw*=Z@GffvTR{1G2sOJLMUf3Pvg#y@ItcHUZtYQC28cvodQ22(`;A}h$FQ$cp(@jq}HLeIQ zywpKXW(*YPm%*#a`(TN3J(PwAz~yBGdXakrdKPpB$1UD)|G_L+d)5`4bsf;-RZo$< z<{HX$LtxN3LFj@=$S5C9R881M6>fM5zm@k!yF7~92^}L~ZMU`5So>0NPH>^3${wTY zu*1;YXCbvQv8i1T&tv3QVco&&%r($bhl#qyJJ9d0M+k3yi&@TzW6(JB7j$U#ZRFk2 zK$Pe;@O$+d(%db8-njGxl<#?t96?3+{ir}2!mQBHhFYkTXP~`#O~lM<66WWOBASMU zz#ZR5gk-alEUmW$-^v(NwWbj+6fA~Ug&AZw&s@U4PhUE&Uj_0?-A$F`3k1Vv z46rCRbh1eAa-Lq%a1TTth%S8|PFVIP$SkG?lupm!p!Zdao3}otMPLPertgMl8^gdV zE(FrMco6r-RKuJ@yUEkT0-&INJn^W8MkfdF2Y-JTYJ1m5P$im7Y{?%8T?R%#?eWf} zX#NZ2GP@G2Lten8#Bw106{MaC1YuSIwR7wu!SjVixZXGmmhIa{FE1NKlvH`48rN#_ 
zlXZWRqmSGm19o>abIPAgZ1X&8HY358-eY!`PFKW{2fzq^OOqh?)*!k=-V|`(Q7f=L zi_oK|B!K!mPz>l#+E1~BVSmg)HzM|;%ns&Av)iBEnK6Q(-4Y1bfC@Tke>`1OF%oq@ zWlwK7_zG?6c#w48^fg%%p+wV~mGF(Rfb5v?Gnr$30clfzB_`edg19|Cm)br!3JGuc z69(&@sH0X-;5S!;8B;89XY$P6zH0f$KtgeU6j&~i4kA2}FMmQPfQI{bu##s+2`2%5 z>r%A9OlWS=&eE#AwT*2DI}s!9D3RKCl6CIV)uEfCle3H5)vbGvp6(vKJQZHvpY-kX$XhcKyb>^+5)Jo3w$7s<$<$ZsBMxINI%#|HjeoaMA7}gY2FO{C#AA2ebhm z;=qqEZrh6>w>4Nk#%OW8Cr^2-oiEzltof~OS7!q}!wF@rkmLoOWej1WxIt|R!d#oG zdCC zVrhOozfXGDju(lr)s`c@_jj?;Thdzt@~sXZBR%Uk6quf@OUX1*`8gy$kK`p?A{k$qC7eAVS`wf9g2TBiD21mHYkM+W!V%qIECK} z!Ulu6RWQqjaWmFiXRPr{BNu@S#15I-)6=sLUW7yr;`HEbZVcfD9GYcS+z=GUsr)&s z;Pi>(yd%BF#f;3-g!Y{n7aKi+JJ&{Gd{ff!z{d;bgJ2L?O!4?7ch<(=7$4)K(i1&gr)iFN$xY$U2d@Nr&&NtLlU-BBXme$s@>_Bd*Oy_!Fh!5?XHqJW-_oO#x zgqxhr^&dktT?*a{;yp2PTpd1-V}r6y_r!zcys!<+chcm<7irq(OOMlrX5l=1ot!_8 zk6pQ|F0k%vy)U9N>K}8?oasHu+wOkvJMWjo z@^QW#!?|$oT0DUxfJ2bBbIn&zYd3^~Ef8W^Z>Pc8a3i zL0QAR4BpdDZ%oO}PBmC&>U9>KT?Src=x3%FvvRWyLOQpQ^fpqK(aH;y;=geD)+7t< zxfS+W?l3LnS=j4I0UiG7T*&la3bpM#w71L#@2$h(`|GywWR(XzX!C-UVG>le!w4&9 zNWf!58yLUy08lx>V|xlp9%ew|zJI~Bu4)ja1;UQ0a`^I*11#Lcqk%7-3TaE$dFgFJ1{?~Kx~YLv4sp=sdERLp)TMg84H&)9bk200L)nt z3gg=|VS4^xuvzZ}H)CpHn|&S(Qq`mLwF>0cQUj6|5#ZKx9!&d3g5mE{XttjY>t-up z<*(mBSXnB}+U*J#l0;C^z8Br?jiBLNAGp=e3635a0?d|ipl;}ZqDVK0soM(j8egcS zJYma#23YjG4(`DixS`>wyUX5(RTCs2|5^h@KYougbNj&_#vQVYY@pPeg|>bBz>3SKL#mir&K2OE-H5xD&a8mI^$sM%9saXyVakXi?^%V})0!#f>!FQmN6yfDTHwTSHeIv4w{1ZE&Y} zJv=YTp@*mnsGw1!*^JTwq90GxBj1cxro2cbVm*_np~3ZR%E?4_^U{3cu7uf?YxwWDWA6exA_?%g?mZ*Cz!-@sZh7 zM-z*#jz0;d^;6LP!5z>bnMc(Yje|ksqM`ZXK-y#JKIj|v48EIL2Xv5{HgaP@oLkHs zoV8BWxz-GSZCe6f*PPiewoRkT8+=jI&_=rY^m8=*+H%a(3uO$su$cZvx%9;o-E|3}f$XD)cwNad;>mz0nqft~ z#5TSA*xK3mb?E2lB;lm}Wpd{Mih+X$y9{x4bN5gV_4FDx+}me_O6}|SyOI6@?~NKA z_2x zE8Ac+&CM}ea_8mc&tK4I;YW);E+|~QWa+Y^Pd+VPzG7v`XRB7PSzG$~I_3J`Zx~j# zanl!@x0F{@ZvArG_OGgT?5wWYwY#?N>pgqx_ciSQ=Gy}Y8=IPATMo4zK63Qf@wO8u zPn|w<_T2dk7cX7Da`oEv8#mkkaO;m&e+x_?zO7s@H^HA+6a2Z+1d2N^1eAy53XZbE 
zKt#`%)FwN_Kp-L2%w%PlExl5+QpqcepVUfLdQ>8FNCB|SA;$1xZVJ*d}E)kC4(?KBnxBluIYo>k^VR^gtw zMdV)9pQAXDHi&35GNBg)c`59nyC)vb*uD#~FF5|sWI-yDE9Bb$G35V7ApC#1vA@|( zk}i`=MM|->v%dsDu8`UCad8vk1FaWJR>`Oc`ST$>&rif(LeV^*$A_fx;XI$lho%W< zAy{BMS%va^B$@HPd&WCqX(kqkA%4Pw;VRWAyao91~f8A1}sR^an$ zvY(Ue7vq~8KP6WmK6*~V)VOrQ&yL0M&CkSxU@ufKf`h0nCe-p5UzO!iS^u%<4AZjEOSJ`1tv;{A;YQ)7&YQ-##AzlqlTqCV^y4=xs_ z*Ww|i3^D07Q!IqRM{0?@Ae7hbe`2IiZ(M9l-KFRGab&5Ay}e?Wq6^IRwW@x>$#A6LUwpA_9Ns;^s#-c zN?K!;1euFTmoqmv)#8(4GWqCoa*TN{^HOun8AgMPZ@>uO5o)RmOqDU#XCl8*Ik1Duc6%$8uJ_3CTY}>P)rtWk^cu3wR8q z@h2U?^C2>1jf%wGQNwN_Dx@BbLh7-VldyZP5`ZURV;Xid`!wuuU!%RE#%i*;%8Uk= z!PiU%W@A}}-BM$^lt?KiyYE}Vm7OTI&=KfH-zgB-ZoZiM@7TNqh*GPixh z1=!W`01X5kPW&PoZOigS`A6E}pt>3zU3?KQX=7kz-Bf(BFb#gwbYjKM3})kDAJFV6 zMy*?p!Ij2!@VKd(35{BWlj0LN^V)ur8^>EvjQJ4T+#rz@E*$5*-qFw768r=A!I9e# z7>T%z4<=xrQJAUW4?xDCdvGfDC+}<5M{|MT3-JpM!M*KS;HysqOK>ES zU6>4whiBr>104D)=L94rg|T}FbV6(3T>NQ6Dg>uyL;IKgnZT6~QP|Qw;FtCgek|Js zOj0ys=f{Jzrjgx0<88@3H3l6R(#DMZ_-|&~*{e+Qp-`_u z4Ri7Cs7qc8OEp}F*DcNxt!GYx6B0L6z@qD!T)^Ua5V5aa()$uZoqv+6X7~uA$BkwB z&hvr7zgD1koQ+IS>9}iTuK*7?(B61gnc$1EKbRHoY_xmz3?qFDH0wBz=oWx5i^X}#2`$5n$I?A%(; z0d@Ty{_}{F`N$FdI`y&OjfJBZElwPxae!h;c0EB}$~%hQB$y2Zb6LgmE|}_j0V*E5 zIVQ9FW0hIWHp(|9Cu=b~U0x=Oi9TfVYC9PDdSCgkKI(g~shdPD>sqiEKFx0AXrgpP zyq zXF+Kl6Azv<3F}MnLnyxJ`1xc(&MH+(-GBUh@``x=f4Q;hS9O!*Cwt3TwN&2PI}kvr zQuqi31rv)C-4{!-iH)KXCujvh$R}Sa*@93jXe)(uL8uf`D#bM|G035#6hX+MJ(+uY zG82bRia>#ciSvg?Ma7drT5cwtr_|CmjWz^Y5OlQB>giO}3(<5wyF4~Gro4D^jWIo8 zwth-M71dkwNPP2Vkw(Nz5kW{0MBI4dkm|baca{{#*^C*Lf`RTA-IH{P=soj-Nt5V& z0&U!>i=n&YnN8NTE<>3)Jww$h_S}v!$Xrvk(`>he#ArsujEGk1Q~*yu(9Qznk$4+?@H^wwLfib@MQ9t8!_RrHpZKt&!R7G+py1+=u+wuq?nITu{q zreNYjTxGdSGbI(Mx`Pg*l+**{We=J$HFYR|8-*>*} zIls18DoqhsAwp1F1S~)#fJ&Eks8k>Hd|9>tLZqNc5O^Zy1zwj9@v4AK?41EXa117# zM~DYQJJJcPHVya)tr%QmbJ?rwZQgnTYk18%q}hZF83xspENK%+C?H@UEP@OGWNj3Z zEW|wxO=KAeMkbaBC~^LQYO4kDL7bv0AQ4r}eH{&n{n1?_42e}@$C!x)LmS&0e0-SQ zUE9>?aQUk3HlHo5flM0g)wOPCQ=>~s=OWTvLWWkx@E+Fv1DB6Ya;Hkzud)a)_*?J_ 
zdm{?xzFY*Jj%oN{{XTZ>uDkGi#~Acp;--G8;f1hc*ba7f^j3(i(zC_eZlKQmPvJtw zdUkj9*?yyvPcbj%kA#$S-$En%DSmh3A#^M5Bu;f-3tD*Q6!fgPgg$xW2b4Nc##`KW zxYfCx369U?cE5H8I^MsHhJyu<|GjrJd0g+K^K{HU) zMi0^r9x^m$*zggtdP7|N$Wc!vBqk*rQ&LBdNlPD_F>d_RrU{u7CrRe4?3~=Z$@v9^ zmMKM3i%Uw&rj<{xm|?A)IjicK>e;rMTDxOTU46scdCo?c8)QLN%@6f(GVAqa%Ajsq zKx{V)a>T-xMPnD6Wq>7|49^O3!ct;j88NWDc7-EMhLvqZrcWYl6;a6OltK3#HM-K} zHe8O>3D^%Wll#8V;Kl|Qg&K#KbvAf?K@O*bzFzWF+d0j({+e^4hD-eotqP1iT(QT# z#~2~jLa~T|PK3!p5rWW#`y=T>A6yhw0g0#z=u*6Sbe9;pkChlPyy2$sB)AFh1s|uw zRp;~L=f_BM8X2yt4Lj8Ya2~b(1fP#fP`fq_MBZCl?`X6MEP+`)eh97gI?LXuNhEhj z!(!hm3%p;sFCJ382lXdEB)r8$PqU)Z3ySENcnI`FA5HAXQ2UDGzb6YCUaQku{^g&N z{|VgxlN#vBY*ZnNpf$mL#c!d?^)5QnO^P zlX4}gPRgoN_RK_K3Y}$1Qa&BYIxv!zGWAjm)Q~iF(NoFE8Dx@EG?}imnkmnrjF^@r z3uR_2T`F?PNZ0dfQj1b*DyBEtbH~lI&M2*?^2TBkU&~y=6nj~U5fTI;zddD2WqrKJQc99hYtaGpexQr| K`zt1YTK@tgIEX(0 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..64499dc53d0c919b6e990bada1910bc4d40ce3cb GIT binary patch literal 3122 zcmc&%c~nzZ8vot*USd!X1YSrmDzZdWkfnl3mGVG#M8qnBqJ~8TNrEH@)>>Rji%PXJ zTG1+MZBcP|YXh-MXq_I-X@yFY6y^KBT??a_d$%#}Muh|Z$YkMf;FxhU6!!dGGAsT4M8 z0CpZ69R)&oPP>(8L_r+_xszIDm!&i0Oqi@QO%^$&pj<&k^lHUVDbq{1+ZynchDf#Jdhj;x68D2@m9tjC{<8Q&A8y z7xk?YTAL@qZ*5ut z+zWuhn23m30xmRo!>*SHTwEh?zQ75KGD0zBeiVka7GPXi57;er!PVq?tm|Bg?g87# z>3R+EYTg9b`El@SK8>Os{b9JTL{sN+STae21;6e@bWJWMZt=vKOgUD!enoC{BxpR< z3D>%~;NXFt5LOLEU}GC9;=PdEuomh~L0H52VpZ2h%zV~>n;3#Cu_AZl{Wr02xGU6O z#-ifx_oSet3$_a0&{x=@+E0d-?c3ouGX^P-<=9cx8`>&4w$$iw{JJBKUXrV|RGpC`ZVjZ4MJA!4g8?ibe95+^QI3#@{JaI%E0^L~RWaW)pq#rmsa(jf?kc}~WE%;}-zU^o z$mKHMOs-e>Hd*%34B5(qKVrwQ6r#8_n6vNC@g?G3#CrXLB%(nJXMH@ZEj* zWm7yH(xXw>-Jkn${2t6Zv5&tzG8~l$CUI?hW#rD%d=5_Cr;)(YdZXPHOaeN zC8X^q9?*S(z_*6*?$hlVh}`GJ2fUxSpZ^`|1a@UAfjv-4KNF9RbmE_$Bg9Vj;1*r) 
z=LN+e5RMmQ@(v2Sj-BisIy*XbadvSPm2N7vdsj`j?mawudU|>LXnXng?$g)Lzh6LL zQ1Bc5LqZ1(92EBE;345}MM#lRL!)D2Y0N+NGZ?>r=CQ#iuJ*)vjK%_Oo@Lf3beU#=1?Lx70U$x%I1U+Z%W6{Cd~!J$su{ zn!nk%|G>dRhg*&uJ$C%W$y2A#oIQ8`!o^FMuUu{Y_S#>y=TsVzv*LMNdtkKF=NlNd z=bD7Ve3Q9DZecD1O^T5FdF3u=cNn6HGvc8zGSJu5lT%31)Wmr>p( zp;Fjn7|OqVN|j$k`MX_Thw_fkF~2a^kY~0wEGy1mbJwnLr#Szz_1E}(Rf6iO5UAv) z?8&)$ohZ=0=EJ?v9(^=PTU;V&6GjN9YF}khZWV6dhM3&%^%pr&o-~+g3o6T9uolWu z54Jn;ai{h@n0<@mKPC&Mpw_69{>Q!hPwJ`vmlv!3vtE+&wo)l*70N0%S3s>%*-NRZ z!_&g74@Zs5nJ2|aQl1o*XE}?c0>juUN|NH)O!u~#?kr0a zv%m~V!{6x}5HOG~qLbs;KB|;%=}=0y?NdjMV*3O( zSW~xzZjEOOJ&QE!(n5^sW0H*-V=SptelTizbbn zW=HwyQe>(1o-DyKEh-~fDrr@+Bxyajq@J#iDWQHWc`|)!pQ^H}u}VVaQKT!LT9Rw_ p&n_zR*A*8VOFgFL7Mt>o29Kc7enI^L)kzxQ`3Fq`kV5}0{0G#3JWBuo literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0c68b087a629133e6561c3090078ae686a5e13e GIT binary patch literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b*Q7FE6@^QNEq`gZ>R`3g|aCYT4}MgtPeo9DQ+{t zg+(-)VKl;`G2&~^u>=7v>X@RNuf5o^M7C^G=hP*bP1&Nm_v^|-<=-+-+I!DE?{n@w zCtp5asN+~JgbOopn1fINiOSfGOTsm~-{61?<>VSd58=d|Sd{=#k^oOJi~)c&0+SUZ zghAbkR2-{}2XTl}AuTgG%%xV7+sa`%mNy}JGg2q2CF5CAYDg%+VIWHg*#I64jU_Y+ z4~C|Y1UMs@oP_%@l_>ukE~eq$V}`?xl9`?5-YSd3Q))JOOfnlmZ06E(r`=oS5EAG> z@*bqViPgujYQ)b7Hjc+frd1!ry%S{M29Qwn9_F?WWo*cUg@3O{?bZ+3#Bscx@*G-^FUthLQxhkuZoA`!kuAG&-FT+LKcD{^vwChmU%uc4Q?>>6vvnx!BZ(7idbv1b3(2!3i!U(YD}(8yyo$J+NJDRRxMf&+ zg$kAylYr-Uw}@=ay$c9Y!T@4}rqtIjn-`^cs^RUjQY*p ztZB2i=Iz;B*;{#J>7izO#=dVI9orqJuHRkV(^{RS3Eyyfm#fd7)N)~uC+9%P2mSkN z&fZw%8#q*Fh}m}jYWu3s9bK2z=O(B)>~G7gvF}`3WpmILwYXWk&Fu-Y*e$dN$b%n1 z@=H?rk5YBBO#PdPHtpzK9{H-r%#nWD!2}*|)&NH#31GB!0{94(D35N|{?PH7b;5A# z+AWR>kAL3y+x7HBb@N2ht`A!u$LFIGQ#QtfLhLTLTB=MOOU6MPxX!A3^O4jvq`G|F9ITY%GO%UeEJ<0r^wN<@s7( 
zU%_Yad~i#^ORNU5ULIMFUS_^O^Qt*Pq3H!p5;@6^@Qu8_)_+ezuvCl4@Iod1NO7+s83X;1+S|Dc#?xG+ddzW2Y5Ge~N$U=lz5V=&56%h-?N|Pe&u5@DoH6*A6BhP{j zG}u5PikK)x1cSXqY{Zw?D?VG~8Ka42?p;xN>L2g>-uH4on3*$k%I}=H+}|aH4Rv8C zM#&6NF#-m90!Swo#%8 zr>#`fX$1h?YzRDx5J7GPMKh8!l*|nTMlz{hbd)-FNNWMT&5R=wqQ| zju;y-1mM94PXJi=Vg{-(1`ewwg+LDhsVDRT8lQ3^&`q367qn7Q=ZRE=UfW1;1@G_T z!`BrM1bG7#2dOs|*c-%ZVlqxPTW$guf-9k{M;2H(SECb^a%A050pdl&!MfoDXm<|)^#cji z_m~LFrpaOPAK!sraU4wDZUrYN381uXC%V=RLCx{*aM9QVnw$CnwayomHBTTf&>DiP zHh`?c1=bVxu&!4P%z0G>H(@B84`7IEpAUh;QDTsN8vuFj*HKc2G3=mhASF)^iX3TZ z*|iHC=lDa&a{=rwZ~#St0JaxL!I8^`aOeyTXG)Z4MEM4=NjnBFX8D8l9cMTjT?Ch3 zsv$c)42tv0;8o^9SQ)StN=JCYwbcZAk#igR<(q?5r7JvoJRde+umc-)Pt>vgDKd+_ ziPG&6Xw}x>za$t^tHu&F2|KBR@|W;i)c~~5skpnyDhM|B*-T9}D+QZLwp2)22dWJ` z25keEQWaBLyZ3cINq!aR050cmLJD<^Xjr}n{cd-haMiR6vrnIdmU%y;qZ{raS5qxf zqKSpyYd4a5_MUXbs?$)l|2eV(6*2e+B`OazK%<-Mp+S*~4&=5H^J+;b%$`8Bj`4&0 z?oSBm_5iZ9Ngv#Ig`t{_EpR!1IlL-JCHpw%5FP^u(TPR91nmc_kxODDRgxzVjF~&E zTd}rRw`7Y;^xEc!Aa+7@>GKhU{s4l^VCo>i`WYN?z0vK?-A`#T7{I{Ujqt1@5DX&x zAlbr+csRKh79MRR&yDef{HAHdlR6rm8F>giJZ!1my`MmhIFqQ%^92jvAgDiOPKpa( zAln7Iz|ijnT%A${q=%B!FkT?a$fx#BT_$+GG!1UG%!kznchYOhCJ-exE~w6~mi%PP zpX9{jw@J^&K0@ofOk$_=1>xLD?(}})13Fn5K^}%Q7`!S5a_$bNd*;pr$367|;|mBq zc}fCAZ35-6p`_VNeHin{LUcRm07~zvhhiH&=)I}q2-k*XhqGpM z`H@#>o9PkKaog8qT~Gj;)fNCN((*~uNk5Um@h0=xy1@%G$n8);+&>*S zXz&n^p`OFMSZ^O+KmUN?fg^%cBZEhUj1C<$cHH=|@QBC>6DLicGBqkXCN^$bd_rQ< z^yCz^Mmr-lEj=SMD|_Z=vu4kko0B_l{({f*zW6eK;iAO_UllI-dTG%=mMvegvUt_% zHEY+El&;_KP1(jxo40H&uh_P|vg+F%JF9oq?EdciJ$q~G>Kpbo?r&;7@I%YNLx+zX zJ$C%W$x}a`Zas7M-1!S_KVAIi&%a!{ti8hJqqX|oXJ4I*wBl>mxn!MoAfj-%Z>^Di9jJ{Ro)(#jv|z&}D-cuZ^IR~+~W z#%+8B6vkTp4vZGZ`}b2GtN9h9J9WP`>@(NGGn`P~2}zM}E@ucG#SQOD5PB(zYL4#W z(x~Lbv~*!yavVnAM&FK{`+}$~Pn}s?!G=dBky?h^6iI z{66VnJ6@!sPFs%j-rvPWZ%Gf+_72OBk)Gij3QUeu$EWM4{34QELW+{!idG9ud+&$; zNYW40DJyUTsUR&TAuc6~p)ljy67PIC*veJgO}sGogBv03+H6LkGw#aDoZZ`NUcKC|M;usZ~DaezdX2q+mj?|mrFzn zk)*X3epAZiQhhcwbX3@2-Ge1vPdRhq#(J|XI~xCT62!7utam)?&$98XPdvZpjSWU} 
zD<75(py6EM0jWdcb*c0@y$%a10T2Wz&B0BZZapO3B1UKkP=b`KHOviiP=}}>BnuzdVO=LJV}Jc9Mk(%lmemh-|kEZ<3;6JMlmpD#Tk#Xke*;p^o5aeVB` zU5(BXq?X#K)EOCZ=}s|Pty5HLswT@aGcGkPQKPnWaqsWaUnx_`0p9#TJp~Yg|DpN| DjS{VmT~_Iw+{XIF61pDuM{kz$k~;?h64b@P{*we3R_%x8MEk z?k_2pN=*VQ#0qg%GC>qTq}gBo>h$ktooYUGY0Ds!{ak+mgeXCiAn-yY`SQASh*t%5 zL{(4=077Fi8F_>l-taEc39L2^_(-i7T4Q%Rs_X5(dI4*AO&ij@g$#KH)#E&A6KF01 z1~NpF34riMA<0U@(-cLPfly>-iGUIpn5ec|5bx(Hs)7cC3lRO zI54!c_mYo~aCmB)8l7%`wZrbWM>LQ{gQL3E<7#SjE9HENG+!aZCdP0Y>;6T^M;7^( zO4y^a3eRDs&JFC1Xq>aQ2)w%{<9+qJ+0{G#gx_|hqW2Rw^->Kfgf|Ai!Oo1?260t- zws`vu)RF%YT*z3*wpX9+H6r;m^L+krFrE7z8rhHVp^f{|tzjSI6wl?5`Dac;_p-0h zhp+sEQu@hwtH%MiI$mQ!<1@MT&1YcO;oE2kSn-$-jA%!G1S&b&35N_`bZqHaysU$P zHS?$8vt`-vbNY2G?#yMj9SMi@y%lKRYu)fo>qhu%wU-&3yabywMsp48`}3EN??I-9 zE_Ulmo-bQGE^L$BKg>1oYwqQv*Fc?!xNWx!aCjIhWDKoCF_?*<1@tF6D7i zOIsjee<$DfJVMuhC3nsKIvB@|Wnx;wq3rinNdICtTr50>{86FE(PieoOg5smmFW<= zF&en|Dm*N64_AAlk}K#GVf~&dXng_)!#|JVcAWYPz1{B=lkoNn%)$$~aKKxR4;JPk za?>wxLI;x|v~l+zrE=^dOAm<96&%}#Ow6C3A~b;erG>29I962-2s13lhp0lsdWA>y z?i1NJDq1aQ`ia_@{<;AJ2gME^GBi$a7#2T##K?riq-3Kh<*C%P^idh3$2@Hwn>lX0 zWXYP4os*lFUr=bBSTw1)q_k{u`IL&Ow#sSKt7cTswAa)+oU`ic8)naOHM%_@3$ki% zxR;YzpFdLub<;fJyIGJU=eH~vwa_91EaGH%MvxO069-F(gQd00oDniC-$G>iBr>cZ z3K<u@Y0PY}GstZu;B< zAD7cz=MUiLNu>D{8E&c#`_%++9`^p2n2#z@yE+X--d9`iY_tn3fmuC%0Il^p>(1!$ z1aKh@i#@w6@B!tXcu4i`_g}yQZ#7eER#eV{B044!f@h+SA^u~jeug(ysRI>}D$mGGn-Lbf1ZT3$~10y+drS5gn;Gl%S1{Pv18k8Ns^r*f)NI*C^j zsIOph&wNO;2tAMwZKXwaFn&qC(r?miLc!Kc(geE9b+|0Ko0p^!a`Vo5S5)e*T(vNQ+INlBJk5gkeCWWY!%GO(|dZfneG zB5{;DsXg&eOzF`CG(n7&?cTXfPJd#pEGOE%UQcuEJg3*!;Bm*A(vwU{My*u`aPI@1 LdYLcb{w};tD~p(Xb0!sq_I-!@+{FVLTDDLDfmvCSH$jHZhG!^-w zb5Y+ap|yDu{MYos_m}MPXrT{oxAa0*Z&$K@gAohHyTWH#3yk0S0D&%e(3wN!4|&Mg z{%>4ttAji@92-Wf@%eox%vd3jvLbsfUmcG9TZ3?PWh^}P0eCR1jht!dizmv9@ywx(99} zr|LDtt9cV#=f}aT`4kFw^n>A^5>1`QV#y>87X0!xqN{T-af>HTkCUUe^(%6{BSGWI zPPp2|1qb%`gs^G|f*K#9Jl+e*4Qrs@6pYoJA69j3#LQ<6xPd{q94m6yKX?lZhq*%i zWh~0?d{6R=yI`x}4Sl&Cs{CbW*}fhAGh>kQM2;Pmy`Zg>V@tIT$F4cz$ORcL)C7^C 
zb!*^lI)SG%V&HWv5EnD6aIMXNQga%r%h%%Bw8K~yyAicRBXE5MN1m45M4yUo@T?EP z{ReZf{)#WW4bJ4z>Ys>v_6=h8BPcX@A!boBM*K35d}^xTE9=_uxS=2UCZM{L!ZQKu zd#)EIy4S*O+_!wn+DD|d|6<{0!a-8%c#V7<-wVN)Z$K}c;F_22BxYwjqR;yqS9A`$|AmJU>SFJQkjrf)m47y@HP^hw^yhs zm&;{-<2c`lZL+K*8M2iJe#DNX6r#8}khAZ{@x|gE#CrXVVq(`HmdjR+2_}!Ryd@p}#L=sqL? zO=r6CK6AIjG5RUKo6rD$U=VK<2SZU@A?%vCME+#43IA-F12?^q`)utPuBI`V?D5^r z*ByIC#-6;%N9^s{!K-{4x7qC&|Ngizng8Zb_=2Ex{wPccU!H}s+i^1IS<~Uavq|3N z3Ly`F;sM<{1id|ocb{(0K;#}T-skfOgbpl@JM zaL60|Lc`wdKOp?AfrBF69xO#h4T+A4jf)?ekeHM_EM<7=h<8SgN*kS?F=p(z@e?NM zGPAOCCgo1f%b!x9Hy8`27MaY&(@ILGzuRfXdo$lJn>BmR+z-k>{HS8y`~{UCFI@D= z;;KI_(JuXTS?}uQD?VGfs-|}Jn$OpMv2Oi_jdhzgZ>ewia_d*ywm0th`p-Le?cURr z()`Wdeftj_Jk)ad$kAiRPn{-cyR@MS2y^QiU z36;Vo!%+U^Q>y$T%HQewI+S;Oj`;;ShFr6?VOeqhs=IcDJH`2zt-r?Ss}fXKhC(Gb zWlhe}>qLR}H6QMU_UNNY+~P8rHerNvs`gbDR`mj{1g+NB3QlunF!|9h%f+Uqlk-1WgB;`s`xt6m?Dv-ohQIZtTX1ce{bZ1$b zm<47?8uo6Vz`*`=5uF^z_E9BljAjEhElG)Nh)ic&3ttLi`ze_r$-$Xvql&X*224pG zojRHE^(i#I>G^c9*o(3lp+Qix+OsSfubsXrBQ4aB9h)oZn7+k5O@~stZJ#=FB-;;V zgEe(a=+=0q(6dOhE-loUK04W$G1`(k^*7v_-_Up3$3u%n>4kV$DI*KBV@H`8LyAvi z@)n^IoB!eQmU`1-Q?iv_l7_HlYyqo5VMfe={LGM2T9bZEh!(ko)t`oFy%c&DY0;#S z)9ffeU5YHR-jgL*rbT5WOC_yJmL#p`mekYrF~!u6B~PYL?Ne2DHC9QeJPLJ1Q;T!V q0a=BG0lK0hV~NMKoFY@6(clpr);G9skUCKVJpZ6c08;3`h5rB{LO@#p literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4748c07ab9f5c77e8fc8c141ac748fc7bb24c158 GIT binary patch literal 1354 zcmb_cU2fAr5MEp>P^lFXV9Q1p%1bN|>Q5?yDB^)}XwnpuCJ}AoyhKiHC28UxVh0x< zH~}X>Ja7OGz!^9R$6#hHm$(v@P+8gQot>F)zWF9f3p!;UJ!bC)Wuz~!2PxM7nZj#IL}>w9Itl$ zb1es`ey%X35RE3TXGj`fH73Z{La9DH*~53H0XyBSIh`IR5x}XXhCn z4w{E#R}Mq#*RFBU;keD5xzY2X*jPq)k4yQ--%cJZ(ck+By3i~a_-7eW*O literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9d247f8cae7a335a4ecb364c654ae9a66b3a8c00 GIT binary patch literal 3450 zcmc&%cUV-%7C&>#vP%)vz3VQnf&vRD$bv%DpmIS#UxQHe&M zMpOh_L@Y!FeM+$gD{907@g<5awy5~<8B3y>dsi%p{_(!=eJ}ID%$%9ierJaJ8^YNL zcZOnon1LEb%s@K;+iAzA?3oqz= z##q10ST1hek&DqA9T`UP*Hye}<|LpXNHQ+~R87=d7)}=P`nXJ8LRv;_yf#B?k&GwF z@v(9G)J$EP0PHs;YeuR%Qq@c-|BIL(h1+P!T({IPPe}psFD6g!lL7N4LW<`c*xtxO zW6gMQS=SwITr`I#iyYu?tuw@RlcG%}dRRC{3Jxo3L4VBwz^y$zZbLx+gJg&<{~a#8 zDg{wuUnm)=fc4E*Fm)A+a?;I-6h&V+v_lQY3j?96&J`Yqy+UWpdcabp9649p19jgG z#QG5MnMpx?nLS7xZK1u?2hOEf!QyByn6SVP1~;a_=xiI1EN>4Dq1$0Yn=I(;whNu! zu0(bNcZc*?1)^{A~~T7bL*gZC&9^j0lPwccSa22rBB# z;BwpcaQILcpw_L|+fu$}q)Rvcn%k%&Ue<6b1`Ocuq7s0jyEu6S&3CGUUaK1>5 zf=kzdz2OwRoEiXjciiB@xTSFQRT^YxL_$IC26#Q`D69zF48_6y;rc29z0A1{-Sav@ z*Xu8$V`3hZl_z_555ncQum@w}}keN&s1lm1= z6CO8B@7(>2mVyQJo^}A9Z3zO4D1S)p>`FWuQwj4=9w09c?GJf}#uF`7G&(={7a(3l>7tj&VNN+%`oDy0v1LGdt4TfM1LDj)YTwvpq8CdH z@N?~KSaozKy?VnaqNu_hRXJ9YJxhKkN7vmZ`yc3HVwXFK*y+|_G9$)|K4|igPE|*d z$H4%7SH?lk-9dD_nUld~Z?&jx141oNNdWCPpdQeVw4Q7ZLw}o(ZigH}8SNw}{(v`q z*-edTrx?Pq|1LW2STr46Er+GMMj+=4)^zEK*Jx{p6Qs-5ugIy5fv_)qI8hZ6h`u)D zksV@wAhRv6B3<%N#KebR5ckLDP`d|*Bk^r-g0tL*I_UHS{)ClDn7twwrb+znyDMA= zImQ2R;MPFdD)2zQ`W~SO8k!$qp-D`Ma{zyWDOzMAmYABETePvXYTK^8l#z9iE37*z zJ9W0P?b6lG-a+N)hi)#-zwYx{pUY=f`_3qR6bMJor2l%kQegpjj0tW>JhiC?e zhJ_D_7&>hDh{%yq(W6GkjENnq9Tyj$Fg`IUIb}ksE=|u(OgCg?PRh!jJZ0*%=`(U> z&YC@EZtfTJ^5!pCnE&OX#a}I1`j2JHzg|(Wa@Fd>HATg1*R9{M@taL0n@hKB-L}2# z+Z{W1l~?TE^UuBeDyyn%_8&NS= zx^k6k;^Wbrx!JU9GY}`eeuEFzg9GJF6%NvEb)f}kb?fKbC3k!|fV(tDg&%6msqbND z_p!5I&Of+#Ny5RyRd{zfHiJhr2b(K7FmJ{YB%JcG5zfC+6SUAx@Pux%|Ib?_ojCKS z7y->GEq{F`0O&Ww>TcEZn*E)w!Ha-vAjT=G>N_bJ!pMJvJ|qr_E;)_o!UO}Anrz50 zNk~oLNd+@VlUUZQkj?F&s+XzG2t@n-fq#@H)Qnn#S(#@=xwy(4Xyl#1^A~Q7Z$^BT 
zKHzUbPtcHyTXlIh#MXaW#o`-ICkS#Qp?YEaDKfNtOO2@sX^9y|p1+J_SCHzCjjGTF z)8Sv;ejRcU!bpd*mE+7-f5peSM z*grPqkH)A8a$8QDJ~1;P!!?fMT(#-x`YhW?3F(GpeVVPiR}c3dYK2A#@b(9)DS&YN GU)LYZD$;EL literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..78e853246b125d8723fcba4db0657d7509e2c93c GIT binary patch literal 2667 zcmc&$dr(wW82`?_cb8oSLEtV6ZHf<8G~Oc!3=wyE!-{|@EQGkrt~}hA>>_A}SlQqV z`5G-49H7x05mYj45KvG0D+o%dp<;bS0j^6~HBGL94j_aOJu=f|lF?R*;*>0_)03{IJ}G6|zEE82F0pGO5U} z4FF_YM+x#(G*JV`8IDZqmZ zo&X+Ck->X}gtmz&j!97}vP+~QyRHPM{p2oo@K4$lNxOj=P21N=Ybq!+=#18Ut=g(~ zD#lE)How56FEbeVLI#lJERwIM<)hy82Ei>9W+&M7;Y#a@jtJ)nL-X-Oe4f*f(0RyY3w9 z?Y)Plf|82-Gzc{(JE4rD-Ec@=ijGyCr`B}R@J(P4+K>|lH^N3JX=@_gc*GIJTiu|2 zM;{DVZ-oanrSz1*)l^h?grIoy3&Ly1_oLwA9;RWfP$=-tqh>_!7Zmj82(}%)2`SxD zXuOv~InJi&GPVm6r`>_UkRQigxjIMS4o(ohst4}uP6j7cJm@F+Q#TiN!nzYZ^q2Fa zq58-YYNShmKA+nUoA)n72PTg|yZdr#&)PVc6qgF!XI`W?`w773t&hPu{!jQJe=pEc zL9~gD0Z~~sb8zt%;k`{3__^;*nE8d5pniKU)zBV{x@L6J&4UlnYo{*L(LGZfrmbC0 zwRjBDm3d(Tzm|=(K1f9mfCVD!3SjlsB!O$y3h?Xb7Eb&cp^@Kdczq}lf?j!<_FCZx z?^&v8kGu zIL4lW~BMeK>&Q8xu96 z-Suf=4f9KzSmRjUZR`)xfP0;ZICIi4l_$eAAo{**Sblz@jHPUVkTmvXi4Da@0uDM0 zqc65t9dvpfd3E@S_0p0mN6GQ=^4baV8XkBa?Z#8zkqhgtWGVg;iDW|rJ1{*74|a)E zWakJx*!hRM*mx8k7?59de-=DAkAj6>XDqbZkgfwsdXfB;n|zxa28buUKO^ST3iGT9 z1*y-JeB`3=szbg{$FnDbGu2BCKm}K7fJ3ry90R2q>fx> z=De&(+r^SLF@c0aIR(dY8Td~qmE+1eMIo2KafMuLA-|`<3Udh;%W=tM#%ueGS3GMW z9*84u-pZE(1H&;BpO!?{vE^ipCj)lPaY`~MRAkA^xgfG$stHL8)?_U#(ELU-g za@#ra$r3B9!||Q88SzEh_W9CPhJ-R4hp&^^WB=Hc9F0vAq~2zA>9R7N)xW@O_E(pd jn##SG>q;%fCZl(7SU_+b<3x-7&A&Ddi)?Lfgg`!fe zxPrx^pimX;C`%C)_Z2tlI67*rTiuSHj@4G@zL%m>^^ZAc&h$IU-M;<%?zz7tmW}ga zD8`9#RpSZl0BnQ$Z7B=TMm;?6LyYz5Ne_3mGr-s}VlPI(*y681<_{jiPM$>ADGdPT zP6QqW2$2H$b|hmcsUHYzr4sXGjW#tYLu1Hb2(dt1j>KD#JVGw~kR_=X7YjoG9&GUh 
zfcHwKzZ$#eLX=Pl%ux`D86453iIWyFk~eWmgq<9T5d9$|#<9P9ikTL!eCz~4x`Mo# zdL6unRjMu}FDFfFOiI-lHCCB;k(rv5qRY<9(emkBLgLFvUP{VO60-lo<$IHSX3fmE zQ8QOaKF=nzr;TLz=TjleZvoVFu+Y&w8N4?3gB#~9;c2lO+-dWKl)fTVxmgEGCW^pq zRU7E8xdABb;juLVMfWoyvF>*`_p%BE>4C6$tQ5X}WD7IauxL)MC6Of!gx2j!IJ`~; z&N<%jc=$_nsnLt$7) z7K|_G4dyHD;bKe;R9NSOqhcpIQ6obx&D9`U904xPCqTbz0B9eGp~-qYte7l=CBJX`3^goFIU*jveT_1%mqH-QZGpd)U|72dMQ!L0SI<3L{-0rgkGpt9_uH@PPF_ z>tWWbTDS>=;DU-Fu7CCk6b~1H^jj4a-o1gc^18!z$`x`7&7s7LhPJvo@R}74vCjpt ztH=}NMFQAXs(~X{ZQ#&Z8qRJ|qNu8k;A%JqFJ^>;%WVamOD=({FSSr$jEB;~3V1c` zAgoetfwHJzxW0x!FXr5WehYhnbB!-NdORO0FS>)P)($-_{{=au-b6+Z1bVFtgfESO z+}cq@eZ~%|sOlyByLJHj-n+D$&^a0^`&3es9Lm6Tf*TcE@f0;g9)pg65t}badlAbB|!U zdi5Eo-TNFlgPI6DphQ)XR%pb&CTN!DqWyE*iFpkq6c>ym+DC@Leg7wfWSfdCYqbRb zx_DH-sSU0!TnVp=a>+i5IYdyvKsvLer{M0voyaG%h1yUk5R9BX*rHV5(<0mP3cYUM zLlAi*y6kxrVL5;x^O#0Zxjcg-zCT*rzVk6H0xJlZ*#gh1Bf%;m46+@)iH8##VA0VQ z^8CnPSlBw5c+yCtv%?NSP>>t7tM?PA7fmB-3WtK@&}e8n)r%Arzd&va>cA%K1zbz2 z1u{rU>X;!Q%v(t9nY2Rie7OOBYMT#h4(_1WR*WMy)cc@D_XhH#&A*f5kKZDLTl$!} z6iy>{C@z}Kp5RaKHG4p3D-+1WV1U5YDKO_w1Z_8WI(Y4F5_G?a(34;AyQ299l!FJ6 z4%01R(a>dH8R+X+^ zvv%G34Q1sUzpmKyP37h-Rn=R!)zp5weaFta`d#1sefOS*#-`@)TlTi@+kc?#;Gx4u zjvhOH;^e8*XWGx6JAdI~#}AkOarvKDuIm5A`J?rg-DX~!jr5}HH#ld$dXUJmkG^Zy z(R#^`6*qsnwRua39^f{ur(#>X>qp(e*6w0!_s;%|t=;FWJ-BjJzO1K=AxwA^(e=qT z&&kws*pp_UvNH`vv$X6q?m5SAcR4L?V8qWv@+L-J%H!JGt$FVQtz?w1QApTm2{Z~y z1aO*fiw6$8-~B(Llxza+yz z^sf02IDKE9(jq^Q2n;D1X*n8(!tm=xyoKU!GW9n5AsC3kkC1dNHX|?zcg0Cg?oAsQ zf)S{LIBiP8w}eA_0(U`*?c&6a95ThpEPnR$@Mwd92PaXRkVahLRRM&WLu`s|o%5+Dy8KYX5?KDLil zxz?B@LE@y>D?TwT7?w6t79rcUeRzf!7}0lfY|69o{9|J3{k D8Fh$h literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0d95d679984cbe1a72e4410ce8b15f3ba58a4945 GIT binary patch literal 2845 
zcmc&$e{@sT9sl0@-b-FeN=wO0lM*lm5{kC;hqROuN?%G_+NLFKfs$e^X%pH&(maxs zw#uO3a77WdM>cdj#T^+N=r{v$M@qGbLq$DK*$>#xHgU(n#546AJR9Ps``#Av-a_{}#?~nWaar3!x%Z>RGFQrNbmqaB<0}w6qa)wu*X_0#+kkTY$wj`ohBWWx} zkR>K$rfVwz&{-*tA|WE!pC%TGHy45?)ojwWd%JyYUEXMyM2({HF=E_J?8SERcAm`H z7z#)fa7e`o0QoYh*o7gm(5yTMIs&{-TLuD38EN7ZGcib}>%p4Dgv2CrS9Oe-z|FfT zOq!`CKq>bj*!S|+GRR8yg*$pe{_a?t&l~e52XGPawRMDpJ)v$jr;EgRp4f*t`^(hw zUmSVMOfQSlR-H>4j`v}d|9k<#&c0RjbW;WVt>_XpJy*r;c|{LJ&o`4Jdr!jqgS+7C9T6@wcQbVq&ldti zQ#2P}J52HeWBl+ojizbSqNLr*l%(LaKM6lL@hKQ`h_L5M4b{)3T#s}Ns%Bh`Hf&r4Dc;gscK2QzGo(c#~%b}m%I|AF^9OHh|R0@Nytf7~V3FKFGuY<#3<6pSr z5*#sXpg-NV7^W?*h0$M5)k~>Bnm}^Dor7iFm<5m6W3-Jru@U zB%RPoKjE{b0qyl)uzEdAJZmJ<2b|S_6Zh4-gp>;KSwAuZT3%cagmS!~(TcjH3Hs!T zDXEjvCL1JUy2+d|#WHo;9oEe0GiKWCv$AI2c~^GMoZP(pg1hGy7R@W3U$Vfluyj$G z>?~hgQCYR5x~A4uSHINV(AczW`HJS1o|aYjwBFmc+S}gY^RMab3anih40VS=krZ*g zJ|ZZ5G*+sBp=Sg7?v<3(zW)2>-S1QYHVO)SOH#5op@YrnV4!1*KUsmTPoc~<2Zskx zp?E|A{kPGnPT{w4Ha9#-&cSyw-F$RVI0xGox+O*b5C$tK`iFOvEGTl5)Qz=|)YVTt z2Rkdzfrz2l_Zch_jYHUy@c^I;1-coW`J;R=5RGa5K|ccrM1ldWaqlGK6BF$}Pq7cF zP~LXUT$n$+FTO9Sf;1>Tt!(6ubP*x>XqR+YSRgl#{ zX{nsQw!NSaYciK9VUc@T{V~LNQ*h6zT4Lmw9m+kjvp3!+HCSeaIF7vcAV3%jr;*9B(8N?zL|4N1}mnw>7_LPX3%cv&#Z- N?E|AcfE)i9{~HhQtA+po literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..81083b2ee90f8208676313b318811207bed04c6d GIT binary patch literal 2537 zcmc&$drVVT82_D1DeVTxqqk{Ybxfe5A_XT#*=D#!TfhoPraa=*LMu?{ZKcIBjgxF6 zb4t`07dxjD=i+OCY~tdgPThi!I3H6t+5VAX%ydqpnOQd7aOd2vJQV(0<_Vnpo$vYk zzH`1WEf5ORco`qXt1LXnBL-0D-W9nQ@kizf9()Y1PNw`QUe3#PQ;?(x$~1ix0A(CU zca9Ja`evfzW!iMeqqG`jnN_qcth9P6c}~r#*Aum$=%?xx!+FvsQz+m$&@GC7fXAT` z%46$sXc|RO(od-=+$Xuz_+PjjgWDU@2RA9g<|?mq+C}d|o7HQLsG>xbZDF~qrp_r! 
z2^=KqV?=+4(|^h7`s4Id!$cf1t%fjub(n?kCQ{Llh1@?%Oz$j#BmFJ3b_wV@vKYzR zC*az5;kdsx5jQ)>p?tiG?A+zTnguE(Zt8^Vr$oRIi+hnAnr>I2bpPMD+`9+ziVW;3 z*5aePQCPlNAS-LbIfpg_$J`zn7eEOTh-6hKxI$0NE`A6s}pwY>?q% zTLP40;}EOL#J7$ptSg;@>eX48-Q~di`sbm1FBVtwTCpS2htY=5$k|pMi9fOzs@2(u zKXMlC1Ct@%QR8srd~95-!-v=RJVVg&&9nG1Iu>6Ye*vc)_uIKHSur^->?LIRaf4N2+b- zCBWIJ#D?Thbnf4eUmxHguD0N#Lg>Yks!v9 z`^~?Tk+$o^n?&Fisg#DiPJ zlehkS<5;shJ9*@~Qyz zbUYVm%k07LT61f4>Em3D_iOr<56X~30aCLckKzZ;qVS4>_rvp}Qd6`+u zG}0Zg3CPh1!MjKtN&Uw$|5D(`lZBeo>a>>sc=SG{(fGf-RQ=O>N$QIcYEG|EKZsBP zTAe0bC@jn`$_P9xO|#6vR=Qvm1Yr*SXKR)q_yl8xU=oB1A*({-8L7f-Mr8>?4*Sw` z@Jmk;>0ut2BO!m;1cPBJC1&PjvvZb@?K9a%T?>MRZN?IIl;lDxJ6D%Y%1bLNnpbBt zO{*>`F05p7X930cx`S?#zbwfK1wmK9pF}d*AbU?~QMza|R|r;SFNLS+&{YyV7tWo_ z&be$0q%J`Z;HjoOqt{xL?kXwHbCniLsnd9K0{P{>Iv5Wv7F`4Iuu>Y`HuF3$69_pL zrY|Y^g7N3%NcE=0rYl%_L72fva}BG3yVNwzQ8uZb)?_kMqDA(x`csI3OQAfYWJ!_J z?5Mm%F!}=bB!OjCR7a3188DKP4DhAYOPr=U8b`{L*;D`2lwD0g6Ph@;wRTCJ-8-?| n?Vf0@t#$e0mfCAQRW30uZAwa7N~+eP1N#1OSOyf(A9?=(tu<#2 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..28e2ab00c7a1b435bea1614c8f8f03dd7c991c90 GIT binary patch literal 2537 zcmc&0Yfw~W^nBmF`#{-5;%>XRB5JH;yq1KJA@nY9ToLdAizzPb%7V-8whusS6f0+_ zAs>y-U^+shnVOO&%^;wlhVOjPls3~OH9dT!HiJ`&ll6V~wmek+HF*ZUd%p8N=R4=j z=JNSEj^zTlAOnXv2n3L8mU_IeZw#3*g99#*Q$`Sa04L?7ns^A8`*^AW3;_H>F*z|p z7_?1D!?Ef(kOrt#ekCTmx!7uQTRE)6%JoRufV2r(`AC-35hN7gFpwjFd;lH|jUhA& zkA|j_`#6J`l7#y>l_>ukE~eo=WCp?wSC}28-b#zzQ*1VQOp0=Xl$(o79X4;JT}Yr6 zDZ7yNHr5`&nn6E<*a#k*icuz}rE+%tM=^m*pJtQSD{&CgKDp2deZ zwW9k|&f!?c4Oz{l^U%HeJ9Os5n<(}Gbei4Iq zXDLwLu?{$_b)n<6J$Q8+18Zv*;-0x_a4WtKtM+CvjYs_<{$K%W+0hMG>o&pP^)6;& z^eUW`FkMpq{y6E46Q3bnc_+JVjZ`{!<&04c&T*q`A>T{39=iiUkx0_mpN;*eVaCg~ zL&oHLa60y*EB*iceZYIdRb?l)f 
z8>Rgl-0(~HoAAMj1ClM<7vgO#I@CU;jd|(gznE9g{mLYFPLxevQ-u#iUzV+06ffzL z-Imy5jLa!;L*nLASam&95?H$&B3e77V=p7r_XlY&(?<|9;}s@&xj${nYlypzBIv6Q zjEwL^DwYI{kA4W&VhYwPT^3Y+u5;)SZ zju{wTo5fz{5%-NauBXOo8^)4xecbv8KA)7Bx;_q6Qg^A&8 zgP7)d13mOcdKLJ5481QaiOtcK6fE$X)2A;p7Uo+izcQD^x4eQJg1r>M2nm9KXiq>X zZ|3SipRS9G|(bi;!O)^d#uZs z1%aoaTP6{7YXd+pcLI+BM3Bot-i~B6B@G0=y;LGhSL-yBbJV6Bnh*=b^+?==nA>UeQr!9TeRz;sDxngUq9&aO~7fge^zJqCuI0igR; z3@x@3VR@De7C-wAqE=A#F9eN8d&9*(j?lh$AW-W@fTHmcl*YJ0 zeBDNnZuNr=geR=)+X%B?)WHoH4(F9LadrL&uw=9dq~9u`^zJp3SI`HxQ|_QG6+(qK z4>}qezxHu1b5RBcseT@+;02A+4Ksy z{9Fe`<|J5ES_Lnr?}wGjZ=fxYqi^gNJir^Myg+u5&<-H~fyA zH8+si6M;eJ2GL97VeGSos4-^;RbKrZ{!tfz_V}#oEpUy4%>y@6Q=BWoJ-Eyff*$Q}3ZX^f#789Yt!+5zBeff9y*CW5&Hfnt-pU?A5BL;=l^D+*k^47Nh z2C*#?sC*JjSO*Yf0o@Erw?E*J|MgzC@4U|wfei$I+y;MajRBjKD9G>cL;NM2~ATnA*R zg4EODASfuKc28N(f3nO3H#_En*I8%YmsJyp^^Jb0c~BEsedq<6c=Q$-);7?}t#mrE z&FK*NNm>xkd)wz^z9NM@2qp+wodLynM)Dj=W`Osu7Ji=#2tE3pgjt@i zONu5CyN%-rkq_cE$0^YlrZUng?MJf6?lRKm{!C2!^;6NA3iwIBT^OFdL@!-LI%#>JBf@y$XPm0I4 z5(sx4v@2#;>Y2eZ6_hH`QCW)zsE~yM0G}L*vfx{=RE>Q*%o~>z=mn_qOl*p=1An zgNF_uIeP5)iIb;JpE-N({DsaRFaBhSH`4%uOY#*`1N_W0z%NY(5MAaJkU`dE$>$2{ zf~&kP_iJ1-*Ksm7U2usuSl_@Y@C=HZw_Z!Yrb~323*M(obcc)U-vxrZ7k`qU>?@-Q z3k;3y`G^a(xjGh-GEG!|uE}hbnV-o%@c1<^u@cXB60fD@yJ&eMhj*_Z-dmq+7usFL zVipD``_N`<_J{#GS>(cmFN6aveg&WsLAQoP(5=Hl(wn>3Sm?rmDuGB_`Q85SLL|GF z_>`Zi%QjnZ_8=0UMDmOM;X)Ktj^9%QWAj+!Hccy;uFG37W48WvwX*cNQ%26F5TmnRqy3|1PY5F7SUA3u1v( zCRP29e{%dq`2T;ov47c365sAC7RUwSa%cSfNo5jiCNXhzQi$bZNj6ZvECn(V48x4U zUzXw+rjUuqW}+D;n~BWk_9C#tD0UUeFfr_m`R*BW+|tB)U>z}|XAbf84aSS8_>n9h zS;&r2?0{V}jEWs1QdpLgGYXcUn(iO(m!34fKodP|YD#iq4lCCt;QVIf;eqoP$vMGE zV6yn*maMFYy(u**P^VF5GiuhJ3y;gel+w*7jvL4FvFuQj>A z^{L5R={VjPOMS5)^~?u13)5@)u&s list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.0, + } + launcher = 
PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + basedir + "/expected/signature_calc/bands", + basedir + "/expected/cluster_analysis/docs_to_remove", + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py new file mode 100644 index 000000000..fca5485b4 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "output", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + } + launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input/data_1", basedir + "/expected/data_cleaning/cleaned")] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py new file mode 100644 index 000000000..07710b74d --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -0,0 +1,83 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + # # create parameters + # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) + # output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) + # local_conf = {"input_folder": input_folder, "output_folder": output_folder} + # code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + # params = { + # # Data access. Only required parameters are specified + # "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # # execution info + # "runtime_pipeline_id": "pipeline_id", + # "runtime_job_id": "job_id", + # "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # "minhash_num_permutations": 112, + # "minhash_num_bands": 14, + # "minhash_num_segments": 2, + # } + print("====") + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + # # When running in ray, our Runtime's get_transform_config() method will load the domains using + # # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration. 
+ # # columns used + # "fdedup_doc_column": "contents", + # "fdedup_id_column": "int_id_column", + # "fdedup_cluster_column": "cluster", + # # infrastructure + # "fdedup_bucket_cpu": 0.5, + # "fdedup_doc_cpu": 0.5, + # "fdedup_mhash_cpu": 0.5, + # "fdedup_num_doc_actors": 1, + # "fdedup_num_bucket_actors": 1, + # "fdedup_num_minhash_actors": 1, + # "fdedup_num_preprocessors": 1, + # # fuzzy parameters + # "fdedup_num_permutations": 64, + # "fdedup_threshold": 0.8, + # "fdedup_shingles_size": 5, + # "fdedup_delimiters": " ", + # # Random delay between reads + # "fdedup_random_delay_limit": 5, + # # snapshotting + # "fdedup_snapshot_delay": 1, + # "fdedup_use_doc_snapshot": False, + # "fdedup_use_bucket_snapshot": False, + } + launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input/data_1/", basedir + "/expected/signature_calc/")] + return fixtures From 8fd9676f36d33e9c304309c956468a207a0eff52 Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 18 Oct 2024 11:24:20 -0400 Subject: [PATCH 39/91] Added python tests and expected outputs for the tests Signed-off-by: nelson --- .../src/cluster_analysis_local_python.py | 2 +- .../python/src/cluster_analysis_transform.py | 15 +++++ .../src/get_duplicate_list_transform.py | 16 +++++ ...t_duplicate_list_transform_local_python.py | 44 +++++++++++++ .../cleaned => cleaned/data_1}/df1.parquet | Bin 14986 -> 14933 bytes .../expected/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes .../test-data/expected/cleaned/metadata.json | 59 ++++++++++++++++++ .../docs_to_remove/band_0_segment_0.parquet | Bin 1497 -> 1513 bytes .../docs_to_remove/band_0_segment_1.parquet | Bin 905 -> 1497 bytes .../docs_to_remove/band_10_segment_1.parquet | Bin 905 -> 1523 bytes .../docs_to_remove/band_11_segment_0.parquet | Bin 1497 -> 1523 bytes .../docs_to_remove/band_12_segment_1.parquet | Bin 1505 -> 1532 bytes .../docs_to_remove/band_13_segment_1.parquet | Bin 1497 -> 1526 
bytes .../docs_to_remove/band_1_segment_0.parquet | Bin 1497 -> 1523 bytes .../docs_to_remove/band_1_segment_1.parquet | Bin 905 -> 1497 bytes .../docs_to_remove/band_2_segment_1.parquet | Bin 905 -> 1497 bytes .../docs_to_remove/band_3_segment_0.parquet | Bin 1505 -> 1510 bytes .../docs_to_remove/band_4_segment_1.parquet | Bin 1497 -> 1513 bytes .../docs_to_remove/band_5_segment_0.parquet | Bin 1497 -> 1513 bytes .../docs_to_remove/band_6_segment_1.parquet | Bin 1497 -> 1513 bytes .../docs_to_remove/band_7_segment_0.parquet | Bin 905 -> 1497 bytes .../docs_to_remove/band_7_segment_1.parquet | Bin 1497 -> 1505 bytes .../docs_to_remove/band_8_segment_0.parquet | Bin 1510 -> 1530 bytes .../docs_to_remove/band_8_segment_1.parquet | Bin 905 -> 1497 bytes .../docs_to_remove/band_9_segment_0.parquet | Bin 905 -> 1497 bytes .../docs_to_remove/metadata.json | 36 +++++------ .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes .../data_cleaning/cleaned/metadata.json | 46 +++++++------- .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../expected/get_list_transform/metadata.json | 48 ++++++++++++++ .../python/test-data/expected/metadata.json | 49 +++++++++++++++ .../bands/band=0/segment=0/data_2/df2.parquet | Bin 0 -> 3984 bytes .../bands/band=0/segment=0/df1.parquet | Bin 2753 -> 0 bytes .../bands/band=0/segment=1/data_2/df2.parquet | Bin 0 -> 4763 bytes .../bands/band=0/segment=1/df1.parquet | Bin 3122 -> 0 bytes .../bands/band=1/segment=0/data_2/df2.parquet | Bin 0 -> 3695 bytes .../bands/band=1/segment=0/df1.parquet | Bin 2862 -> 0 bytes .../bands/band=1/segment=1/data_2/df2.parquet | Bin 0 -> 3684 bytes .../bands/band=1/segment=1/df1.parquet | Bin 2537 -> 0 bytes .../{df1.parquet => data_2/df2.parquet} | Bin .../band=10/segment=1/data_2/df2.parquet | Bin 0 -> 4466 bytes .../bands/band=10/segment=1/df1.parquet | Bin 
2537 -> 0 bytes .../band=11/segment=0/data_2/df2.parquet | Bin 0 -> 4906 bytes .../bands/band=11/segment=0/df1.parquet | Bin 3450 -> 0 bytes .../band=11/segment=1/data_2/df2.parquet | Bin 0 -> 3317 bytes .../bands/band=11/segment=1/df1.parquet | Bin 1354 -> 0 bytes .../band=12/segment=0/data_2/df2.parquet | Bin 0 -> 3138 bytes .../bands/band=12/segment=0/df1.parquet | Bin 1354 -> 0 bytes .../band=12/segment=1/data_2/df2.parquet | Bin 0 -> 5020 bytes .../bands/band=12/segment=1/df1.parquet | Bin 3442 -> 0 bytes .../band=13/segment=0/data_2/df2.parquet | Bin 0 -> 3138 bytes .../bands/band=13/segment=0/df1.parquet | Bin 2537 -> 0 bytes .../band=13/segment=1/data_2/df2.parquet | Bin 0 -> 5244 bytes .../bands/band=13/segment=1/df1.parquet | Bin 3413 -> 0 bytes .../bands/band=2/segment=0/data_2/df2.parquet | Bin 0 -> 4782 bytes .../bands/band=2/segment=0/df1.parquet | Bin 3177 -> 0 bytes .../bands/band=2/segment=1/data_2/df2.parquet | Bin 0 -> 3988 bytes .../bands/band=2/segment=1/df1.parquet | Bin 2758 -> 0 bytes .../bands/band=3/segment=0/data_2/df2.parquet | Bin 0 -> 4323 bytes .../bands/band=3/segment=0/df1.parquet | Bin 2745 -> 0 bytes .../bands/band=3/segment=1/data_2/df2.parquet | Bin 0 -> 4341 bytes .../bands/band=3/segment=1/df1.parquet | Bin 3122 -> 0 bytes .../bands/band=4/segment=0/data_2/df2.parquet | Bin 0 -> 4035 bytes .../bands/band=4/segment=0/df1.parquet | Bin 2537 -> 0 bytes .../bands/band=4/segment=1/data_2/df2.parquet | Bin 0 -> 4860 bytes .../bands/band=4/segment=1/df1.parquet | Bin 3413 -> 0 bytes .../bands/band=5/segment=0/data_2/df2.parquet | Bin 0 -> 3554 bytes .../bands/band=5/segment=0/df1.parquet | Bin 2753 -> 0 bytes .../bands/band=5/segment=1/data_2/df2.parquet | Bin 0 -> 4872 bytes .../bands/band=5/segment=1/df1.parquet | Bin 3122 -> 0 bytes .../bands/band=6/segment=0/data_2/df2.parquet | Bin 0 -> 3553 bytes .../bands/band=6/segment=0/df1.parquet | Bin 1354 -> 0 bytes .../bands/band=6/segment=1/data_2/df2.parquet | Bin 0 -> 4311 bytes 
.../bands/band=6/segment=1/df1.parquet | Bin 3450 -> 0 bytes .../bands/band=7/segment=0/data_2/df2.parquet | Bin 0 -> 3765 bytes .../bands/band=7/segment=0/df1.parquet | Bin 2667 -> 0 bytes .../bands/band=7/segment=1/data_2/df2.parquet | Bin 0 -> 4158 bytes .../bands/band=7/segment=1/df1.parquet | Bin 3289 -> 0 bytes .../bands/band=8/segment=0/data_2/df2.parquet | Bin 0 -> 3781 bytes .../bands/band=8/segment=0/df1.parquet | Bin 2845 -> 0 bytes .../bands/band=8/segment=1/data_2/df2.parquet | Bin 0 -> 3997 bytes .../bands/band=8/segment=1/df1.parquet | Bin 2537 -> 0 bytes .../bands/band=9/segment=0/data_2/df2.parquet | Bin 0 -> 4018 bytes .../bands/band=9/segment=0/df1.parquet | Bin 2537 -> 0 bytes .../bands/band=9/segment=1/data_2/df2.parquet | Bin 0 -> 4326 bytes .../bands/band=9/segment=1/df1.parquet | Bin 3314 -> 0 bytes .../expected/signature_calc/metadata.json | 54 ++++++---------- .../test_cluster_analysis_transform_python.py | 4 +- .../test_data_cleaning_transform_python.py | 6 +- ...est_get_duplicate_list_transform_python.py | 45 +++++++++++++ .../test_signature_calc_transform_python.py | 45 +------------ 93 files changed, 345 insertions(+), 124 deletions(-) create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py rename transforms/universal/fdedup/python/test-data/expected/{data_cleaning/cleaned => cleaned/data_1}/df1.parquet (79%) create mode 100644 transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 
transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json create mode 100644 transforms/universal/fdedup/python/test-data/expected/metadata.json create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet rename transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/{df1.parquet => data_2/df2.parquet} (100%) create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet 
create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet delete mode 100644 
transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet create mode 100644 
transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py index 7c162b1b1..915cdcd1e 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -37,7 +37,7 @@ "runtime_code_location": 
ParamsUtils.convert_to_ast(code_location), "cluster_num_bands": 14, "cluster_num_segments": 2, - "cluster_jaccard_similarity_threshold": 0.0, + "cluster_jaccard_similarity_threshold": 0.7, } if __name__ == "__main__": # Set the simulated command line args diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 2a5ec3e6b..412fc1fa8 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -33,6 +33,8 @@ """ This key holds the number of segments dividing the hashing space for each band""" jaccard_similarity_threshold_key = "jaccard_similarity_threshold" """ This key holds the Jaccard similarity threshold above which two documents are duplicates""" +sort_output_key = "sort_output" +""" This key is used to sort""" # command line arguments num_bands_cli_param = f"{cli_prefix}{num_bands_key}" @@ -41,11 +43,14 @@ """ Jaccard similarity threshold above which two documents are duplicates""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments dividing the hashing space for each band""" +sort_output_cli_param = f"{cli_prefix}{sort_output_key}" +""" Sort the output""" captured_arg_keys = [ num_bands_key, num_segments_key, jaccard_similarity_threshold_key, + sort_output_key, ] # defaults @@ -55,6 +60,7 @@ """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" num_segments_default = 1 """ Default number of segments dividing the hashing space for each band""" +sort_output_default = False class ClusterAnalysisTransform(AbstractFolderTransform): @@ -98,6 +104,7 @@ def __init__(self, config: dict[str, Any]): self.jaccard_similarity_threshold = config.get( jaccard_similarity_threshold_key, jaccard_similarity_threshold_default ) + self.sort_output = config.get(sort_output_key, sort_output_default) self.data_access 
= config.get("data_access") self.logger = get_logger(__name__) @@ -225,6 +232,8 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An "jaccard_clusters": num_clusters, "jaccard_duplicate_docs": sum_cdocs, } + if self.sort_output: + filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc") return filtered_jaccard_dataframe, jaccard_stats def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: @@ -308,6 +317,12 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=num_segments_default, help="The number of segments dividing the hashing space for each band", ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py index c7b4cbddf..c49124cf1 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py @@ -31,16 +31,21 @@ """ This key holds the name of the subfolder with the duplicate records""" consolidated_filename_key = "consolidated_filename" """ This key holds the name of the file with the consolidated list of duplicates""" +sort_output_key = "sort_output" +""" This key is used to sort""" # command line arguments subfolder_cli_param = f"{cli_prefix}{subfolder_key}" """ The name of the subfolder with the duplicate records""" consolidated_filename_cli_param = f"{cli_prefix}{consolidated_filename_key}" """ The name of the file with the consolidated list of duplicates""" +sort_output_cli_param = f"{cli_prefix}{sort_output_key}" +""" Sort the output""" captured_arg_keys = [ subfolder_key, consolidated_filename_key, + sort_output_key, ] # defaults @@ -48,6 +53,7 @@ """ Default name of the subfolder with the 
duplicate records""" consolidated_filename_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") """ Default name of the file with the consolidated list of duplicates""" +sort_output_default = False class GetDuplicateListTransform(AbstractFolderTransform): @@ -69,6 +75,7 @@ def __init__(self, config: dict[str, Any]): super().__init__(config) self.subfolder = config.get(subfolder_key, subfolder_default) self.consolidated_filename = config.get(consolidated_filename_key, consolidated_filename_default) + self.sort_output = config.get(sort_output_key, sort_output_default) self.data_access = config.get("data_access") self.logger = get_logger(__name__) @@ -118,6 +125,9 @@ def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl. "consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, "consolidated_rows": len(consolidated_dataframe), } + if self.sort_output: + consolidated_dataframe = consolidated_dataframe.sort(by="docs_to_remove") + return consolidated_dataframe, consolidation_stats @@ -155,6 +165,12 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=consolidated_filename_default, help="The name of the file with the consolidated list of duplicates", ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py new file mode 100644 index 000000000..be90b3073 --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected/cluster_analysis")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} + +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) + # create launcher + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet similarity index 79% rename from transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/df1.parquet rename to transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet index 11964c2e20b7cef92e09929a3a25e0cee1f17d64..d67b5bcf87dc622ba80e30d210bc1b9b4429fbbd 100644 GIT binary patch delta 2417 zcmb8xdpr{g8wc=>(OgEGHe}{D8zpxgQ?UuzScK{@lUz=O8Riz5T+)SDZYx^a+)F|v z$-$6YE|Yta%Z?N>q|Qs{^1ko+eBR4n@B94o`{(z0p6Ac+jmS}kr8B|B2o8d)!;KyP z3;+xO(5W~0uDmi1h>x1hFW}7E(k2KY%*3(1U2U)j&sz;}Elane`25nM0 zd6lM?vuUz;jPc#?6Wgq9x0Lhccv%XN45R>+CKo>ArkZoA=V$87sUE&GRaR5VP;Tx$ zQCl4O6VG(Xky+{M)O327KAkg;5C1sKUw=`xo-*RWxURxkF-wPYqNNWGY}>iVH@HSK z@*@r&T((@A9*uHnf2qz-d)Qy5vz_9klE9^{#;%+?cwC4|AnoAtvJ5HLVBInCjuL)2mbQ8$;YfdY^s;UA{K%r0;)5hkTnGzFCgnw+7ra};rg-EvHUq=h3~579uN?{ zABtvbVmOZW@$%d%&s~JZ)wOmL?|@>X37r5@N10Cn7r{ZOq({s|h$tbD&qvF~IlOi1 zPog&v$bOJFp^^OiSOOBYUp_NpOy7e_J4-R{W()TF51z+_g}QIO*lOZacLE_ERL+CS z1mtf+Tge0H!>X&cYq~W!dk{rcTmCDDDHn|~x>pOPJuZ89z$xamD_`$j>HPs=Q)6@CQ!==xaPy?iz&fKSS`Dq}QKc2ISt zPu0v0qU;pR<>krvDHqawwyRi%PoL9d1hRzgwyOYp_CvB?7_jQ=3jYiZAB`@NMDyd4iCyd*58r*M`SPg;uH=Mv{HE!Sk zVn4d@R;wg9(rfYYKI@+)d7>fHc(zGLRYOv9p=pYFls~(aQ(ftiz={j+IndaecR{By zYmF`o)x`|cLYCL2BA7O$$taZV!W{I=3~#59$T*Y#@|7rHmfZoSUAi3;-12nx_|i5` zB1vA+u{=jjSNQtl+XP4#UrkBI&98OcmU)UGvQfyca@pN?sp-V1t^P5Wp^`}j3eNm3sJ8^L(8#9{_ zC8sk5UL;lK4@a(vij{MP`lQuyaJjGLB{D=-JZ!l=P_}Jkg=&$ 
z1dtL#*<8C8^PY~pE+-5~vzTBZU-X9++acF$?lGGp4H?U=r-ZDXqw3)B_xSdaclIBe zPx{fn49#7=%Ii)3T07418e9dEc!3PlqfIMR(r}D<<j;n zZ`87@4<#;^&#_%dP@gQ8GE12(NCz5PTVFdb8w&F-+9=Ecj}AHBw)X!Ww{iGTRP0m| zo4IS{Y-(ce&j-0?G9KL~?yG&1(_*5&bBKOxJEN1Ols!R6OuX|z?%g+Ek(n?miq0oq z{AF?)ksn)nRunGsJy)15E(H~ypCH(L_~}CjLdy&ttybNSXIEZBqCWS=)7$nMN;m=) zPIUO)?<18)@o3K5qyVQUgQQnFKzf@B^MP!Gc5~@zO+0bf8Y|^jZ1Fu)WlQl*H^8Qq zMu2>lJYKD|f^sZ9>!uBxg~DcgQg1vM4>$udzoCb=Ik#}h~hv*&`yNM=-Ci6t{TT+(WAVbuWv;EDm!A%#}o6mIEB zZy>a~>H*{_&)}P(tS_@%%drk$X9EVkIdzDR!J1I)BB-Nb-Df;=8}hVWklFqR~%ILxD?B99j zIRc2~V*h20x+}U|dE8$%`A=)~`yk=6e^~?U8%ksN(-;x|V8+B{#Eo;9b}dYMU7~Ki ye0H7A5o3@tQd#f+_eu6)U2XLLc&R~Ns)&gIVZexO?>aor8VG(31pxl;tUmxhZ76gA delta 2494 zcmb7`X*k=77RLV(B2l%3q=Iy;v6M(`6`>_FM8(#`QfnF&wJjFzZpg*lkH?l#Uq3fP6!J- zgct&V1Au0$-d82fhX9ZW0>FSUAOb*ue~qxS5C8@t00a`Dsq=?h3IRbt6y>1*(dD3^ ze>;(Y=Kr|l-{DNeuPsndiDgAb5)qOI<)l7 zqT7s6C#(Y!*dwK%_46^c+)SJ}!UE(4vIlv9^339^X_Gf~jmYajHHuZNBCKB%hXdIPHaQs%rV{4a%3OGYx(}2_Dc`i(!#P1HnWl zIhN4gRPZ*Y+dI5nY2ko=e8+Hnra$>Z%cAPWEO+Q|tG@Hw#oCt2fb|bIhHot!WN6jj zFX1ZmbebuTLF6(vV!kWdHL=|a{ghqbX&4c1#XAUt!M;zg)OK8k_svpG)g#zT;=I0@68Eb4e*MaGHVTsoh{4V6x5HzJ6?a!WKri%b=_39 zOoR67?ih%5Yo4u%{h*MNsz2n`|xp24bOg_HaXqek?x5RH+i(O}?ckNEg zw{(6XxqP)Sd;BhWT+JB$#{8{TBVh>aiB^}~Cj%XmNhM?5FoLR=q+0XjwULeM+W&&r}bOew%7KGe{;oJNVA$hv^VGU6=|E{$qKYX3ZwY6TViCa zD;sUXkT;F4boJ)Ojftw?vB?>a?l!h5A$aZ#3f(EzlR3hr7B4G+WZ+V8f{UN<)~kV) z1-Y%Osf@Dt&SOEk-lVy7S<^P13&BiomBU6eR~9?eT5OOZ%~5alRm(nDyV{t-KcG)m z5h*@4;hhkNY1F;7O!uH&9QCOB5>aB!-g(1Zn*MI6jL}tfqT?P_|p( z8-sx zV*W_-K#GRLc6^^!sZz3rgMUe>3EX{d=2b8C>_BAr=40>MJQ?r!U%&bz{JX8#dxUe6 zVP_H|az<7N{O@fm;7hpzJmle~|5(Vvx9IbH&mUs$DcIJptVZ1!bb*fI6B`a90^2Ay zr5(GK?`>w57gVdmnEd6tvZS8DBo1Bv*eGGi)zi|)LEcocOW%$*;pU>-Glaf>2I1v9 zq9^M`PBY0(mo22Xw=rVh7^$$U!-#oFi%J<6HXkR9tC`k~@TXZVk#SGGBYFn=dS$Xiu#A@itDZ%n0cm@?Hrp-Q+WKLay^eP_mL);t zFsn(rTv#{D{2j9^Y`r6`D7Cqn?=d`()B(dpV?^psfvkDubA!~QgARKJ+DUrMvp z;?}(6(iYP$oviM{R!SUtZC=fpeG*MvMj^FxUo-W{Z=zX!Y3?1ujxAO%yPjMkhKiZx zEQN{_XEk$MyunNP+EHR~6flV-G;{^VV? 
z1TJFJZ**87<=WdJVo-kQmAW8QN8S*4sp3I^GAhZ3vg3&4cS6#&g0-4?Q(Ql7;x%Cx zTl60ImhfB+EQ_Lm(pZk3X3rTz0cC=Tkx6az;hrTdMtz;NHFrSc-E*0%;KR43|wq4N}I z{-9_8m3=?32me~`=JeDG=>`8zReqITRmE-mL}%zu+s9>GlU)68AL*Otf-b>u=E;Ma zl(`^It#8H2XH}_%PLYPWo%$!Vp+Y+74c@95Mz)4KzHeM;LvCMkn3yv>lk{7XH)i=% zWX#13CwnuxLAu;nHs@(m(cn2E$<65+j>dX=k?pg%ZDJvj4<@KYV6Y0*v-e|1+D>e{ zg-`8F1*hOX^*)QqnoJ5uot8OyqwYuaq;xkbgtTv<-zk2UWufcq>DB+LH%gZFOY+q8 zgex?r*uPu#j=IJzXU(9oW|c{&_zqcOgNzR5VS_`$C-3vMcZL1%aqwBSQABfJa`uC8 zb++_5e)KcI2OQtP&7Zmq7E2a= z!J$!rAvU-E;`Ea%1%;!$vuSwiuj6A;e(ao=eVIPP;_c}TByGz6?l4cvDPfYLA(CZ< zyld-wDN*mFO~6%{TFpSMJByuYjYZ|P0rO>KY83|#)yug*b~5UdY#w0Wj}xT-a`D7WC68SirG>)3R@c@8#F?WuX0Xt42|D~4Z z4QPX;@m~<+PXf6YpokDh837O}lpHi5GJ+8p!H7YKBgK$hE(p1B8goMp^90;1fZmWS y+%$m~CTbF)VkkwF&fjAfH4V()`y?ZFVOL`@AE!v;$(w4Rmamqs6uq~w)C{zP?XK!bQ&dyBd zr)%p2Ubx#!+mwyv0QF_?+Op+i{+J=%~>8 zMYnY96?fnF)mfi?iRbJb{AJViM(yS2e|zuF+(*!2RE4(PI4!hy_Pm;mM_%hGs@}N! zT+gohy}f4!viskNbvSSEu)nM*aAEfQKb9CzleH($duI6Eo2O2_uy4+h z+s*|WD)t;|Us&|Rm1M;`JN6vkb-Lu1aOcgp=+`T}^Xfi2TpjC~7r*uJ#{+M-1W&#vp;ydxAStid{*#OOZlQJKfgGmzqNAk zldhaoEl!8**-E20aZqSnfu}2{dgvpvZrV%KwwY3QVSHtP7N?I)siTO6-l{6k_1CXb4y)hN%$=H<=n1 zbzD+bP*RQ!F-3KpCR&PW!}|jV65<0uE7jm9P+}TllPTeZX6+W{HA;b9QJTjh3URHe zL99B2I`Z#yK~Yu9V~)y@1vqLZG^Mqf;Nl7#4u`R$Zwx35Deu1M^|QO;^<5i^#8S> z|MTehwlh9gD8u*Nj9CZ;g8odlu5QJe%Hfmb-|Q^1WC72Dth5@CN06_2;}M3Z&(*wsh4}XswSx?4dmZUchfmL`uW!)|7_1 zw7m-Yl@_l}4#t6REnAfywreYG7(fJjpo~C=S6Q~k8W7-Xh)B&1k($=TW2=pX?LW^( zZ9ZkWQS1(*+75QL rUqA(uELbFEByA}0zBD7LZd7H&>J)oUu@fv}ENUELTQ+O6tYZWK{~Syj delta 453 zcmaFKeUqCdz%j^h@?J)9YZe9w5M>Y*5>?R=WfBz-WfNtSk>G&IAXs7y8pxuP`I*Go znHU+F7}TCjtdY=X65GKjc85{TO^t&n?Y7KPASW2I;&cK`MuG#z0y%+ovM!?{)Z(_u zk<7B{Fq>a6tF2-N+6_~UU_tcFWmW;&sLlrQn;ei=7gI(tj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(-h!`-u#JU(a>#!_ki_@% diff --git 
a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet index 57642d19925240e1a86b323222b420617a75e7d4..0e089dee3ccc91c9decf65449e3ae930f1d1717b 100644 GIT binary patch literal 1523 zcmcgs&2G~`5FR_J>!K9Ws%zO&4iPF=5B-UWib&PVCM{MVlq86>j`Kpc8ToVkIRUE3s(@ z;&_BI1rU7Dco^$sb8>9Kjw%N$)+<<#qXDs%Y%n%K0{xpj=0*PIRk^UctwEd9doL(o z1^x4~{(2edO&6cjauz);w-2h+x7v26MkE~S9dY8mc>X^p>;-4T14Yte+TjAOpT4j= zu<)rU-@}Te%gU#v4FD~sMrg74E>di-#A~ tc(3NZy0iy7RVG$-2x^A-H@Z$iT|L4Rsz)p9n$~ zNN})1Ss delta 476 zcmey&eUrOBz%j^BltolPRLMt_Nt8i|fk9)d{$B+SM+OEK1_%&k5ET+t(E&;eh_Z>Y z$w+X(WDqPd1`RBtK>c9TI6WB{n1CE6Mn)zEwI_@+qAZf83~U0D5)*rP^_j$WFpAw_ zRC80~Aja&;8=0jhA7>PoX2t0~*2xo@MKxja5*#oVcK5VRj%Sn=gt_7cv)U?VpgX{3 zFtScw&a48oO`VZx@_R-{c8CwQO+L%4!VA+egI(F*81uP7k($-R&V5Oyn zk02JJU?n2hh(#1P@kE3KyiBu~`RCtnX76&{+}_6e<`R3n$SVyF7zzNj)8wtHj{s5$ z9*o0h2uN=+MFuXinP=j>lLr+|1A!))Nfp#a+`44nd8`Q_+<{QYHVE@>pM*|GU88zI zvt#Hto;L4IuR82Ct~VUZ7k2d+U^fCE0+`5XvMro#T@@EF^XT>03pcF-$KR(l$Y?_QjEra#oC*n12BvGz zWLze7iRux}O=u26_r!38>)p1+R!F5vss8`@i6484#UGf;wC^i2?jt+P-U)sHPxe-< literal 1505 zcmcgsL2nXK5T0F@?KWu)X?&aAgaa|U>A|*aNsX9Z9<<${SOFtdE(DeZY6^=Bh4kp3 z@E3S6-c9-oj6cGI2T#Ty;Kjt5x2#m42QN%u=FRusoB2NGg%YpWOkoLjr^sXmn*dv% zc7LZd4xm724A>^SdtaPVnVK}TXlJ(4VQUGB_eHh!0)UK(B1sBZ-=!o|%zaQ3rm@~W z3x=b5%Xis^uIN{i{#CM$E$x4rm_!Yj4CoRV05DIny&}Pppu}5%c&G=|X&!&y$z%j^BltolQ)W`=&F$gg*Xl&L0t02(Gz`(`;0X$GzltENTR7FRWO;iAg z*<>U*U@{UMFtuU~8XPAUf#(~G2jp~60qE-5D;z-vG7|(CAru2PpBk&pvbIbN}dv8i#0o!hLY z$w+X(WDqPd1`Q6ji6M&YOpJ_73~Em%)<~!_iS1w%yThpFrpAF=&twN?6%QN^V8-bH zn2ZDmj0JK4^JGIt$%(lfP)D>)j$)Kmhq>Yfv)U?VpgUk15G;t+d5r4%*g}C3rz03A z=QD|N!{jA67$?SwL#^I6`2w?wAk69+>}t0_!3NgA$T(S=MP>3zreJm^Nf}8S(!xrW dkyKx)FsTu8jVi|q7BMC@4zVelJy_N;0ss{%Nvr?> diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet index 06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9..bf131f43cbf10180944b4906799b7d6288c54724 100644 GIT binary patch delta 585 zcmcb~{h7Nyz%j^Bltt7))XGPcO_V{1fk9)d{$B-&r3?%S%nU3H{0tDl#E|fSF@aH( zK~zLkO$Vq{0EpRSBsgFSBsiF%O2rs7u!sUB!6tJBGB7X!Ic$tf%qk3?YX2BzL|G(F z8Q26QB{VG$-2x^A-H@Z$iT`V0CgTtp9n$~ zNN})1SsY z$w+X(WDqPd1`RBtK>c9TI6WB{n1CE6Mn)zEwI_@+qAZf83~U0D5)*rP^_j$WFpAw_ zRC80~Aja&;8=0jhA7>PoX2t0~*2xo@MKxja5*#oVcK5VRj%Sn=gt_7cv)U?VpgX{3 zFtScw&a48oO`VZx@_R-{c8CwQO+L%4!VA+egI(m>l00$Gw|Fn&mB5!d%2k!`pB#MZqFOLbF6;GjE^zmb+hY|S^vySTFNV`XiDVqtjM2b7xVh8%y@~vdDSo6 zerMF3qxb%n@s;n!&E`u+E~2NO?1PpVI$gKd5D7ZRd-B{B3FB*DI7}{v2db>6^@A1M zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0mccu3lJd5>j+1w^dS%kl(~5*<87>|n7I zqt%AK}bR=(}@YNKY(z94)L zr+5$pq_n9}VLle3gNNLlz#Xw=nf8Ai27zW(6)1_&QGiUE}Ar~Vt0tMqdK delta 386 zcmcb~-N{}b;22~m$|A}jD&zws7=#!YG`8yhRbXIbU{EuY5oM87Wnhz(;7rRbDlUmn z$xjyL5n~XO5z|v+P~*@9$^$hqF)%RTGGY%S+zdXT?&A28{P?2O-2Adsh(X3+llmBm zFzNym)TqgOn56|^j#$J5(K@-EDWaZ1Qby8-6lbduav!Gi)Zp%rln_Pr(&V$uTyp3R c0|$>-h!`-u#JU(a>#!_ki_@% diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet index 57642d19925240e1a86b323222b420617a75e7d4..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 100644 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJRuAc$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 
zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| delta 386 zcmcb~-N{}b;22~m$|A}jD&zws7=#!YG`8yhRbXIbU{EuY5oM87Wnhz(;7rRbDlUmn z$xjyL5n~XO5z|v+P~*@9$^$hqF)%RTGGY%S+zdXT?&A28{P?2O-2Adsh(X3+llmBm zFzNym)TqgOn56|^j#$J5(K@-EDWaZ1Qby8-6lbduav!Gi)Zp%rln_Pr(&V$uTyp3R c0|$>-h!`-u#JU(a>#!_ki_@% diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet index e303e5ea14abd679e479c2fe54ba994402d60350..2838dd9727770220dd6b3f3ae9f0d4bdaefd8ec2 100644 GIT binary patch delta 341 zcmaFJ{fv8pHY3|a9Y+>6Mj%}%4J3NKtqnx2d_>tq8H5-ZG`8yhRghT9z>pxwz|6qP z00B%42@e<(7)2RGMMTwffJy~`m`z54LvS)bqo|f3RG9<^Gn56Sz$S48GB7X!Ic$tf z%qk3O3nm9L%L=lI-C`8`#;Df7qPB_|W(cF;x{3wu4dZ4x`#8cC{}cC&AQBWK^3xjVX|kZSo5yIgmd+m8?&rq1OPv?J)Hmm delta 484 zcmaFH{g8WtHY3YK9Y+=xMj%}%4J3NK%@stAd_-A98H5-ZG`8yhRS;-oU|?o|09Gh1 z${;Evs-go_CLqct$|fVh!8*BtQ8XMTFTnxR0n`9Ch0~LPfeFX~TE)zyHjPzAltt2% zflWYCf-faMxj4QgKfWk6H@_@Zlt+w3>;$9O8%DJs%xbHcVP-S3PUdD&nY@cpJQ8FM z2w=JbCL_TCV~H_nz}W~Mzs~}YOg>(gPDY2p_w^( t0i(?1*-U|qER)|d$$kWQS1(*+75QL rUqA(uELbFEByA}0zBD7LZd7H&>J)oUu@fv}ENUELTQ+O6tYZWK2hvO; delta 453 zcmaFKeUqCdz%j^h@?J)9YgPsb5M>Y*5>?R=WfBz-WfNtSk>G&IAXs7y8pxuP`I*Go znHU+F7}TCjtdY=X65GKjc85{TO^t&n?Y7KPASW2I;dBB_MuG#z0y%+gvM!?{)Z(_u zk<7B{Fq>a6tF2-N+6_~UU_tcFWmW;&sLlrQn;ei=7gI(kWQS1(*+75QL rUqA(uELbFEByA}0zBD7LZd7H&>J)oUu@fv}ENUELTQ+O6tYZWK{~Syj delta 453 zcmaFKeUqCdz%j^h@?J)9YZe9w5M>Y*5>?R=WfBz-WfNtSk>G&IAXs7y8pxuP`I*Go znHU+F7}TCjtdY=X65GKjc85{TO^t&n?Y7KPASW2I;&cK`MuG#z0y%+ovM!?{)Z(_u zk<7B{Fq>a6tF2-N+6_~UU_tcFWmW;&sLlrQn;ei=7gI(kWQS1(*+75QL rUqA(uELbFEByA}0zBD7LZd7H&>J)oUu@fv}ENUELTQ+O6tYZWK0*6c< delta 453 zcmaFKeUqCdz%j^h@?J)9YZe9w5M>Y*5>?R=WfBz-WfNtSk>G&IAXs7y8pxuP`I*Go znHU+F7}TCjtdY=X65GKjc85{TO^t&n?Y7KPASW2I;dBB_MuG#z0y%+gvM!?{)Z(_u 
zk<7B{Fq>a6tF2-N+6_~UU_tcFWmW;&sLlrQn;ei=7gI(tj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(-h!`-u#JU(a>#!_ki_@% diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet index a4ad5fbf82bf959f34c9a25974d34cb91aeff037..8e1fe121e25cb51ec8c26b1aea7ee00463f9400b 100644 GIT binary patch delta 441 zcmcb~{g8WtHY3YK9Y+=xMj%}%&B!wOKa)p*D2u3qsF4qlVi00r(AcW~S3#hWfkBV~ z0+^w+D1)ewsEQ6ynSdypD4UD~2TX+o2TY9^g9eA-WLZW}Fw5J2qLCNyU&w zYzL#*9Y(bk>}sz-ZbniIwFTV-U1q7tJxsyCzMj%}%4J3NKC+}bsk!EIq08s`}AyE|_Q6^CVQ8rOF83_*N z$^49>jLef&85P+<+8ET@CI>Rh3Nnf9U=+K%K)i& z>%Wrm*8pMgB<5QJWSQJrAyE=15|I+B9IU8vkgGsyMidM1s*44H5IRvo5aarHAwk00 zCP;BjU2Gq^!%@BESR|%Il~025P0*{l{69^!6~=&s0970f0Qx0zzley>h^0OPyjlVg zA|SzapIP{62t)z5DI|4~-;%$s%A)_p_)SM~k@LHD5bzVlH@l#}kDN2Sf!v1mkyCf= zo-?*N&7XquThQNz^$#KvovwRDx0yu$H9^5gKRv-Ry=Umj(TF0clPbzoO+Vnp$xaRB32+?OuaOIN;l%3unjk|N7Hja5g+pqFO@R zn!{0?-&xS-Jr(6NqC|CB`5d_kpe588tyCWG-}8o~ehsZNna~`iw2Uv4hm>NVLw70d z(0qr-a+vUledQ@FFpGX*(PK;bXDt4NDPAgHV5QN?(Jl`50~csYnO=#*|Vyk=#5klAau%xv50RY%QC z?FjMR_3=>3k>aLYnej0xwdgi0j^K8fW#ZR=;5l3~m1kYe+Q91XrJrwLp9zb0*nWg! 
zv3kCEctK4L8t%(ud$ei|2CEI%btcKN?G6u}e$u$JW~`;PtptGgzz*JJCA{_j0OaiY Ae*gdg literal 1510 zcmcgs&uUQX@?-18p~Gt$>j#7k(@Y)D#vM3hB*1 z#fujY9!yO98$5Y79(wRkFy2gjGs_lm@!*9QcyHe~Z{B?0zJUTOnKVk{^zI3jDXal( ze%k$=R2V=Me4{|u=*AW$SVbsFXiBg$eNtg_0gC%Yx0O7Aj0uq>0>%$1L8JN}khrd` zw9dW3u-bHOx~dZOv!s5PjAKLjpF{L56aZ8Pba4y-n5XFe39`Krq`d>EMIZ*m7lnLk z*HTTwJ?qq%6&Dm}N@P;X#f4E^6yk0d195U>1B|z^sk%F;Yfv4!RnO_VBS%R5CaJ$9 z<4x3fA3>4fw*NS6p~yv33rL5j7Ocz*3!{Os`Kf(Xrn>07uIh!;YY#g!6a09~_{lFy zxH*>>Mq!aW{bUa`GO#;Nw@wKT^`1O)U;O;9H|&Fp>48f0gnqDu>$ki+U@Uto>X)cW z3`PAATL;h++A1rRo}TUdL(;D0U2bERWsDW^%k42^2(;NTV{Mje3oMHTPxw=gu{@8M z2N5&AlzYzcXWWTaxjb);NtSg4=yJ~wnpl~~ij{d~o0(L@%v@E@%u%yjZZ)j(WhGq_ z^__y?hm}YN?JCHNdBInZxS`J2A?ItAMlaK+t)`V}+1>K6k*-`IzlRe(j0vW=DO==v zEXHhh$eSa%BObX;`akj$;knE4Q>{wh?wzJy+{bgq9BuRcNX6#;Cg$@6E!nSouP>e9 mw%zY<*FDc2Cr6GqICpzV^TCd}lhO|o0R9J#@TXS5fA~LmVfT;# diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet index 57642d19925240e1a86b323222b420617a75e7d4..3d1f158e9e79bac193f88f94d2b548b79827778b 100644 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJS(ac$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1h@mC2oUSy{w9@^A|VnfO^GT~Plb&+S=t|TTP*^}6v&b!gYiR3iMYNCiKMQrw$A*) zu-5b(v8KxE7fJml8ApclKST6QA^<`LQb{rZ3kW4cKD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t uf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEVGN)|KlR_NHuSUr delta 386 zcmcb~-N{}b;22~m$|A}jD&zws7=#!YG`8yhRbXIbU{EuY5oM87Wnhz(;7rRbDlUmn 
z$xjyL5n~XO5z|v+P~*@9$^$hqF)%RTGGY%S+zdXT?&A28{P?2O-2Adsh(X3+llmBm zFzNym)TqgOn56|^j#$J5(K@-EDWaZ1Qby8-6lbduav!Gi)Zp%rln_Pr(&V$uTyp3R c0|$>-h!`-u#JU(a>#!_ki_@% diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet index 57642d19925240e1a86b323222b420617a75e7d4..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 100644 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJRuAc$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| delta 386 zcmcb~-N{}b;22~m$|A}jD&zws7=#!YG`8yhRbXIbU{EuY5oM87Wnhz(;7rRbDlUmn z$xjyL5n~XO5z|v+P~*@9$^$hqF)%RTGGY%S+zdXT?&A28{P?2O-2Adsh(X3+llmBm zFzNym)TqgOn56|^j#$J5(K@-EDWaZ1Qby8-6lbduav!Gi)Zp%rln_Pr(&V$uTyp3R c0|$>-h!`-u#JU(a>#!_ki_@% diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json index 26d0c0905..c08326355 100644 --- a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json +++ b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -5,8 +5,8 @@ "job name": "cluster", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 08:17:19", - "end_time": "2024-10-18 08:17:19", + 
"start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", "status": "success" }, "code": { @@ -15,7 +15,7 @@ "path": "path" }, "job_input_params": { - "jaccard_similarity_threshold": 0.0, + "jaccard_similarity_threshold": 0.7, "num_bands": 14, "num_segments": 2, "checkpointing": false, @@ -25,34 +25,34 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 71.6, + "cpus": 91.7, "gpus": 0, - "memory": 24.71, + "memory": 24.01, "object_store": 0, "execution time, min": 0.001 }, "job_output_stats": { "result_files": 28, - "result_size": 33665, - "processing_time": 0.052, + "result_size": 38040, + "processing_time": 0.061, "input_files": 28, - "input_bytes": 78286, - "input_rows": 70, + "input_bytes": 115324, + "input_rows": 168, "consolidated_files": 28, - "consolidated_bytes": 33600, - "consolidated_rows": 70, - "groupby_clusters": 14, - "cluster_duplicate_docs": 33, - "jaccard_clusters": 14, - "jaccard_duplicate_docs": 19, - "num_duplicate_documents": 19 + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 }, "source": { - "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_sec2/bands", + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", "type": "path" }, "target": { - "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_sec2/docs_to_remove", + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", "type": "path" } } diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..d67b5bcf87dc622ba80e30d210bc1b9b4429fbbd GIT binary patch literal 14933 zcmeHOYgiLkw+@Ji5meA1DB=bYY@39OqT-cs6OfyLPz9`$$&idpX5!3*K(&@CUfOCE zue2&v@Jef~TCG}ZRn&T`7p#g}QB*#w^@dv89<|@M_9P&BJkN8!?>W!;mHZ%?Jv)1^ zz1I7_YwgJ z@p5UtC-t8B=<&s=EL1E3;f91H(Rh zUiYNzNoFarKGHSEA=x3v!Rx~H`w2#fw)o?VFtZTUGy? zN!4WPt}&-gmlmCm7mi68c;xD!4{QDKq4b>d0f*eTmHEaFn(t+#Fs zzj^6w;i&!JqncS?9ADS>Noi`2sg>;A;ya_?if;6sn)ozg$^78z<%5SWEU*1+@~xds zUmWXF%dBildvE6KTHldxOiliL^O$ZwZrtcn=kD%a78CPlxYzpu35wghoh#~|99}S> z*RI{e^6IN@4)L+fcbGnX#JHVB+WG1SPQ7c|&g`J*5twE$eYXEW=8bRGMIT(8w`1Dz zZN`E@R}$}J%70b&cfDFVk19&MA*VevP#bi&mMDxdZ8)D z=yreTy7LuP5AyPxes*)~?leKOBl(}_ruz7H?=(+1ryiHbX6J+*F0buQog@M$abq8U z_jp%L-c#CbTwcYh>Zv||pL#s~M8JTacVq9pu{k2%DW_+jPJdK%v;>B(*y6%|weIqp zsYPQlYLu7Pe!j|e@coEg3g5rq_IP;rQ2^i0yC~}Vh=A`NCtdrm+m#cxnlJK0g1-Fl z+~gHY%PR~uiHk?96L;i1_1(Jmn_l5>mQFZOnB05TtV7XSf9t0mQy))IvgiU{~EcM)QT65yO^Q|y_LH*k$+a6Mt z2Q#YqDZL~8wf@kkvSU1Zu5Z?3 z=8NR%YcfB+a zYt#?URZXY*Pbu$rVB_{_yO;Mb9aKn_udUcrJ#K2*Wd6vTJN7Qo_uVo7p2^kyjlk2a z@z%YILQLX?Lf_cy4er0!S3O;xAZ9K7{>LW6mGT2F?7It!m_6TIiN5(HqV2R!-ICTX z>h(tBPq%-W>UOB6msf{O-QEYW?~m3r#dKM?G5uNBcXy3E8>=3cernByE;;Ulx;rlm z9)HF=^~C7do?>F#xwFDz$38N@d(Q8tp!RpD@`II@+%w(w{5mgU&7y0x{9Z(tsS2+# zyMB9k%e1dP!-r1x`?lX6&Aw|##{a?p>^n1Pj@OJ)4`Onh_KfPeW9!yy0KPj%0{xx2~e_ISK}^XJo_=e=>7UcXJOK|y=Q_PgWzcewuP z@u+unCCsPDYt z)~KF_U&bFOry6=5ubk9j+@Rp&qgHC#$U;gCjp)>SzEsG)yRN4@MCR_VTE=$JbVwbx zwcUkdY3JIwHHweCR?6Bt`o_$jU3Ast^I^wE!>qw|Wj@~>pR_v8=fUO`yUZctt-Yfg z%bpHJF8tNV{bw%6|GGOxXZrnA!_1jgN2mX><$T5XlXo2wt7x(R&|P=(5>FQmsvcVS z>B8>Cmp%vm)5`t(ib_D}g}OZg^M$&K|5<>`s;+WBSPW!}lPa~wjVqGl7WoX3Qf z50@`;IeRK;K@|1Rh=&7Ig~b=Al|LJi*1z8%s>Uf}p7`a0gsxkDUbLqlda5#g@?`G8 zGOvYonv#-FS9QGnC)pIbxP#Y0yV~mqy(TpZ0up)!AFgKTNhvT!M~n{+38Sv}ZxM?}QuM{6otg&HMP|mXPhw zG`0IGKl9yuefZHG`&Qp?)4AKHe(QfatGF^acv`wg&Y8PApLvYvVeD`(FOIL7@ZO^n z3nN17+mf==*GIcJ8tNARnB99vRI%UIDE$pTw?L-u57f=PpocA@ys0G@^1SNPINf_(N{;m%ssGX|F@$Y?~FSdlKxi-f3vM;kkTU{ 
zzWr6du3y#pxE;|qg?2dG$*ZixmD6kX2J7#R@|0CNIs_P(kNC68CrbH4&)$c&$&xST zG9Pngzh9{we(nI77A%@}&p2fY4CE#T3D-I7WxkDUqNVxUFGH1IG&}o8xp0 z!RlaYhCzvDK}4x$QJ{4c8p-4KbizdIgduX6;Kk(894peSnG(Q4o)9TLO`;UZE`mX^ zMJ9q3u#IUjvLKJqWW}bWrKV>pQ37Wnl-NXdi+w0KExp^5b(|@aG(WNr1d5?ZDLh<0(n|hg2_uEDBLx#h z(gY)dwI&EK#VCS7Z6if}%funRh zfig%=6rsh)!9tAS1;l{`yr6y&5%3bh2pqBiJeXn?Pw8P?2uQOl1ha|aY48gPIO2=| zGlAj@X%Z%)S&}hJ))E-IA_dD4Rr%N!(m?Qfc!RMdUvNev1Qqb5M?3}VW|4_wXtG!d ztA_5_JIKO=6LAl`0I69p(AE~)l!98GiLn8(#IV7A$rjX#9MYO;;Dgx&FvX5Gg1M|{ zHX;ZdzRo9%G%OJU4eM(K3v8+gO>8sT^*m=04G<`a2gR+tWkV~*NIuO2v9f6aOb`H@ zt#Jh;9Vjh>0B^y4pd!seq&Sgywcj9Xz-SQ}AkKJZNe8TK02&HNOBLf0@SMrC&H_HB zz<(I%CLX-rIu4l7V>!M+i8KtMC5(h*L}~#;7&s%LMjD-t#!v=b0M}!m*xVpZZoW3`qlA}N#ANQf@eihMvV2@XmGplb#SM3~}109i>~Iq93gh?g}GgB-%oii9A7 zthIy+gWN1Fxfw@N1n}4#sJ4~utyEd*j|76&p+wq<3SsG3+m*mz3X3afI{>X&>eR?T z4iowX5V16vfJ_3i5KIBw;5oD2AWfv>K)02M1RqEo0X~^oo-QO9fMxTt5mkh#NUZ;)_A(UIj0(uzhHUjT*E(fK$7 zyo5$jFJKx3(PkUK$4CPmKo}v9ON@OP)Hg`nYpWgah-wd!Uw z^&ksFn$yw{=$A5+(JTPrrR7N0;{pQ$pqKKv3A%HTQ^2H`B4o4K0?5RKf*gW9g(#zm z;ff)@V7!Nhhd=-Y4)Q;U^bjl0iHVT$LFcRq#|XfJ5wu{nnPF5S4ak4t7m0SDOIogw zQW*)pfPxfA=y5v7NCr}XplG2u%EzI_Igx;*DS&<#!U`dj`A}d;T8=Cr`G{f*X`Yj0 zP>te2z9c^5fLjFuvn?4C36xe~egjHp7{lcwb5%4IX_yRi~3}tX}W5HR6fC^Nq29dQIl#(QhRl%rO zEN}%t+7gig=#Hra17r6>orUW)h!f&i28t@EB_OFo6V?TgdLt;k8q`TGj|SI@P=28d zBg`o$ZI_WeWio?F5D{GJv$)EF{_-puax$Po4-96cas}2k*dU6g!NI{6i$!hXfjd&2 zkNXGX3#h`N4iqRM7-j}+n0b}1Wdos->PHpkg`kp3S}TQNVigU&Z5s$xzmSlKegi`X z4Nx0IBO@PYCJQJYb`g-;Sd>IVZUYB!CaKmPjDiwGpap?=S|}*S=$Z&cZ;)4ATZtVb zq1b{vnMCJP@?@e2C7JaQJ)V+}<;)OKHV9WQz)k=bpk2#BeL^7Hm!vjW83pqK{^KB} za~5QvplpVS0m=l@n$sqNC&5Rw0EkGj6*4FjNP|ltL4-OS0s}g27Gam8M44F#unh%> zH5=fOShEQ914md>r3m@?g{@d_v3{8-qCm<-SPvx0ra@Zt1`&cU2eH&yc?0Pm6BhFm zKyJZ!ff~(7*-9h^ixEQ?KnL4G)$)GfgMoM!lnsl6HAEA<7UNSQ5>(aDz@m79H$sU9 zqet;jw4f=N2T)O@Q9NTbi=ct9w~3Lzt`R4F0-Os;T!pg?PA#xz%FJ^Nr3BY8dvxYkaMG+1dDuR&$dH0FksLNDg4W)T%^~ra=~= zz?YIK#1)UrV19}(f`Szq2dUX$5*lo=SHBRCe}AX^A0b=*`ritUYkd^1w%-a|a7_qg 
z6{MdD>fZDG=$*j*y#x2R@v-mit%7CW+e?VH@9ph-d;8uV^1M~`?R$H0gne&s-`hhC zWZ&E4=SB9ty?t-r^3(^O9@_Wz_Psqkv9j;&TQ>amz5R%5gxM!WG(Dw*7r%Nj~tDz|+aw$6NXTbc!(0{s6*$xWIn6 z;H95%?1u~NhYRe73+#srS{+@pA1<&TE|AVuzH*ihKY)yGIqzpbTwp(3U_V@7KU{zh z%-IhYv^;CDA1=Trt1|v~$MFBp9zb}meBaY4(8cq~gsw7~w~v>rCOthhd$9F+gjb1U zFn$fxYv45+|AE&ijmDyh(re;08oef3x}}MV(rA+L&1j7#0l&j_^E+JOk5MnKQ(|Dy zLC_XECKmqUFTqRtD1Cx)q5ni}Sa`^|?C=oMBDKQ((Rf^GaF`s%8>@-3Sm%{)O5LD~ zLHfYix>{Q`niTwjfbrfAV+!65ODWXGXNg+Le!V6}8jrM@f_uUY@RE5UI5#!POo=Me nWKt1u%Fu$~jFI$D2cZMhq3RHCB|MM8k7Rywm&wxLA4>lPWZ^x$ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..267e78385aa2df9afa5ad324b6090abc65a6912d GIT binary patch literal 3068 zcmeHJU2GIp6u!GGYZrVOL`@AE!v;$(w4Rmamqs6uq~w)C{zP?XK!bQ&dyBd zr)%p2Ubx#!+mwyv0QF_?+Op+i{+J=%~>8 zMYnY96?fnF)mfi?iRbJb{AJViM(yS2e|zuF+(*!2RE4(PI4!hy_Pm;mM_%hGs@}N! 
zT+gohy}f4!viskNbvSSEu)nM*aAEfQKb9CzleH($duI6Eo2O2_uy4+h z+s*|WD)t;|Us&|Rm1M;`JN6vkb-Lu1aOcgp=+`T}^Xfi2TpjC~7r*uJ#{+M-1W&#vp;ydxAStid{*#OOZlQJKfgGmzqNAk zldhaoEl!8**-E20aZqSnfu}2{dgvpvZrV%KwwY3QVSHtP7N?I)siTO6-l{6k_1CXb4y)hN%$=H<=n1 zbzD+bP*RQ!F-3KpCR&PW!}|jV65<0uE7jm9P+}TllPTeZX6+W{HA;b9QJTjh3URHe zL99B2I`Z#yK~Yu9V~)y@1vqLZG^Mqf;Nl7#4u`R$Zwx35Deu1M^|QO;^<5i^#8S> z|MTehwlh9gD8u*Nj9CZ;g8odlu5QJe%Hfmb-|Q^1WC72Dth5@CN06_2;}M3Z&(*wsh4}XswSx?4dmZUchfmL`uW!)|7_1 zw7m-Yl@_l}4#t6REnAfywreYG7(fJjpo~C=S6Q~k8W7-Xh)B&1k($=TW2=pX?LW^( zZ9Z>@5MKEvI6chB8(N77(UfyU^qKic&R`FrGL;QYNqyHfWy(|`{Yw^Up+x);K*WFul63d>VLgZj zjp-GTiLND2{NC_*RJdyJ(H%xGx%jPhl4_IbCsd($pC z<92E5qT5<29^po)hq^d#66uBfP$}vJ%^Y-st7~_Z?F|OmP7wGLbLoEawWj QCEYOa<2(ug4g4j)0QCQOxBvhE literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 0000000000000000000000000000000000000000..34b15a76c75091485eac702e1c9f8c6b28c3b30c GIT binary patch literal 663 zcmcgqJ4?e*6h296478T3ctZjivKTt_VPAqy-js@us%=nOT}oR+6s@UgZNbUW!O_*f zATI7s{u}>*Ab4(K7jbd%8_svn^E-#6&YV1r(OZGmYLrk|07$*$KF#%afM*G2K!!NX zK@teeLxkA_5hQ^ERcR`XJ%q|sHZ*0L*A(fyQiaqnQA~ys@go2c9U@55-P?!tAR08L zmq8}FmOSx$!}iE;2X5aVyELH^^+{CkMJuqBzi8=cybh=cXb2JjYNX4c$-~~M+wbs9 zY)Skn2rAJH-T1Ga_-({e@V-R=-Ov)uarT>Q(TduukmD$M4AI~QvBTJe*&f?q%wv@> z#cZ3gU7oBkw#6C!DWg~Ap+j!)h3xVEO+E+F;KdQDm9R1z8a|_V3CTY17WQshMQ7Y9 zZeMhptA%6S2=!1G=S?ELus&3ZIzclBo#5))9c6lhL8cP~{=^)+!SKw#H1jLje0Etk N4E#8c0ze&q$uINqc(?!n literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ 
b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json new file mode 100644 index 000000000..bf26b5228 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 11:20:38", + "end_time": "2024-10-18 11:20:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": 
"docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 136.2, + "gpus": 0, + "memory": 23.89, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.021, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50 GIT binary patch literal 3984 zcmc&%XH-r&>{ns3qCWSl>YEfiY;u0EYF)g*Cg|rUM zp*e<5V8I)dNqK|X03aBJnN^4|Q9#Jsks(dW9D$=Flk%gZl4It?N2SNpSR#}(B1tQf z$IE$tK9>Ya!~>Ct=d8nMv7|e&v8EDQ z|F^8Ayg`YSNBpWIVJi9KEhE&j&{Tp6BJV7g_mJ;Ou@J?i%*#xQP0pAT6O|DqN?;BN zF>~gnBxWWhYl_=|B`=Zu8X-gszsT=vGQob%EB8+bGY?0g!fVI+c`f1aNk1{`(j&s<%y90=+$>yWbCau% zbfW6H-%*LS;lz1Jhe;*#;N!>M)X?RN!RoV<9Nn7;J^z^*wy6DJ`@s}Kf3XmP`_>|} zq851M*Mu^L@=?qgXX>_%E!rI61pGohpe!P>xl1)Q?_31sdr}Jd)gdU~ih|MC^r?dv z-lD_9E)Z6Sw-8HidBNvtGjN@s7Yf%a5%kelsDB>!r=qG7bt(=Y75gm=L!V@IVq^IT z?i|@{LEjY_p>SAA9!iPC*)P1%)z{s+73SW=g~{GDzqmFkdcsP2{phL~vrNvJhJDO9YeH4G$ZHD(qT&Yy8Px02~W}VP2 zN-9xW0cKWhOYy@6lb~@w`!8k5Ih8N^q&#i5s$Ig-(*zXGG0| 
zG3e##BvST8Cj?H(h3Surh>Ks>qPD0P5El=q-cp1b+QabO^>c@~4c&yxAJ}8TPnRL~ z)fM#e-a-=ZUn1*vxlC>e97$;v84}TbF&M25;>Gn?!jnf00_#o;X99O|^P-}SD<_+S z!~Jh5J%_ERoNJE_3^o(o)lb2D*K)ATJ$B+qPVXy73qAsjY2<<>^Kp9S3)w{j}ReYljUdow!7~`kIkPz95O$ z6Di$}@+thasjE?xlFPT8`W;&KZ=ZPw3_1Ba-LRA5LvK z*^By$Q^_PfVQL1-r*CGM0g?}+O-d&WqKP3 z*JB6^zL8S~=d1D5w!>lwZ4x;CDCW$oR-wAl{b+sTTHd=k>UxF9w2>rB@R z=~X-{x?b?x-$;^ey_94JiJ>x%sWGz=Z~leQddj#kvhX^wc!f$&7}rQ(t4C?fM<-_ zGRCJ!ZTph(cX1En5A^jab=5kaTNqarjLs^1HITv`lscdfRPFEqkIdkA{z^QuF3;;J1@hu?=_2{{i@x{Lf`9_AxCGO%Ax&kSVf#j>}o9Z zyE%6Dsipl`qPk@p-Lk>RZM&+&e*WcMQ&vj+Z;xAW{YYffcK1ryPddEfG-`7t_1Mj-#P$FaZ;=r$zt3^gGtEAKJ zxv!(GXEzs5``UAp(G)c)3%c)}Z{}a>UJ~@ccl#LY$kVH2!4FkC9Fw}6OM)K-lulXp zhSZd)w%erU$g*)2O}JPA1#c}h(ggnScL4R5mX57}7$ zvzZ}{2N03>`!Ir^l#tA#qu6vZF(EyJ8=Dx*LJS5^)C(l-BPC@f@c!%wHu^1=>w-|gBjo2{+=o#2peDofwlk#q91{FZc8`NR((I0Oa2frdctU-DQ zM*kH&(tZb)5@VC&G6v9JFOob&@&XHay*UGr{^t7T2KZ^y4P-ZAo{f~k0-vmbgFT*SUyCzApZnQ)w=Sj|w5(CH#r9wqe(A3aL1A`?k zBW+l6R45e+#Wd#2lAl76tx(1(JQa#Kg?G)5#Y`8ZrSYW!7hwzg+j$H%5av|$Q8CMzcAW9z%Dv8Bs0cy;==H-pm&qC%-nGqH0m=YcqkP;E5DV>QoZJ<8i>F?*mG>f6%<-@j8nHu92lEDfTJ}OpU zqm(@uf0~b`-%PU^8f?8n;m#ht64@T4MtDwK5N*GR>B%@&iD_~++kYm-fk$EPlp0GW zbEY~*9;9gUDc#>x(6Gu}gYtQ&+OxdRy&;+SzYE;^X unXwtx^HNi-qten+vQ4vM)6x@Cl1=TL#@dawm8lE?-hJRC2@uHqFaB>f=8;VR literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet deleted file mode 100644 index a9ea0990f6152fddc195c2471bf0d3bfdbfc49ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2753 zcmc&$d2mxj82@(P(X_O*w0YsBgi@dh1$v}~mV&YHQhJBdgB~fAk~Xm|q{$;mX$#6J zGvFWy!U*U<9T1@+BZxE577Hr3gNllb!+6YiB7!q8%Hg&9LVybV;fy2SB)j|VcYnM4 zONyn^6oD0Dg*XeDAQB+LbYqLiKmS!w^|_$Tj;vGb;sp>Q1x(U`!70?k? 
z0WAOsj>Tl;5n_158%QUx+BD!Jv|@0b&1J7`w0Ro^tl>2sNV5|e@(ikndD14(Tm%ec zh#(UH;f+F)g@mUmiYx=c$ixx>CC)!lZM7iY%~MnbB%+GBt)n5i-@i+QA#qCX7&CET z=w$CCA0K9S*S9n~T)tYn&1Va1B8w(_ZN1ys((F>o`5b93Aj3w+a0cuCMaTyhxl<+V zQ(1&(uu|uG_F5FqSyKd_J=5_1#=Y#SU4O!Fd&Z)-6F2r#4J(8lLwB&VqPId^jh-#u zb`5pqpN8`pYuV1)v;9UTpJATO9|w%1A^oy9Aw2sQd|kT-7?Y843zI2=*Tj9EY(#6S(;;|6 z6mao1ctqwtuKsv6SI{HEx_uRBT>=LqKZ)jco%{>EHt-~q@Y-|Cg7dj>&{K}P3v&^< z>E}72i%AeVxVw*1Id+ky2Sn%+jtwUh^XG>M4d#AnC+iN5Rh0w649oFBs^E}*p<(^Q zBL+l9sRhkIQ5!u-H#lZU?9gGu^aV6mm6e3R?Q9d za5C%lWy+v#nMZuL3UWkS`}}bW%rd}}oD5G1a>7F5U=eY!xPFNvOopYKiAl+=-Hi0EDtH<}CwO(i09W{vn zE~H_xZr)d zDS1TT{=eMVM|P7myQ4I`L8a*&pa#(D#89cUbZX@U|HTqJ*knp+l39|Z67r=~C`ql7 zxk1X2qy{OgLD@4Cg(-BEB}w^oCVXHfoU-&%57dz~b^hq&o{$e!6}t4woStBTy!RZ8h3UWvcH zf@yv8ADjX4#%w;iCd6-$L~jrPe_Fl6z&#?P)x zZ6iGyOO;5ITWSAEivB|(duGLwWKOCha;s!+^`DcHEVUv!lF~`Pky508Un$+%oYO+$ zD0NbM;-8q(qw#Bk7%SU6b6Xs~#Clmyw0S)4*4TLtkGIL~icLvRN=Y(mEjoZZALt?f K{))+;*S`VMIFrHv diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4 GIT binary patch literal 4763 zcmc&&d036<`@f&(ec!XEsFQP&igZN#YK!TV_K<89(P>Z1NsDX^MRiKH2q6?j$Rssi zDpYnuri3Cf_GQL0Wl-VwzNcx1S^oH4*YEm$pX;3GeU|&aKlkT;-gEVQ@{)R62uY!r zaJr|ECxih2hC3F2a%Ycj(%o2op`t81$mz2car&$PfNzNDUVsQfVT~mZk+zT&+W<>VEaC>rVuI&| z%i_a@SW}?cj5OPjWRZmPZ?VM2bh04CKzC~N1Av|f2wgqtB$(vQNd#O3EH0Y_z6u8f zDjd6hv=!4-dmoJJS80nleLEr!`rSs8rtTGn5J?LOS;HEs2HjQ%Yzc5Lt+x< z1j`Vtc21QDYTpj1>*uEcyL_+oofx?+1U{gG*SZdig&A<<9$>$Mgr>v8fID0d)HUp(alRU?^RtD>EJv8sG9TtF(gkj=I^3LG z3VZq`fsR=@x>Bl*^iLcBjVxEtKXC}olJ?(+~-T;{{A{_0R)Hw4K1iC|vQ0cmdfF!^u+h!0r6ZfpqogDN2T^I>=dg39a>AjxLIp{+8w zcvlt9U!&mKPIKg5QUJr^e}L|#&Y=I)44MOT;O?gwSd=gewx;cY&kN7N7MK07%Y6de 
z--c25@+V-V7z{&8E#dW>R4BeV9EQaVK%KjPLt4R)P=XiHuY~^!?dO-1zVlqH$7Yy`oCA{`@qL$Gc$R{4B(IVBmXpP$lu(aQ^fUY6F^ZHiaV+)bZydk$SI$RDs?*` zR`ob4O`8Zh6Fs2j>R`fPV}`@9R|YAoRACcfs?2pj|XgGb!~`;$7fSvV6+@> z-!mKUtgt}U!>fpri=WY)%TI_2r-rcg(-z``11}Qa`rA^*2iFr(=Dx%Qh=;LxL9qOp zD>Y!nVlX~d!|Hz%p^o1O09g^3e>0xYS}cHq|Adk~!UpO;M<-FTBT;IF2TL_PB7M-l zg>6_zGL*eD!VvW|hzY6HGV*}jA1A$YLD#z5`@c#(&rNR(;C_dGQQ0);EZhBCRl&B; z_fUVsJY=D0y!}8X9a)ST6xJ$gPewx07;n~s z`_9N`XD*tzYCEiL)rI1<#-y&-5RTb~bv*v*YCPZX4Z7z(iZkra6BN!f=3OqgWzSrn zqQHjk0&(b8-_=J5mMJHTmEs)gX0MZXqss zWQloiuRzF$2K4@ZB#DcbirXh#A=i89Q~WL3L|{)aM(N(1u%{#8#gjVT=>g~Al-CZn zLKdjAf821ediV>aZnXgwur2YRLF)*1`YUkVu>wY>9b|bX%;Cg#e9QHEK8Am0=<U-s*sUjswc;JpG3i#i>r4-k~35BO0`%=n&5PWlv44ya;x4RgkifzuSCR0;L zPR9*`)2hW~-#<#+EN#G=N93Hr`-?#PR~7Ipyi8nbHDkpSmnnO91M<*Sk}&Vsq}5t5 zo|_q)j?NNQ?2%JmqU0C1Ab*oHF{(#SYmwr(7%+N`Ex*H}sQpJ57T zo0716>O3^bEs-a=eFq-uEF2y?tpX=xl(IuN>rzgkE?BluMjl=z6kOFG&z>0KMqcL@ zv1329q8rCoVC^ln8n(@AU_n!vp1Xk+66KvEPEIc(e6yaS>4Qnmgpa<|`kLpcXKO4O zt*!&%j{8vLlp{pg86dViFoi2i^0ALai{Sf#ahz}bw2*HU7n)Q{P_lY9VSgB5-p>+h z(1miGEPggXVe**USRR6d$~{r_h>vJ>eI_Ss`hL9dbR_F8rG*1|`oxvA1h|oZ05wJg zg3VP=^48;#_^$04bWNO!FQtn4Sz2}!Z?}T2m*d~B{$VKMYgD0Bl?b?2aG%)B(p5>* z&_cNh8;NCm@1RTZTUbVw-e4c_fK?!<;O~8u2gW6%__cQ0V8xq>p1t`MM|bfdX@ZL8 zvQ0wN`tAk(JU5*?sa-(4IUkD#-`kFKr#awOe+junV-jnRh7$_&{+6Pz7PSK5dV#U9 zmiOnSh-6AHBAGv0blc6+)1e>4rymgtrJ!>*bBn3vk=*5gIU@VrD^du~07IYJ-6<>C zYC~PO+83mz@kN&5Ewu$JGgS2)6e4Nis!Vl5@BKb?g=wn?ng?FkD&3R5X3*%kr!94R zGS=zZr*gy&dleh>U2_b5>-T1E9PU-E#=qFCuaBHrI_E^H(Bp=&UuD#t`dp8jrgOcPeCVQ# zx0cDyWQoV68|RgWT;4WjDa&MAMMU!f-|BRe?cYbYRW0sjneI5Y;NhjbaT%uhRf#{} zI=+-`wzFn&=kwO;46|J)lY6>8bhFKO*QStsEjxvIL4BHF(AZ@hi^4OR8b*FK3X47G z)@YAg(!;Ubdtt*ck8O6DmPMCxjb~Ra<60G8&a;lZRg-D8uOZ)VX;%-|djF3F6IW}E z&$2FQD*6lh;A=FxO>!T4d=`zqPl>K}fJWcrKE4lq2k&e2k|jMpHhWzU&N}iH`avan zb5{I8CHh0&U(lCy@c!h^pr5o&X4~$pDGPkj+KV34{h`}MbAD?|=ojdAdm>fSIwP7( zW_%ZV=hoZEmLnloi~Vo^7TtC{@zvuyx8KR{)upKppWF6+!NZHWW?}7ZAL5@j7EReV 
z_fA*h&#jg6C+&B-lU_?YEwqKWR|mL$eR<|aN5n9iAS9lQiil5OheU-i7i;$JXI==FMnMBcvTJ%PX0q3aH<7BA+iNiY!1i}mjgkxK$!RYt^XDC^Q7C~ z1p@7+Y$8sd4I>c#zDgT?Ef8vK{!JjL{vj8lLSn)adi4)25k4Wwerw4|D_SD_J^sI- z{u6I;whf3_@j>As(J~=P%OnH*h1xRE_B^C+PYWin!J@B=FJ$!^f308`yFZUbh=nXq zJI0$7DUXOzWSBpO89a<{&WI6|5W~$tm znltmrK+DM%fwN{N20M?5^!4!$XZEAL==>JXr#odlM`aK?3A**hQx2KEul(cvX4%99 zyM#$)jK30|rb9R1zPa~|8O+?B>3XRvQ}4|ul3sI2kj=7@`}$0l`}ruT(|A*R`MFR3 zIv=`Nbo(M7rj!n`!7eism<_3$Cu6TPO6tR(>ZYtWU2MAbm0l{H$P8Vgm>R_TIggni zXt{{4i8W(H7deTkKb>Olq0nm%N>4g-njO9GD|JZfJtt+dj2CSqRaUasNlCJIT}jG-rOb2Pn{AkvGK#Vtq-NiYeDjWv-RKi7BbssI20 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet deleted file mode 100644 index dd4f930793d5e38efb9536223e4aa1ec1aefe431..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3122 zcmc&%c~nzZ8vot*USd!X1YSrmDzZdWkY$7_Rq6xT5fQ5hu3-^Dk{}6!wHB9BQK?p{ z6|JJyRt1Gx#EuKp759Cm&dhOY$HC>)>e18oXb0!sWMZZ2A9K!}>35R5efxKBe&0#5 zloBKgqKD|6NGCW0PD5t3$9EcZvDe29#~XK4EIfHdgy<|P{V3l_l#6mrD0~$*o=Rbp z24LsG(NQ3T=d?SBMikT`kUObWb{RTD*5qkA(=?G&3d#*cxs_;RwTjnSQZru=Incq0 zPQdR&B=n4t+&VGA4k4-(HjYYRlU35NUp}P<{u4IG!mbvchwbZ-Wy~znXB*6uvvg*i zLoQw9W=+mC<`?M=mPC#d<$0oA%4^SZn*YM%E7SYCgLu~=QQXa2F5!Xvv5}8nI0Jbh z3sBc8p|yD`{5JN-_gC!kc&QKWwDd)0KUcDOixEpExx!~v3yk0S00Ay|*p)-Y54lL+ z`7c~%w8jr(n5PKPaTGXI|6ZhZ45m1{&+aDotfJ%*u^lkEn za4!G~V*-ZF6>zcM8+Ls>;Nm(A7xJ92EIky{7e``5YaS+)^n%?=7hFrK!=|pq=o!#J z&eUm$S92{~7stY@`3wqn4TRyo5=~tvV8v7omi+oPqN=hnWxFTNPLyMP>sRDvXM)Dl zU2wgd3l1IZ4Po7I1U5cGS)3P=>Ni4N8-xvx?zXl4Sks%D*a?=*|`&bbE1*_M2=k*eW9(8V|$ejCvG_6*d-Y*RR@yz znvL)_ox;=E(eSz*fXf+`xY2Gvi8&QjWt;G9=25JQ*^2e?;kdbmBTq})(7(I~JnMq- z;Ng61zSalc250hk!%xIL>lQKl5)>G`5WOr3qkmmQJ~5T^6*cYnxqcw|#=okI!ZQJz zdv6w|xUYxX#BcfJO^-?Ipyfha!eLV4c!PWx*B3!oZ$U4d;+j|PCT3?lqR;&sS90Mj 
zTIT(goZNVi1iKY*)y6E$xc>oX=Nm4oU3~%d2c8g5BywR#0!dAr0~vLw3C-FlQNiCmxX23h8@blKWNKVsL2WTI#r!r2ex_#$y1V!VFAiQs>9zJ2EnnJXMH=-vJJ zr8W)@X;H}U>CgQ*X)hL?+|OSb9ggyYQ@Ka`WaQF_W2kHxOLp~ogaXGsq^@i@dJa!O z)43kJ&w`zBjCzXiCf9=>6383HVNev63wx%lkUv>&!arN)!%c7GKHW5)t8NS;`}*wV zYfd~P6Hd4B;rn}c@+zCjZF4)pzc(>d=C|!5K0h#xKMoVZR%fF0POQv%?kxE2ZjyJq zM#!U|ctH0B0^b_SyU(&`Aab7_b9QkRm2N7vdv{Hbp1nMJdwO~MX#4o~?bqMWe?UNB zQ1G7yhJ?N`XmHq@LxzUGHB5?#93B-N6B`$wkT@b~Wb&w#(Ql6#n>sEnef)%plO|8m zWn^Y$PtBQ@n>RgQZ!i|jC^VUiW)_#sdZ)|mcjvrUI(Odu1@D)Au&{j5;w2RyE?xG~ z^2(1_Xjgu+s$bRWHJ`3sSG|72#?Ll={>A1kTWe~!ZLh2Ua>rK{pa0#_U>y+ zZvJNffrEz*A89#y?D&b3r%syr^&Wh);9f8qKpKD;) zo^2BHb4}(>+4DMFH@QS{3wL6VB4h#VdElG)Nh)82w3ttLk`{^0MNkJK@V~et)2TxBM zmokm<^~p58S$TA@*o(9np+Qix+OsSfuY(tGgxYM-s5zPU-U^|k%uec$_D&qrqF%$)6aW|DIV zmWQ~qH0#4oSF!>Y%mHkcR#63GKPaA$mVNuf$Mstt<&7R5b!y9$`%7=R4ZGP}^kW{IcLt z6zyL*8#0^1a7)rbx_r-bc-%A*ed1JOC>S*pc8%OcFRwJo;2pj{h=Wjq7eG-4X{v-6W{pZ*%<1T;6nN!fS{4(mTyNBF{Xz;d-c=)-k zo)S8EGkZ6kfu_zUXcQ>%lw&SvPoNQ++jRs!mSv*uf(G?KFcL%}|5X z;a8X~UH3ukgqZD5g0N^JrgGSJ2(bMLK6ATnaQn_AMhr&axwHd*+8YQ)DnCda?u75Z z(FP^G9n@E$-cWpWA%57-ptG}1fR~pY-8$kSw1^ku_ll;&@aZ$*$mwB}c=c0cx8egZ z@p}s2#54irEwUMq`Ii6PazAYY>I>YJ2M#bg15l&qa!@C3kW^@7kO;z z25^}+m9lzE1fc^Z$Zlf~+?v^qvdx7kzQc>T$qm|R6^$zS6| z->t;=7Ut8fvx1S}mKWBT97aRZGH~xjXI>WdIpY&#NAEvPQ!WqJSzWA~DlE~gL_Jg+ zf9#y^(6W1%VCy=6YS@V9z)Qc+Uq0$MyqA)R4^+N}+BTRPd7Wux%;t@Up1xe1F*gRy z3d}K({q1YGK76sgCbR`-uX>NaaKi}3Z(#sdFILk{@35lNwp01j69Vb4gbjSn&ll18 z!v)xSW2e~TTm>xZJ75!JHvvgE9j87CYoJu6w^7(ITHyUm#Z(=+iw0^mbehF*Nb=o{ zQsx|_5|073@!Rq6Mc!5%;o2`cG$d0nEz%09QiafGvIi}*D5rdy5H|RSj2ZfAGhSwS z$Gq6-rtnO20*-4|qV_S*P+3=rpfs!zFYZp^U1h9rtbr}{MNu}K-?|r_Nsa~g(@Of% z&9V5Z$1!x)vJm$cS{jyGc`*jH#eAFUXydNy3z4C?4HX(C!`ZsOQyX|AjEcloXnpot zYH9t~s5fgPZ(M5#_$>IAS0`#QtiQ1d9QRBz?DVpR2?q1go$r6ZX-^Cx*V{<4bRCN> zKE8)vys=U^>xzi_{)7e%yS5pP2vy*V(K0$uJd3we?1vIVw9M1VjG0=&7cMw> zMttSP37+uT1!3xS3BHD93fd-4l1=WY z8(T6_&%Z<{kAVkegsBzL!hFC2sZVXea!O$C5Ybt?d8P|wWU{YQ7MsvO}{D6mx27q79MQSG4W zsxMhNf)~s$~HPL=qk4K-BrG6LQ-F&s(V+(=1Fjk^Dfkiio1PQ 
z3$?WGZhtLES8pYrE~{9r^>X{lZA0JAysgjzJRoArh)pA{%TL7K$?e1*SXnfhX-)mB zh^ZY!<{dXJ(4u!4?d$sdRg@{Or)KYnoI?1z;x z)ABj@k19j#3f%I6W~}#)`8PeAJ&D=U$73VUG1})p;(z^!Wi2DES*$PkGhVHb!nEXcuJ|Wp z(W%K<+5Cjm1g?6LQdujK+_sUNw~}>9Wmj~a#cyBQAJ#)3*7IfSB1pi%Am}Hh|33FH z3EkFD+&g}yX^{&0xpGcODj1aEO8@V7iO4UF48~NLvF!TjKdbyEFZ+Phg!II0eRFdc zNp2!ppRMfqC{p`>Z~jl5{t;Hoa(9sOvf`2w($p+XdLTRen^wxsTDjlChx7#Cj-{_A zoaO0-Uu{h|xnHk>#Vk+h#c9)0-L-J-7A77BugA`c*tg;A>m2_lSdbW7T3agr$9>*k z)Mx!)9`9fFk|YOY5@VS_veHHjU}-HC$wNZsgip~wENLz6%#piXA(zYNk`G5Sv) zB2n%ymnX`76LotEA~1_v`O4*i+)VhvnQ*#g7H5GoB%hNv&e_?MEc|BA;CSC$ZuH{@ zVp=X&a)Uy}vAT1)3&*F#y3KZt4WFMA?>{+36%mreou>tp_}+Sx3_5$hIwK?qLVA0; zC3p6UepY0-dwP68qFl}C>)aDKgj9ok$h>(RAH)rM=(^DL@uZMFMYcNJJwp{SJ0mhe z2c7tvtH&1<_G&yNS%iKQ50^@XCO%+(Hg_NoRC4k4@tlJWjlZl)VNpdcie-fhpQplb{r%56w=*YQBuE^EjlP*|Ji^#}znba%k zKdefHT${xSnOVsh={Bw& W<6XzQSSqanUjD%m8X%bbtNCwBcqY97 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet deleted file mode 100644 index 26c4d1bf6cf98bc32a1045d139db4a38b76b8904..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2862 zcmc&$e^gV~9sl0@-b-FE2x4A%!AOIWh!!MJK*Wf=ARrJhfC51&1`;8fkY^GEtW~tN z!_C^(qg&f~bUSt3w4LXu^qh(~T05<_J+5`_>UP$aZf$MtY3-Sw&DyQqy)Q;(>K{92 zXZ;@d-uu1ZANTwH-upf`PN_aeUiVD$yWn+TGor zwk~(LOTcc16lwNN+W>&S$Qm;mX(HeBl%9fwprKW@Vpc9<7`SIe+*V z0_^Mf_^ubx*E8P0xxtI-zPHZ8$xVMoul@XUl$$K$JwXqAJ-m<6WE67;o_Y%ozH|*u z0|zeptrhLBj7N1RM&P(5gkBjqhc^u~u)S{?KG#qNUld%%;`0^E-j}sdaHI(h?K=q{ z4DN#OwuP9q?2Xt~IG6M7n4-FP>M+Xjjk0^Ts8kIb7Q{U%Pl@xVzR&GGaS3!;h}(Ou z3TyK)(<2;%is@J3OzuZ5>ablbl{=7)|zLfJR)VcS7qY*r>H zlmOo|$R1s>Q+4f$Fnn@yGdz9j5ckt(mf<}^Iq2AoVdlPH{)<`u#s#K$G)+By%UXOW z`(5>h)&lM&^_QIA>S9hq7>b_ifQ_Fm;*tjXA@ju%)ueY3y8L%?*WAAV>w@`AO1~Bw z{<#&Ie{vEoRKJ2ENgCuCwQ+ySwxY*c3P7`KGH@BKct-JIuH)4fu4+Vt9fupyj!X__ 
zzMaDDKl2@WF8K_T`P}2o1LrH?XlN-uR$YO993EtnTmQ)PC47JazW4B&FCW35uiVJK zP~$}W1sj$Vj-up1klZ5Sgj)C&n<)&aZ+^nc^$=Nlkc2*DOgb_!-@b`ZF$Z7u6JbEj z@@s&6%`B%<^P0E`+W3hHiIb8h>jZtWXh@l2oSJ%%DQ(*HbhBkf#>`pwW@gRKw&vvC zmzQ5Kr*Ll3Jlp)@1tpTbbYWR}#iGipYDZ1&VrN}_!;+756MLR<0uaD&Lk_7!t@D=%(M&x6* z6wYp6B|$VM8imJ*nP~Y-^4-uZ;Rd$(z1^LWC`O+~`Ztl~oZhlSPw?#a=y%!kP6Zma z<%6gScXW9JZh<9GZpOE~2xg<>`N<`ucF4!#_$~{osPTBXrFJ*_FJM9Cuu*STRL+9F z8kp=5I}>vXiEpCum5u+iSkUtZqrve%ey+Z!x8whEW52hXq#xAlc?++9crt-1gHhB< z_4SLJilP@w{27}~DPOWnl2k|jDOF2SuVn9($|b2&D(zI(?8Kmk&PpYzl8!`pM;d_JUEgl@@)w$d(pDjFkHAyqo4y<*fG z+rO?-={IRMp;+rBX(63f_-PO1mh!o)+H?CzPx7b{X>u>^KS?oqC}hp9c#_OXbwuxy z?7h)*Qj(=!#70s&8Ff;MjIJxCy8`7sWFMtY>QCYmS9&y2OAt-6JG7?98_DXBd4Z)2? zsen}$#SN@Jwbr;C)A^E+HjNDgrF3er64Q#h!5Muw)7vq=YTnZw}Tgh%L;ox@>`QjP{6 z_5kvZH&Dj05Hz2JOFKP4G{^-U9E0FuniN*Yj)e5(!7#2f4W`Z+2%_~4@cH;A*kV%v zFMIDr=bMzs{YWD?F4u$mk@JwfZvH z#SFQzgm^YxOl0d)C zy8%OsUIMo!Ke&5uA#D465O`$SqrU2YBd5gcC{K+bJIftv_Rl2EbO(Qb^Ekd?PlmL1 zdz0hX^k`H!`%B_wm7DN`R5dbpD0qzjdt{?=8d30*4qg8Bn%$2JPm7jZoFUqUevqtl zJ}DeyvbC;!+>Pwi70B1IR3MJJLuFp0MDJZWL^ZzUiVmOkA*O7y66FjZENHQ9BEGoh zNxUE8XqRoZo7gb@78O;Z;b+Ke!P9Gwa5jH7?0LL_m@>CjxWtw(XnF86S+M6F)KN4- zBE2yL3Pwc=X7y-M^atxv+~SQ;e03mfEA}D>Mz~VmrK`nO#}5%{vG-8-m{%x|uWq0e zv6uMV-jTw|B@3jYLvIl+*N38qM>9#qCtpIu>xB^c(Wm^g-yTF44L?Ej(RI9ILnNs6 zQZ#YNHo?JGR7=!c_anp)&O_4A9q3_CI!SDutGM>Yd9ozJoo`j9aGPhtUt|q7Jd)SG{fh8G6!2$gM#J|lM&ZSh&*0E2?ZlDNi@e=$xY&BG zDkJ?ScJXQl>3JRI1Jcp^ilv)nLHy&|M1gZa5xNrkHs82Wub6bJjF+4OsC&f>n_&B5 zf>@hL4EvG*ZJypb&|fF|?qIDr#-TzSrd|xqis^De+cyrx*&-49828K;Dgn!2mGSTDZHZaSmDPD>cRTWQ)FVu0OFcXjan+T zw!;%3cY?)}G_x*Wy7j-9dTL-P9lNa*_>0 zPO?8ZOjUdr1)g}1E+7=ihdvv2x0@nG^MIh_LDlmY@F;tA^s(v%i-b})eR)vL!a^&# zUrOh(nng=(T{J~X^0tP*5WF584% ze_Zi}oymH%u5sOP6TEhzy>k7|4a4&9bXvSgOjD~kPr0YRui`u{(mnAcGQmANGuhiK zGD+X*!{8()^0O((tejQyJ%?aHD*6-zon}Scy|OFs=3g)Z&!)yeB>Z8s|ok8$`K{X*0KDv3Nz`a$6?!Y$+=yP z=8>+O{C)$E#g>#*`4;kjo0L)0oNT=%EM5OJA?IuMFIAPeDrv$}-u2JdMNwvI7F+6* za>1ly=QDf-kQYv3c&fcUuV 
zC*ZOl|D=}3J(*uD`&fe`In^)2mi-rZvC#8nzr*9N%D(N>CLkjzD>={7(cr@0hg1jM zRb6hl@V{vPFTDO4WW~w=kPC7XQj#(aG>ID@PvV*W$x|7=!yyPa1Q0;TpU*ojun0fj zm#}hw+z3t3g7DW^ZBowN4%=f9*hBv_s^K)mbt72)(|t{-J#yzui{1McsJtjIbg+0%V!12A4h6m1GFlPh@fyrWzTe4@*>F35q z1!N`ak{JW5&$-8NFvaxqk#E1v@?+S+0-X!p5>Gnb)8rYV0*o=yES#0+gbMooPYmPojgyV3KlKa~!Y*|gY!0$xwWHGF z{pR4Dj9`Uuk_*`U;}9*Eg7-9>CXO7V!{;%KroeJfT(GPbmSMO|T9i1DmVFL-Or|y; z`{8i1`q(~JWv|8}338WgL(Z)Hq`YAX+1bMkIXT7xm)S`L_!=OaxVWfOw=gx%$J9->e`FXlozv)aOQsv{oO@Ru3jZzh1kU}=_xyd| zIp3ES3PtI>jF04FO+3dV8VJ?=zUAHf@xQIy%!7~Sl_``T$;)}UW-^jP12R<~1wawU z(VZiNgSMGyc$qo_@<_ExQEqlx7F3x%RXnHUl{ks1I&(xW!db=de1x3oK@@IlP(@)s_VnE_*yk+ z>==)0--Y49+9cfU9E*x^v1He77uL*=Mbf5DxPD3kbTPOW!J+APHOdbBjmy1zA+OBD z?h-XVyc>ySn+39>HjJ~WGjXCl4X3sm5$~9Qdj-AZd|N6uXjEkEkwnOD=@9CgiL3@0 zF195?F(wW%v03=m7KwFblToue8?(A>m{@2OX(Vs&4kUWc;L$Pz z5^m^lxqJhz_Bv7TEr!2wCmt{P8k>xJusv@YdNy<9(Tblje$^<%w@$>}dn>W)$`~X% zqsfEjKgdYSb>dAXa61#&qr5@*J0ykOYG2F97;Ke{B}|w1t|yp&f_g6y5=4n^A`2PS z(6BHoSrxJB?eY^5@ej^J zTb(Cw{rTqcW_M1?$aSYasy%Jb+>u#?%rEWQYqhsk==i#f` zb0ai7C$(kH;CHTYR6ALVT0Js*wZ|J`wOd&a&zFPD@DPZOrsUw1lS<)&Ad zwM-@60h@pvjS#qtq>DbB;^nxJFrvGfs6a=53uM`Y|P$^_riab44n8m1ULC9ra zdJcT)NhCeY19K!4EFG`YO`*h${2X@9_OX2i+o)?nFtJTv%8sI3NMq-k@`?HB<;8RB zEQYByr6omGOztS8_};M5P4t&7I-wxwlKhDzlMS->loe+|fV;tW@5NxrMBL`=>=grBaL=e19zEWs;zutJ*~+krbLVEWA&#HrAwhay=Y01 z)9k3cRM7jRdlJDiE2<-im6VLcBqhF>da1)uN8^ZjGJEQunzE~rG@*)fn`;->S-le~ n-0lhH+FF+{Zi%(lQ|)rbrB6;xPfb&sG(g`Uj>v#Q`Xlcjk_>gx diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet rename to transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a46cb40fef1eae7a5d0207565c796ee63a4a4fe GIT binary patch literal 4466 zcmc&&c~lffx38-1o_%0IW-C<+Lu zfXF5&TO?qV9RW8mF1Vm^2b1^|OMU3!0g&15RLbs@Auckw^Pp}^Ozy}+TM$xJ?S>=&VL5Y zKa-(DF(Ke>!%>WBFrO&P?gf=i@8A^RhUT`J!cu20T-OL;X!PK-K;|_|d--VAiHs4BrMY;#2YO0y1dt(&gZBdonoKQAOkw`k_O$X0UN; z17=Y?iY+_;1ela4XQG^PFF0qBAV#G5K*Q+*f}5vbF+iN#z7}SJ#if{sy|Ir zyAI*-ujFM5ECQwrzQ8osaX|j575E7MB5|r_ArV*=#VaVU&~T0PhQ(`cqedhe zpl`I6zJ^!>sy-Y*e*bEwZ&4*^wXfge303Rh#nhQx-V;NRFvo+o`o077KC~bDZ_WX` z`*pyv-A1^M`(%bm`W_a$wGplKeF5*e%wl|g=LrmF8L`e+ThW)MCGnVzo~ZWGbok~> zG%mVy8@PXw0zAI`f#`YA0DA*}1KwvcuqHz;Z0_?$eX79|RG9@R|B*Rjy}kfK-(G}o z?nmON=wG6N`4{jscRhlgsf`7Vgdn)ZgAw-B5IlR*%xYEd1ZUlI>8XK1lS<}I1r`r~ zCMH;yDH$2nA<5}1-euS54zWT0cIEn!56R(NX zE8$qGn=idPsRVEqbl@R&pP>Oq16X^|LDhJ163!UDhB5jzne_WL*!7BwNKh5Sm{?K- zv|n<8Z^3!2yWfN+$IcTrF8cWKOE_jaoT1sD{{?fKYzu718t8_Let{dF-2{0V4%kGU z-vMpab9&O04p0#pi%#yD2OAFXxYoUOgxXg#Kzm;TidpOr7dpqY#JBE%hm+P!m3h{p zxUCiR6$f+(`xTBTa7`dyxtY(or1u5gF4P&n$}FPG-uA<5-z6gL%q9)1A9jJ&eJ6EY z^evz;y8}D3qzLob{uC~mh%@Go`VeWShu}z-43C~L3544g!^lNdSXdjtG9SzU7rx0u z-exyA-+mU$Sm3J(eWIA4Pqh?on6MMGsf392lbFytSB*A^2GvuIA2WNaLs4+G8*H35 z3U{<@V{Bhig4VP~((V$PD2SzpT}X)o*Ye6>Z$uEVyyS-8d~AsBTD8F|qGZ&aEMjlh zv?f?bQt7&Tm#efqTmjh{4KSG-0j}iV#}3eRxG5T%aDUuBZ2i$Yuv?x=Q?LC?`v`yly$RsJC1Lc(t-DUun>aM%_6jd>)f={DWGZM+w7idJ7op z90ENpRk+Hb7u3(@c^>~4wh=#3<=?;jICZo_oVv)Q9AoGjkRA5>We9f@;FSs_wL}%1 zxe*|S(B?>D62?$B@NPPiw297};*@2RpPa%Lnup(L%HOnARacrSloV{#cFba~;bvW+;I5<*tMKD-RgQKf5s& zjjLFer;H@;{3%ZvX_Mn0k(dhbLTvTq+q-ec5kv2II zPT-ChAU)@UGt;!2U5Osi2yX5~rX8#-&d?FK5ucoEJ_BJxXucQn4r?&F+TymYN3fgxFEmqz>S5?@#N7r_{wsZB-u6+icWy`v%i+VCmgS&H` zYmQyWnJpi>-BnY3>9EZv)Z4I&?ZlI}jIzKEne z9qayv?vgu|B4mu)P;~dNv!v0-bXQY6Wcp8ZSM^Ww8x^`LEN+a}=OV#oB)(=MemsLD 
zynpro4^jUPx@e~*5Yptq;i1uie4J$e`skyV2z_n0;}dMi*8pINgrC-*Pa8A-l#f$( z|2`Q$;?vx$sol6xc}moa1|lPjpO(HR>0g)fuMGS@!2$tKq%Cs$KYqjc54~Lczue{j z*i8~-m0o@z+f5{Vnt zNPQ?(xh^rK^vEFdFU^9ymd1xT%!%~z_6Voequt5;)~zHPr9WGx6EX=>#{4OV)Y>Qe z<-T5)t3n*ZB!Sd^Wq1+}DL&)6$5&rbx(n5eL06_emQN%(CyfjAvW)TZ{xZhbTM3G5@$EcUTKst9)Gd3vfgB|Nf|G_L}Et`9iylk z$b22%?e=l}o! literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet deleted file mode 100644 index 1a9169d9f5dab747b86df507d1750efb57844174..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2537 zcmc&0YfzL`^xXS>`#{-5;@fs{CDd5ScqfDmq2KacToLdAizzPb%7V-8whut3QL>z< zA;m^#)^vo@Oif9XW)M(N!x!d*rnH$Rsp;V(wHcgJoUD7lZF#8tYw`@-@7(h~=bm$B zbNT#Oj^zTlARUJ}2n3L-dfsXX{m?ak4F_BxCyyZX08Y$_RS6I-_3;#g7y$T%Vsc`H zFsPf5ier`WAP!I}{7MW~W3k!bG;>&vCccsbdDmEHi23a{l%8kXPHjBH`DkRW~ z{^+TiUooy^zsQlReWGTh%TLSM}} z3CYP}?B~z+LrYK<-nTXlLegeK$N7m&(7O95?2UskF8x0IP}~ematveRW`WdQ#~xa` zQ9Q803BPo&f%i`w5N+AM2ybhNMeS4Dm={0(i+SbjuS`nkB*~PuRro;kCCRGA38F5^ zEs-Tg&zu4$ByBE*)z`8_fwe0jqP0Uj{t`m{e~|Vvd;~EwUuJ?=_|vAminv=Sg1+kD z$OuoQU}-T4xPEtv$cK4w0U=80L#$+CZSB%o;fkl~-Yhw4P7GnH!wPwkPZe-qK69iGc$yL!YPG0_Po@aVAmI0~r`qod=)hp0q(_^@6M9Bo*~ z4UVqGWG!=f_lO;bcLMl`5s~ zKVH30Xf^&XovMCPoh0v-$+220ACLtBC{+r7K0kkML6Wzz6isY2g?L`W^L!rpr!br6 zt9eZspTYBGd}^6M(-46<6ielKE&YZBH^A%5=47owvR36h?ZMenKA^h~D* zG0pQjdT8|YD)9LjdS6}=mlIo3u+VMHn6X@6m~W>1%3KoPiVAWF_EH5SBnSdtdjd*% zBlMj`1@Ts6W*Kjw`ht6s4gtOIp1)uLy=T+Io4OFXH=gB$rg0ex;%)lE99vPLkUH^~ z=glvB{>XSpu?P%{hn7-fH)bw$Q3kKoQF%eA+UGw{E7Y45n}DzMJfB9rX=TJ_?~}1Wgh-$&T>#yr$awPC~F$i^%XoCA~^Ql3u!yy1p{Q zP5cOXQhj2dsM4qLN`fNPZg9NnHn}28?e<86!(poqtui^BVtrr(`!gCWF>2H()_3plOyZa1k8iEF-rI|N>fZae_dfToIX^tbUgi`*1yP|M zlmZ2+0A_7I6-xar*H`oX^po_q{-mV8?|Jv*%MX*5mL00-cz_?ZFQv}=`%F3SbZFunPu6C01VzCO^Og%0gERR 
za)TwYq4Ogoi4hbgP!&8zf?tq97Bb-coj;KY9XL=J&<&670OS>HVIEkn?3n$f2W^iGa@S&#Uwn&qkIT zMvy!UpA-(gVsMcBEjuH{(72G~n6TKS`Js{|iDD!@hzy+{5*M8u6D!O2G$ts)3x_&YAw=V#bvN7<(6$N;;Ngp0{ z8$-xwKH5_m2kXD%gMNND#Qjqrz*G%hC}FVSX(R+5_z0IjRRb%0GF1ADp!_)xmK2H6 z@&pBJp=dI6?l*(eB`%;7GZtRV{DdynkAqxIAu?{$1LBD(aN?$c-3kKw>h*wYs10g- zJLp}=gUx}~5Vg($ruQ#|xoKL!-KGZDX4JthrBoPc+KA5AX(HYB8sM*U1>N@Z5Z`PG zu|Er-O=&J{U7!i;KO6+7f-nf$s{M5{kCLcumTp;uD4`^Ys3hXEJASROw zxh5oZA2CC;Gx&=f8Q z=fa&&v5=PJ2L+kC;B(4J$akrN9qzVpuLwi$mk+|2tZzW4&H|pl$bdc93_&ke6}{T| zD^d@AfRc<5#K-D_^X3_lQ162^MKll_sz1S-dP{V4Y=JySXBzAowTB2&-vN5_^a;;h zuTYEIS?HgbL)6UglOJt*0pI9m4CYrKKn!sfYu|Pl4H=%pEaD!s(|RvJ_sW~-OxY7; zF(Mu-jSGdLmI|C}WJ}f*_CkHfJEQ|1*yIytsM<{t&FX4{c7p_Td_^C&vIU1tX>+hX zZzp(a{Sp)Ib-{OZDuDF?Kh#vw4R^A(!RHML_$brmn9amVWMr-e>+#7(WFC2pD9vQC zyrtjD6~t@EMUT8qmUKM>{#Zorc;}8OSYmiG)e0`UZ=uKHzTCq{6G%QN!o;P=;BAc? zC$ENPpDz7XIFc|MYn|qT@)EP~5%UJ{H03)eCh``x=;t-qlLgC( z=INe@GiZawD<454V&br}Lkoxx)CKa;f$5~AA(~fSTgRUk<&P>;Zek-1bl9blM(D!@ z5iTCTl&FcDho!!8L6_d&R(Y0jio2#ah`SFxRotr4&32b6D-?aci&TsXkvTtyCFl2q z6LXuxU3#^ZQ{z1fwVgG?=9I{D6UG^{nw0CX8@KhbWsZE6c=`QU-gl2Uep~FxXkj_% zo20Q*$P{S!oQKU>(#c+F7e1r6O-|G^9@QVlBGojQmTtq4}$eVbg#X z?Ac^OXnBs}nC5JjlRw^yEew2t?z&Il=-nDb5ppJS=NhfqzFSuExUJq;(}OYS?a>%Q zbnzy5PR#_bgO|y(KeV7;$s6!Lx)tvjt%y4N{INde5H)3PIac<-0+V}v9>Rt$pttv; z2yFKf(QVuF#1>CoQa)c35B?B}p*3Ed@Q0(}$zZ44an)0B%(H}@B?%r`HN_Cd-+xM~ zjn6@4Yzs_7V>8ZP^9)=|R>0`YT9!xBTu%H;DcAGyB>59M%azDg3ZR4&cu(zB_G>>N zgX|~6Pfc;`-Yr+4bwUr;p3{r(x7AkG-~lOmU@1$GKV$ zI(g&{fgPE1;sj5_Dhnkh3glws592MR4nMkbN~F=z>a% z3zQFzNZ`~q&%*;kIPpRwv0wm9GK^cmXaH;Gbp2yNs%;c-1ZAstbODk@nGl}`EG0k33Me(?0 z3Qh7A4+?>^_p?f~R~2WxoG*Ix;up+gOc$iuDhifvrO?2eC)ndtk74?41^jGR1EEy< z0<^sCv4MF8#A5z*)-Jvi3ipzd^u44{m3pUOLfuvV{fb2}r56&+j~y-5TjvdiFz>?^WzbX{|e0=g=)zc=(O zcP+Hvy>8nW|Hg=cuHEbNO@q3!L=Jm46poMRs|q;2XJhe%_0^_PXR&6PB$Wk>9JW-Xi%Vmp>TajaR?(S&?7LnIaRDWMNbEkc_x>L>8YVRtq z>cH-rZMA;;gL{gcYV+#m98P$2y}LGl-~5gYu7>m8f`*XOxyEx&?kzkJ{*Q9EV&}S| z=BWP0$Qvi?iVwxbcl7**9?)S}B12c5Q2!_N?2>F6y}F;0>dn4z^N9CD26wU)%y!?9 
zfqBZnNNGeh4K>h0Lp2m%t^WdbemetIBn8dt++jb8QledhltNXX6yUi?s?c5RP&sOG z#QjHf7&qIPBB;9TlZA0*&TtWo7gJK9;SwhG+2{jj`J3l6J_50ApY=z8NA&Rd8YZLn zZ(qNBqa0+n66Xpm(|8j5OnH%Y(pz4zW44;HZ&R8|i1S8`Nhz=1s)V}c=(^=urmKdz zZyOVEFlk72fk)x^$g551YT+}sPf7pv)sR|*XPHB`z-lo+(rfp$LhIyr{DnT%-qpU% zi>WBzx;aNuUcaNF{To7h@~oB!Vgj3^hG=vog=nsnGv`p8NUGP>OhcYLtnna4Q3`{U zl+%|yq#~UdFoMB&`s=Io5fPqk=@wGp_{8}z-L3Z&l0IKEM0-5AZ+`P~#1_~T#7mB^ zRWbB!D_9hAYV#g=5U&+XElP^KR5`2Cx4kGi>e@cZsl2Jh zDKR$=EgHPqUYr_t=h#a22-}jhML(R%G4wlHlAbttvCJjkcKhPwC)XOnuN~dKB=y;y z&b1@HEnS-aa2nIS6R?xA^6e6;Ix531)Cu$fBYi<@?IW9&G?OFaat9CL_`=y((I&PkwZuj zmuD0FZ=Ugx`KD>1KmD($m5?!$iIDR}hoK~YUZs`(O}8+bZdEkUjt%-J{2d`t{*9hQ zhsA~`4O6gOOt2a=IBIQh*@~tle}?!Q;J->;w8095ti+IruowwN&_qoi`vc+A*Ywz{ zW=AtNu)>62*PmhyD}T+H7`@+5hQcV8hYe#*2xX_la6*hm_;OnM>a>4t#=i{s|0W9p zRgtF1<3G(t|401qzw$Ew&t4_L+lc~I1CHRW1s_19DO3=9dCl~jJp5(}cM_(ISc&b$ zV(~2cVPu+EoGP{t7dwl^;bMny*_u7AFr67Wh{bM9PtU*f^h`EPWIQmA#4{Id!4@;jgZ+GyL!Bo@1^9bKF#9o{bbRRx z=|<+yLFR-Gf|g-_vLUngm3?BMpH*zAOSo9V*vsJQbZ80qGWVK2o0+>a&2Z|n(1+uR zqSx$`Bz{(L0sb@M0{vyF(|EIn^K z5Zk8?pOY+D#){Sv%PKi+BujF5U6y)4jB_%LBg>Pqr~T8W%+U;Mf>1kNlCUT_ENN^= peEe8RLPA`sc1l=6Vq{#bwz>5<^KoV(4^4pI{-B)z@TC7s_AlGe7#vP)O@F1xr0QdUrqg$PlC$fZgV5etHqCPkLL!Xg-r*ho-`MxI7g z6p2O!1q+xcMMQ(WL~O*DC~Ayd@!>PJ=$m_2EQ$W{zVCf6^TEuVnbUq}hWi^L*(f)L zVtko^a7N5PR{)zCBi5U?)?Sz=jdsx4FTG>xu(iGU)|nf+8D-zE-+PVRrRojD!=upG(whT|OgoE<3y5(K2gfC9=y>`h}0zKQe$Zzf|b z-()Nockal==#`EPqxkD8-ZXL)P!J?p5Cp0=>NN~UlSFMowk9b8mN$d;vIBr?H+W!5K;hj~h^_tu&b%lG zQL-0o9Ib$F+Rb75N*3j18WU*>FKF7PhC?O6V5@O~2SZ<=lU048NGV57_4YvBb_KCE z418x%&{Aa&l3q5@P3jA$(#&CTtS3xf;15Gu(_n0_HAt3qgYyxUu+B6GdbsXD$19b{ zuAu^?3kHK-!*Sqt_JfRjGN?Bl3ri;{Vd3-dAh0+ICT_8X6XQfs+PWQGF+@;v%m^-Y z=?43ndIGgN0Ms=dkRM_P5mjZNsBnX|gd?o(UIU-Ktb%JW5Y7cN#FcrUz@nj2P;3o` z{99L1T6P!MM%jZVUjjwWG&EONgY#!W5cyODI}4paRVacj#c^=>k{KL4O~dImY7|;t z2KHG;;o0;cu)E<3XX1XU|Z=9 z?GNU_hV#9^KBFsoxb_jUOuUA4jtIC6I|y1F0hv`Jh?=@ew+gpx$YsV4LJ&}{g+S`6IzUVyPhDw3~>UtbJsva9VHr;?Ltp_9V6Ve zw+wPmoq*=qKcgdMw~@OAN378%!jszdq{PvOu2^vjs`frbwh&Hu{h&tWAtq?pzItd- 
zWupDFT8P=TBrM7uL$nMJgu9*{gnUadS=wX_p4HK)W_>eUDp&?D3p2@{u6cxa{{eJr zQFqa;13QpgY9qBKUnCknbC6*%*WEC^$3?ni-+hp}AiDHvC}G@>AhVe|2)26+huwcQ zym9kmS_&r6e?}ubt_T5>m_SJH;X>RWUkme(G?HhB`#?d{B%-5^MyH1y1aEH#YNvGv z)JUfimH7eCBOnawPg;@EMbD7K+-fijdYGiQAL%sGUP1k@&hd!I|ww78)&qKVel8W|zpBX%oNs?h4mIPVs*n zxSgkL6nG$C{D@E#4efWa&?ctDd4NB`6fH6kOAL*SO-#+qyL9a)Wn>m|g=Kf8RS#>M zp0;-O4ys;`PQCj$yYzKcySabV&%^WM{sX)|@gC?i$d~o=4+snj9vl)H7Ct0mXymY{ z;Uh+liXI&kJ7(;-@e?M-#U~^tO-fElO`Dvq$zFuCuV&$rm)oV)EmVL8s{kI!7ZYr$dGX zs%v(B_x-NjwRQCkdm8sP?c4uD^MQkh4j(ys?D&b3Kb~qiedg@B^Q}K!_~*~RT)f1! z@$qQR*l5P(nTV5KxylFY%YpJ&6%NvSRf!2_e(ksG8*lh=05@rlifrn_sc&Itx3RN3 zr|+IUC*k1UO1!%ao56jWgH5#@7`NjH5>EL*59eQ~4LayHcu2R||NAu(E6(^4MnH2) zv)>;J00s^-zghRBVUN{icp7vW#5hG&iB-8;a}Z< zAn5yWDhfS7F3L(sNz%kI6n+jk5O2LI9F*Z(y7}S~fCnLex7dtGulz1s^LlTNfgu=C zxHoT2$%P}~UmgUG(0Mctme{`y?_UV~XRshMR45hU|Klgm-}LnOzg*nE?Iy_ztYwBO zv8+TX1yCsE#%xs7&}c9H#geb3T>0j~`mroK4FB>ijAe6JzhpLuWs_O|WMRz@D-7Xh z{wy29k67*;u_R2hcn`cIcIc;lTwVL)NnpfaUiQ!7`#`?Ku30vm@BCtTQP^kIygWJH zJ;E(MdQ^5|(16J?qoY#zeN80JZ(17e1b_a56PyHYdVj){-+RYCD>m9ABQZFcjpOZw z@E8tmF`aVM$dSAp%6EF`LhAZ_CgU|fU0k$>HfD5$Hg>cC9mgA{#}_){-F$GdaC<8s zzLb7kV(=&(zrlut^ZJ5PPG|gKAws=zv2p7xJ;Q z4qtzqqW)0unx9~aGso!gehllEqdzAhS>6ikutFvEMgmCvx&S>!6O@hP2sn9r>>r!* zN2AvSxeXVWIVC$u=aRs2E^(Qe+8mpyNts!x+6)^v&%SPb)rxQ>!0R8VrvM`He_j6p D2t(7I diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f82d9dacab54b6721a1079f37ecf1cbdf3e4d8dc GIT binary patch literal 3317 zcmc&%c~}%z5`XXYF$Xh(LXR^r2vHdjIYsb7q(M#x1q3;w;s7Hc*9Ze+yjG18yzzo~ z22rD;cp%0EjPbq&!AL|A37&YwYhB~6iTk=I7+Lj?{l5J+^-aI4>Q()!s=un9a5mDN zAsIKOznY;L&llyRS>Ml zbh12q95TENf|vf)(zVP!m?)^9AleNdNmp8(5Y)c@)@;q2R@l{HHFURFMvG$}2s2xR zqUnt_!t&9dz=P-9(BS<`%yL}%&{d`t=vs>%`by?Y=}h*Z;_>amm?A$aLtX~#oTdnl 
z1t!yb-xQ-yryUU#m{PQrXCZ71h^A@_`%BI4Is(I>NP2QxAdKCx3MR~53zjt5fUlQ0 zk=bj56|T#cicOByph@vhVQc6BVejjAVXD|ke7@34Fs5j>nW$zos=D15{(2~rRMa(t z@S*cSRC@_=?q)Tt*SrKNsQ?~!l)@tov8Z+7gf9NwmLRku6Nz730NPiz@Yl9<5^b5L zXz{r~7KQhsOiGkQ;_GAt3nGOn_Z-24yGO*%%?SJuzD`i6N$jz0h&}MQ^@uX}SOx?2 z!4fO0rNj`AAW`tTS-^4L4qB}rE7U!oDGI+o$mCXanV9-q0=f_+ab`!O;6)6eCio2o z538~T^+lIK&44rL`(^dS9v@p%yTv7Bzlf{E&+sB0Z#M!x&!B+G?iW17x;>*qx zXPaE7MM76+i)Gu(xgR5Zu%0R-}=6676Jr>OF zc@k8l=b$}HhrsG$GpTp|eyVG<3pmkWK-m!qFf1fbth#m`-0G2UuNz&3^j}m6CKlUJ z0TY9eCSODDn#)M)dJPo}*M^XnMOy^AS52_-yIDwCa@fM_$HicB!yemEI}a#deUdm7 zwS|aVa1TaVlR}^0;;5p7_u=a=b!4V_50L7=6{e5cOQakJM9EDTaAE3t6zhIP@=f;~ z;jnm17?&Xe4W`@S4D&^V?=FbM4^@=a=}I(1(cZ1l`HrZ*Qj3x*)v%_|Z*bw!`N9QJ z+fe?obovHmi4w)Vhzs-dpmBXUtWQe>o^@*S>K#XP!|OP_q?m)w&QX{wu=J+HrG*09 z<>R^>y)_Yo+CcRNoWwQD9 zYT&eefXQKRCGZfBf$dM9q0E1p0E3TIHhm=nn_fOZZ^jph!dfK6(^EQV-MSXqjP^rK z<5c8Si!geVMF32RoJrv-b%kOUP)5 z9{oMN1`HfDc!>8kBNzmi~n@&xbX=SG>J*c z+KDNtX_L}3GPAOQ4oxO0Yn?Kun|UrH)8|Qbw)xnG!ETEEX9}HUDpoGiDW`For~40A z=*Su2BHh55yuQDUPne}ioSm$d>Yn7x$@+Zmyn-(Z=Py{eXz`MzMa!12Sh=dW;j7Y(Wt%o{`FiWN?d3amR_xlnXK&@cs{P*_`1ZT%nuFgT`r+`AqsNY)IC<*y znX~84U#P9SSl@8z$IFdBU1_>{t@(P(jn<#rZr-|m=kC4s`wt#I`sMMHr_Y|NPFpD% zWNbmfovXMgGcAj2N7`I6BP~}i&}L}43XjWqDJ5%TWE&XOKNwXx-|ODZ%m-_>8{_u2 zF$t3(prCgG$BcLUGMCf9DPl)W?d$3~5HA8o26KCV12+b616Iwl zYHsk0sHwiTy^pC&rA)OAZ{A9v2&#%JDP9F}}|x;eprZ&nv+oFd6mnOODs+ zer|k>XI64h3ajDn^X9QTnBqG2kC7N&RbaG~_mB?pbsa||{_&B^mZh7S7?A$+`XurYOno@Iw~%b*M{2DYbBxI%iEzPPV}|Uz?MgmYrqm?&ad{;-*k5 Q0eJfZ2T1_J@gJUl1J8q)CjbBd literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet deleted file mode 100644 index 4748c07ab9f5c77e8fc8c141ac748fc7bb24c158..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1354 zcmb_cU2fAr5MEp>P^lFXV9Q1p%1bN|>Q5?yDB^)}XwnpuCJ}AoyhKiHC28UxVh0x< 
zH~}X>Ja7OGz!^9R$6#hHm$(v@P+8gQot>F)zWF9f3p!;UJ!bC)Wuz~!2PxM7nZj#IL}>w9Itl$ zb1es`ey%X35RE3TXGj`fH73Z{La9DH*~53H0XyBSIh`IR5x}XXhCn z4w{E#R}Mq#*RFBU;keD5xzY2X*jPq)k4yQ--%cJZ(ck+By3i~a_-7eW*O diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..842ce2caacab4c58384e71071de8b40ef03e2e4b GIT binary patch literal 3138 zcmc&%c~nzZ8o&3xmlXntyqI7VsuB@dL~inR#F46h%sF$W-%0NF?Z12TeJ8PO zye~sDuFMcMo`53&4nZ3{?e-qKe>b7w(9xn{j~bI1z&J8ePt12<_zYhW0NjOEo=j+! z1^|d$5gus>DWu#76$~x+2Yd&)Oq8L?*QRA_3bPp`rKMY-v=%BOl)_h8k`rHmK>!a9 zcmm+hA==?HLiU!42=)q5CbV*7LaVG4hyCIyHt_GT5ea)U{VZ&EiPn%=q}Sz}(zF_r zMv{XUIoh;LLtc?SpG)KnlwN?!a!T2T6#s?Cm!?-@$CTNrnP!T+gc89&-=?Y$&7l7d zFYpf?h~}lbO+s6D)j3VdO@`ZxucCp9{FMf8gsDR zp}>0>zrnr^UFmT{A4r3FEemQv<@7tkq*W?jo@^_yop72y5vJv9C-y^c%sD3L33rG6 z{Yzn2=u}=?=_rZAZ4Y1$jOWj~83vO#uZ3xgHh}VO7w}QJC*80<%-*MBrP$^~6PlU& z5Z;IyEgW*~Hp~`#iqF*t2ohHkhkZ`Ap5PmX;d>u-ZQ?H7Ir&}Y^oGg##1eV zOX7uD-+F+1w~vXvt|IVR>_&m$<^ig9>`>r$>w8{bzX}*;im>h9e)&Sl+w!n^}sk6Jf1do#dFI_bTeAi$Qw6D4dnns^RpH{R} ziXSDwZABHGy7e5@((S`9q|Whzqul7-=V@xhqczUm)#F43@*Cji-xn+xbQ;v<7NC7A$HK-+2T4%-L0+G!-r!V+85!czV02`WSb60dxHYhN zsBuaIGQC?Tm{IA%3!M>;G{qWv&mzY5{NQndiMmMoqNqk-{J9%;eNuuHRn1NTU#TP>|2;HfEJGbHHEk8$X)pKeIu>!I}l`tY=^lq`>CvxfU5f1 z8??^egpz$P*?!!wKsX`Q8K&fkK!^PfIIr(=DtHe>;_sBa{%7jZJoz2RQm=1B?e#j8 zS+9mo!+wR!jxQCKCDfwg6S@5BJZF?49!#|^Hi529J7IfH2Jk0k4!Z5wR1{bL>Q+%ejwIY}Gzhj|S5^z!x@;XBfAlz+hJ*8|6l4H`FoLNKcenHU-t9uXNeNgX{oCN?g9O2X8{ zq~w&;H~uzldRn?BBU7uJk(He@GdEA4Z!pd(C^Quno8NqE_MEx%=D%ICVBtH97B6|X zbZObLh$Ml&bEAUuC?v_h4zllFE4g|b*cOEm8;i!t}7qN6b!P| z_lVwwQl!txC$(NzNay7gngqH$9Vw!?Vm4FKOODc8j>^MM%H>=ic^=G{mc?F;DGd_H z0l06OUQ+d~5;CDxhE)9*PqBgLtA3f{RaM{qSr5q5<^Dj%FU-u==`{?E`GvHfHv4AD`FpB6M;D*@u|P7)l0gnq}2wP^FM9$7e^7eQw6c$-Ws$ 
ziACD5z}%GN_-w-0$Kv?jnu!O_UI=Fd2Z70Ak6RL6Fa5&QB>#MEcowT6`kZ^54yF|A zK7Q&{vX3HzC3P-zOFX&wOl8s}`5RJ_CmT|exzw@0I7@y}3BBXNU}1VL9s;Gxs0~jv z5e6HnCi0w++3G(olEWK=jmZi<%T6TA@H~QnF*Pi3X2!@jF(#vk5C+*y@W&xqE(M>d zI87Wm&JOcaSe4mwPh7A>3(K$^k`^T{Ny|BxdWt@*2>apkB>LDsRwY+skpwbVqo!b1 tkP^lFXV9Q1p%1bN|>Q5?yDB^)}XwnpuCJ}AoyhKiHC28UxVh0x< zH~}X>Ja7OGz!^9R$6#hHm$(v@P+8gQot>F)zWF9f3p!;UJ!bC)Wuz~!2PxM7nZj#IL}>w9Itl$ zb1es`ey%X35RE3TXGj`fH73Z{La9DH*~53H0XyBSIh`IR5x}XXhCn z4w{E#R}Mq#*RFBU;keD5xzY2X*jPq)k4yQ--%cJZ(ck+By3i~a_-7eW*O diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fcb03c17aa1da69311db0ff5f9542aafcde37756 GIT binary patch literal 5020 zcmc&&cT`l@*5BurX|w_6jx&OSB27_>fDMrWsiGnvO^pskK&o^_N7105SRhIj1S}K< z1;Ij5KoPqE6~P*f2Fr^*v4HyS9byu{B!7Hsz4hK++*9}7zrFW4Gu+>}N!(2-f?7lc zx>AZ1r~#NX9ZhSGjO^{q;M=S}{<%h2IQ;TO8_#zy4!sFJ=h;?d`Yh{Pt<-rbHK#tB zKDawUao`{YlqRKZL`+zhlLTmvn zzF5TdmqrFI43Wl$P?%64e1e3pkX{3wT%;+j*!mpudDPX z-%6ezMv#0n&xBlhMejKIdxi!|fl&eR5kZl03j?KbQl(IO5E{5JASygQB2u1n4<_7= z=@k=t`MBo)1Nd(Xaa4rSj$`+hP4uSYojjs(K{Q)4UjrO2)vC*A(H+ zW?gvPZU_OR1!!+s6m0xP0J=Ht5Oq%%z*rq#Dr1oGb13*7`UqD)RRAk^GL-pRv;tvkuIvE&*|+DHLM{P^eJ{D?eAmLzoKPjudwP+bOVlwgAKj93l0|k7!A}DjXz6 zK}0GSvW!S*KXeF;Ryu&&2Nu+47=m5~3-;$q;p|-%=(s|{l_C@5Qc(hzsy-N#HVj5qnZb*fYhdqneHay~ zhTataiiQS0L~#ZPq9b*{VaqItsrJO`LTZSNicj#i+8iApo2$SXIRo~N*h?%Lx)Vk% z&?Vej}CQ-StOJTI}C47^!A((bQgb3n1*1Y{Fdar*0GmCn{PU^e_?J2j= zxss>IY)CX#6cq^X8+PMd18cG}uM?_UKA@4{icLOcf-0Pq(44j=Xx58CCs%i2DGfMm zPMU{xdDz3xmaj30;lsyb&%bGa^m8ZhYaZ5+*18yb-AJMk+Q#~j84EwPuYvs2HDtlAd00`M zDQeVjz{i(;#OGh=!>vz@VC$qNU^T|q*|G(eWDENl8E)c>pMhAIoEHGA9y^h0tCxY% z(I%Gab%b93iUUaZfXTF}_|RpF;4zSn`evL&acW!?c*2UjVQhj(g9VtrbuAgt;YYeO zi6E=i3mIM=N>-fxjP?yViyQ6RiYNCt!m*gSSmO*wlpULf4_VNQC#l>;5uvxSrO(!3 
zPZzHu>SwwkPM;MPt#S+viHO2Vjw~hys7vILLo-QfO*p@FUzK1+m=7vTxP=YZ8_6yT zH9!NG#JFVq3ZgP<0hajI5nbuOqxyVJ2X|fPBJKh7v(h$=cD9R5MKS;L4@lJ@519%w zSv>EjoQOLd?uzb4PNl~P)O6kio3}%O8#B(3Ri{#g-MpiVt(-1UjaE2_WqTGxu2B*@flPp!&)L|#KK?DSZn*$G71vqm@u=mk-vZ5}>ZaEyYS6nqjQDl7&&xMd1z z$mT0qb=H%rbH_n@S0c9P)D6x|=Xjpp&D-!`c!GYkM;#WozKXp#TZ^<`?1)Jdq{O(! zT)~~s9W3t0>)dPHJ?!ZBy{P-xYD_byMPPY#6D;jIs2%=5h>7w}<9Fk-NZ$>QkmoRh zWBt*W+}iX64dg}>5$eOCc*=S*Y<3MEd&R8e2PSfpXh}#YjQW zfYvC5!j+Qycs5H*DOE5OZIAm7U$Of(Iv1P68dL8M~zZQHXk z$9Pd&n%Hh{M&9_4u5#a#dp8wKh`xU&&#r89;gqCTk9tp*ZP__(HO3t}y?kqlU7DJq z->LGeQ1JJq&IJxt`SoEvwV^jpR}~zIif%dk zJGyU+{th|1+Jx%Apr`Lhr_n2VDA}mFmu?;Nc*NjNmVwEhn{qHWIT$Gmsi2{H8)&GS zg6`@sP!|gssC*e{R%8wOQIZkO8l()$yJP?_#4^S90=u#i%R?SKro*_^#1uiz@qj#x zYx4$+V7Q!;iS(11)aRlB&Z@UBW_<)=`vJ?30FUY6ixeiK{&&B;eyg&`CI#mT&6D_2 z+f)UyW!yWy|MYZqL$A6d)c}W08j}*V3qsxZ%*eBh{~%c6S>aLPRll4H^QxM6Ea8_ARJcz~ zz}akzWMPC~JzWveJwF< zL&9^R#V{EW<%wj97nA2kkuuKherlWyh!yXZ24E#Qpm@*k#47OsfgMHzibpi?M0)ui zHNfE|yjPX+CyD4-#WF~DkpVTJBzv)un`4oD_@K0Yy{f)fQ|{7$jx8e`v#s)Cg3jld z26s2*#RgyADOoR^njaT>rEE@_S95-R*!2U_j_j!g2@yAsEbZ%VE=Y{Jdm@EB#Ck{4 z(jPlA^}UbpSQguNxx_KYx^Q{?)9W?C*N+z_CqBR1x_-#Cq7}-}V*NnwMm$LVWMz>NrvWEfCTED;_W8^;a` z4`TQNO%7UAg#+fohH-jZ_4W3%8A|l$&-ULkfN09gl~R_!{!ipIDgud!AdIBBP>&La zkRmQ00%!u{{~N1$#C${0LVuBbMdn0|sa!;yFFFhX`s*sK^c%s$7=l-$Ow*sHN&iHD zRDLHz;X#qXaf76<6cc7(dZiQf4o;v6&|e|`0r>CI7iU<2h!qh&2$~w|Vt=BT zx|**0)oo~w2o{*=>ocHOgUVm?Ge+-^)1femnE7REeGE@1!#DtlL`J2r$_4oFQ4|JFm=Ii4g!t6)5 z(eW)?LN{`Mc5)|l5VQ>XlMk7_uk2&}ye%RF9fKuO#$FCjr$dYHm%01gxy;;!X$Di5 zhdvlj7`w<$oyMCpn4gR1*YVKBqUBHVFr~DO4s`U2V>Tqtu8h81 zDe(*b9A|mG>0;CJrSuZXbY|!n&eR~<&tcLMf3qaICgzM1UF1Zj{&a|gheEH}$}Q>0 z>FnrzUx{tv;5o^IWvpl&iM*17M)D*F*X61EMmWUNIPyFhd)hy3${fw0CWy47r7=t6 vgW|>pL`RR6#>7M=Y9|E6#D+#iYMWY)GaY9lcGU#<{STT6`nN0kce#H9l9NZ9 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet deleted file mode 100644 index 3c53b83a00add2b8dc0dbafc83e88f7ca6fca968..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3442 zcmc&%cUV+c7C-luVTK~8^M)B55h)`oI0FHq29XECND&bWMNw%Y$jm$$5R3&I5tV3U zHDcFjEGR?;vq}-gVDA;(L{VdGQE_pNExPx;5tUW{*zenKb3VBDo_p%=+{-(cNH)rY zp%@M070!qlXa^u4v8$k7ufKG<(8hoEp4Z!DEA3An_aE}Kshj~u%E;XqbH)biij+R! zDsGX<#Vu9IF=z~=CU0~R*6(GD)FaW zDco>wzz~238$1DEK9cbd$2su1x>5)v2uO(;1(cQ8m&O`AiA)o3CS&a0Wh@uBNaSMl zMn{HGe7K8+7Onycf+P!qL3N6H3&Yha*^rd2OVMYIN>0d1uu8*=wB%7qhKy{TUI6wI z$(oR=np8Cs%KswfN8vVDGuN%dnJ1)x_;<6%_sHPc<00L9CX_a?&`>i5+}3u78|N+I z@q8D!Q{NSmx=GQ7jRu%EQVK3h>cMc$1wh>%934Sa=8mi2%lr5tswb_VL68pMV$@SjG(nKEaP zba8<8Qhzv|ZUYPAd|=$%Kp4=F4kL0qfn;%exEN6i>)Paiy?O^aUaCY+H6%4Ajbg zps8qv{17LIC|e7P5)W8IxWdW~6)@#>8Qg?Ea6!uu*JphJ^9M>n@q-rf@7_S^*==DP z)`c-1F%H9308;t!S!VXdXaYvx)*c=$5KyddNczzT+(&t7y|Q3ui813QpM zS}nCIUnCklwXbq-|A<;vIE=cW}AuvMkrAd%?r$60p z+C*^MRV`|J5uxTMBtXLVpy}I(w3}!NgI~@;x5D;VZ^sTXgw2i++=2~Axy0l-2 z@ejTt?v2T#b`FR{;#I?+_gq2B{T_QK8QT*<^D_jRT#s6_&x=1-F@I<~k zg-{d?P4_X=D5k`DfIq<$Eiw~J%q=Xf+F0APZP#AP$ZX{byAH~Z_MIF$J32YLsJggz z?bhARy@y)k;rW@Dx6kK2d-eXpw~t?6e>NbnUr?~Ne@JLp_<)Fkk%OWJ4;eZvIwm%5 z_=xzCqedqrCMBngNgbP(J}yJ2H*n)Kjak_fa&jkrIcf5gsd>|;&zLzY|Et*rbLP$~ z{CfU^Zx$B)ZPDUymlQ8uwtU6PRjb#m{chd*?>B7Rw7F!<*3z;cwr$^0Ua|AXzwg>z zSyf%Lr*`kYy8S=ZA2@jE@R6g(j-NPr>hzhj=gwca*l_9cKYsq@%2lqBk4JNs7L%_{ zMV$2d4L(>84wS#DaFA}xS6Fd2H-Ede@pb?QaEIoo$bD@&&0Xy59(MNo+56`&NH}<~ z3`-YdGk8dIu&I&*%O)H_!YLn_;QSLcLNnb6kLgC|f4xQ0k+Xb)5zw5{`j4jqfIfq4 z?o>Ui+0*eFJP*DGVw|F?uA`D6O#QEa>&79`rRn)zm|~SU@tLZf~Ez(3k1G>qnL2Nzda0*!oA`2K~d;mwS%Qmg(g=k)}Vi(7R0ZusFY zR(jLZ38AOob@)^>w0_$fGg9=aS*AXJ8Og37RYNCLVJF-UKXm_zpdZ7jDD(!o$e1)X zMVG)(_&MN0y!WbbQHF1A?~fk_;7!O|7n>28lv{UeUhmy8Fa#qC_vNiAxo{->%Y(oX zT8_rW4*PfD{R@Hr6D-Kg6-q_;|MN&yr~xg{GF zH88rj>0-&(P->pM*#MSh2jO3y!dNzk4M=5!SvHjoOcnM5u)+X-709w7{EYdQ8FRwY 
z$a~-&u>-&Cu2%QNi=c@9JRg|Dk3sx^U9)UBKLo_`tRQDKJU=ecGr}V=dRTUHaIbN( zF;QcAxh@jtH!&R#g1>Eqv6_p*$bT z4<_hB>ZW|g;k|&YglKO=Y)ph9E=GWk;|((53mwurA6zU<@8!dnGJs3g4$I;NY)Ck- zFDT`-#2*wQ)EgHYQ%mVtwjaOLX7DxO;(~jnCwk`Mnt1U_xX3wt{c(z>L&19if+fxz zqr>u8HXz4zPC~N071m*eN}7xWkfwbBdaN!u8^;lF^7hz2Hsz1TqzQ5dE+KP#c1o6e p634kGWM&$2944e>8q*AV2M?bf9z8UQa3#RoAE>4PBJqD){|0`F&W->8 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..84c399e67f3a1f307e88c5325c849ac627b14610 GIT binary patch literal 3138 zcmc&%d010d7C-mC_d)_h5qKeqQBWjqAfSRv5qTgW7-Vt5&>{wifNYTjQ0s!USVXK# zaU1Kxh*n$}rPxtK!37uUhFihbIy1Fe>rAy{zp-^%oqJyU*Ws5-wgSf(Jt~`^NJFA;GKDeH{m<+5?-PSgaJ|; zPbsxY1Axo}laWP;!P4d)Z2>w_av3jfWZ;IcA6k(V2!9KFgq3?DnJvd0 zLAJ&P8fseMR{0uuR$0LG@m++2MvmgLR`-;Ad#WD!XEn2%%OnzRKr-$hT+gMQPU1d1 z{s5wzl(6MVG!|M8}=z#kpXmBmUd&|Z_?{O2L>0&QtwKoS|?|cm1!~Y3? zPN@SX$dAeALqJ+s&K{akC3&*S40l>fVc=CYw{hE>cyoh4I^4gJsXhA)y>;#`6WrX# z&a4Qa6G-gwTrWG;9gY?Ac&453H=VBqbJNbnmw zhEdOTgtks!iJ3Xag+zO?doHq!-=ivZXZ0A_QqvOD$~4*yiv12PxPJpSRYfqp+@FFZ z?}6RozNcVsb^+eE;tkZe)=3`Jet_#SWeBuhF2?!sDJUkY&|drbO}O2=sK04a1Gc=k z*KX!ocP@NpBsLTo*t&Ol$BUk0?8X_R*lV)wcBY>?(e*4 zq}*_dIXZbeqp$b|P42}?gMZO;E1SMWKd(2jxh}mSGwdUj9lM{&I0?+UuZF<+d7JPw z{|?7b+zO;)lhjC`BZJG`YtaIiN+zTZVf(*oxt?e0@dDL7=W?Grvi5o-POI0U!vlXo z%Z@LVR!rWBi%w)qZgFayYVXOMFSEe)O}kKgRw@Ks)Uh}248pepPogWTQruRma;Q)T zarRrw?L1Z|cRPN2CUS6XM5Xd9xKjNWX060sUgoMsYb+ly^J{LRHuE~k-~$N|GUF>r zwPS-r&DR^iyLN;_OOOVJ*-u6HzW+PUec}Mc!E(j?Djsz{zK@?xUm}b7(vkW8vf!9-$}8NIDs@jyuiow+efoO#^V0SoFmTXd zZ=WH)e*Qy;4G#z$F>=)CH-g3lj|~w*!^VY2M2?S&o}i1F7#kO#Flq9Xsfp9{NpHTD zJUwNGAvG=CI5Q(NYgTqnZeG4=c7fSaSX4Y`?!33>FIc#!WO3;`@0KljuY766vdZP} zuUNTi^#@gJ)~;K>;ddK9+_ZVi*6QDHtJ%Keqn)+8cJJ9+_lJG^>pyNd@W~$!9%?+? 
zbmVAr%dz7pPPU#pedcW2r{~UJxOl1k@|COCu7B3idE@ghZhm=7`%tOjv9-RB?^-Bj zxmkI%)*H=iPL|nXXUs9uB1$S|3!~_8R@`>h9(2`Kihbn8WWKU2c4`2H;jV3o%Kaz*Jmw|3AgawO#k-&_ z6%!A(JMmHz`ySN3==j&cf`V0PRJ#9hFaM2t?EmG7)&5pblA_#A!D^+7J#MZ5Dvi=n zNJxlH9BqBDlv`O}N&!NsAPDi~OKE~26bqplLWCe>2w@rGStt>Rp{p=Kh@vyu+h(#8 zmu6~#8WLjX4fgdNNfzM~$J2dSF&)F{KuimQjt-%Ex)u3?AKhoC4xQ+qnmDyEJz`Y0 zep*5%<>$r`d~;`!L9`bp8X+JES?!5S%Il(UPD%{OOOMPD3{+oqPr@Ojx9td-kPC&Q@_NpdmGKY?hy6mk|SY7*om zI>Ofrp~cpF62Vd}A|r^Iv?_@~TF=GM^|=v+#E%#!)hG6eD!m%3Bq%*hhJx9JMvG6H p$>d`wD9A7NC^8n9v-0yi`~!#h5Ajp!Gyu;(&_sS$$B};z{{p%vL_q)m literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet deleted file mode 100644 index dde573d07cc0c2130eb43113befd5c673b2f0ef0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b@-s716qhMB#il$H&g_iLfI4wt+ZI$t`9)BD0`UT z#sV76FdAXe81XgdSb~6pI;QC6YcIAekuBTQIdutUQ?}^t{krl{`M1oI_TF>P`<#2v z$(P3$#BeMZ#D(ZN%t0`KMD90sxkkqzL%&{#sF z@L*^PiJvoy$w{~mQ;G7w;bI!@J!T}_2${)I?yWN0JtZci$0)NB#A+%jci6mDb|HZd zB=1G)n^=7et495dViS0LWLk9r+}i!zjku!|qrw7KkjnC{`ADvYtM;7Kn4k~^}wCISs*i{fo*yezO|wgHk|2UzFLwD^~YD@p)L{nV$mts zba)y1_?aPS52?ln)~CYs)EwwOKaJTGDT44fKY~eVf58tW2Z2e_G7fGbNWAszkzT+s>Epj`kSTHK&tl>%Wg){_F~q+%qF!*7|C^Z)!iYc6q!ga$gf; z(;Ar5;D*Gl<*@ErrYN{>HAHrFizk1B(9j$}PhG#ewzt)p85!DmdY7x;me_J(k0<*;@dpF@ ztIyt8(=&LeRv*3X{MGg~pWC}GtIthRaoE?End9HNw90CyEoyeNHmlncV78fQ50D2x zh~$@~@*kz@W|{go5pCMBxjgb!kDDU{w1Wve+^l|%LgL40>-h0eDp4NWtOLOlHS462 z*0q`K6&~Nb@wMye$?E3Gq+K7jK7r3iC8lhM1BKXKZZTIGIhKs2a6D$Zg{$=Yrz|A1 z6XLL9e04doPk8(^p>m^jCj(ZjPomnaLbwAu0WtC5zl-otVn2-97aTvBEXc7^rPTk& zqxUh5#{cD|svp-&l3$X^v05S@mW2Q)Rf<5qpkQ%fqVHiTT3HQ+I9|u|d_MV8$l>`K zURS}V^Lz!LQX$ZEL|_rcQg}X#?&R#>$w@%n)B-icFJAMkMw390wA@U3PN|`58eNEK zp4ZbwXP`%c&ui&9#?DCLdQ 
zcNZ7N*-aT0ypie)?nycX4E}S$(xvpAO&4G4Lg>DDRuP)cV=Rnw7>aTo#YIBu#9zKI zzw8&s$3u!mU`#x;lscy=W0{9C_$)n@7ldm3{`0ehdXr)k@Ry$FQz@EZqcw0Ar!S~1 zi>)O!nNNjCk!xuENr=8nAvB$!Ng^lN5x#-f)%fm72$pIQ8D6NQPf1A9M;B5zRHb`~ zA0bbwPwW#_dNn>tP=q;+u2;NfPgJ?n8D(_295rFpW|!OQu!qION5@2Km3kGx$Pc>7 KCr}>w$omJ>jBkek diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79a6f24b35cf797574a2db8d32609d38de32a0f9 GIT binary patch literal 5244 zcmc&&c~n!^*5Bt2$xR@TKyFMR%!8mPgMgwq1epazL=hAv3^|k%uTkEa&_Trv#@BQ0*pPNH|2~mVP z(*&JPtAc1f8VmqN+?wBe_9GwOz$1;jeli?I8EGnaxGl>J?%laV^I5}L&hX1)>xTCv zXuRrGH#R6z$X6SzH>Vdh%XJ>pK+9=!2U<)EX%5YiyMrC|eosoh*9HJRSBz=J2q*K7 zmg@o7uEpduA#($cP$uQYC{yERCn_@&X-p!P{E8$iG1~%UOZ}5SnFEsm&=@d{km&%J zc@d%$gP2Izj6w(mJOnt8O8_aLfWtgG%%jUZ0_M>ZP@Y1@j1^IT$NDm-t^deHO1>x4P`(+w@I!g>T+ZrJ4o(+JbAw1W^K>aKUB9Fa+t8e!M zCt)OfIb8-NKMG;d=L)neLl>JX8woW>oZ!NCKd?+00?#MCMVAi`h0Ss)vOjGN#8XG0 z(k8&TViDj@`F9%E=kD^Ou za%6Sp0EpN6gVmW!kY3>msozWBwBAg}pCgA)Umpjbf_R8NWC^WN9N5)Sj=J~=G%Y%C zQ)CFWH5NeZ@CGN%3&;(yf{BNVL3Y3yiZDCaVXT2A?+(L57!B9`Xsm11C|Ey946?8M zAot0AG&f5GM+j?3$>qUj2ND{N9Rr6Yz7X;&2P)L|V5{c9p#mi|-O-1}D4XCJ z7lUdjx~?Ou(|t z1%7WEA%hsq7G>r&@vUhKZ1II1JMymJ{mzZ{g)(S9EqrfevLk z0rpz#C8CXXfpwG(5wg1noeF4%juD%P1GC$8206Ck>jLb-`T9dhA)2u>TTY_Bfi0Lz z+7s@AwpM6ZaSL56eu`WMq+>hN;-K$TDbBMSLmnt>gTvLoB1;IuM%Fu_{Q&|rx%M=i zvCTkr%iFOPr*K%mU%F?m?f^=hkvgvJdd+iS(9)ieSzXoEut{9$0SAn0^FVN)j zfd9CAI4K4Jj97dQemM{Tf(Rc-HXVZfFzXboxp)r0Hf0Rt)y%#TN!PI*KoW5*=i`TzGHXk1YeVJ&KIhj>nec7H zO8ESIIr+uz8Q4yZGpZVR3Lp068+>NVBYey`3$9geHdgL;1?hh zMi$1xvTlFUVEICDIC+{Qx`EJ(pK$=?9&j2v8aG;~3sYXNL60WXA(a6S#hvpcD>J5I zq+Jv?aLiFMwlR|2d~_PJziLG8Z+eFg4rsz14sOJ&CitO69e%JiGY=mS)rl|Azk^be zzQN{yzZ!cwXBklu9D=Avo>;nmB^r>Dh83TfL%gP2$rHzdNo9Gmu;gHwc*4Bt=*#R| zn5nHLcW06vdfh6+6~h)22hyUjoR@y+%B$O=A67Q~joxI)OQ 
znu2K_4o1J6O(A5LZ$ZfDTnIgWm2AF$3biR;!t}HGc=aFws%f8&wd=hnsl;)gC6 zzxNWv_qC#5y5`s1yyPYOp? ztQYQ*j3euO<2c4{dFZ;&R5I-ge_2@fHasB_(A`g?^~M{l$FR>*upzfF@KyQijvD62 zdvL0RA7NO?pDmlkZyM!KlioAU9@YrSP4)G{%uESCMdu{mt@fe^ovjrLhj~-?(AS(u zn+e#9WD<0~>Lv58B;t8Nk=%HnwOj4dkne5vG@}C3{S<-@E>~!R0;w|L3KpW!FWdeWYL5ezyNAuRCuSQBQE^pAJ z@6nQ{H(^JXS0=oULZ0&m2EYnXM#-RuzbEziZ_1-rV3_g`!cPkn<*$ z;|cYEzgkiU=N|c?onI8_A0LoG9ju7LBa(U0uD>5GF;wH@4kL{Joh@m6;V3q#?VuRK z>-cUxVovPQAXH`l27Ow)hFTl856iBb$GJlqVKICw{8Fw8uJ1U2+LB^m&zlh4o8Jz? z?zq>ZE3%c?#g#IhwML#KzbKDuzBx*-_Q4#aBR+*z3XqCdXQ5p3zD z1p7~wkxCtN7pBACevQy^5_d1+j}^1LjuwD`PH zvgY1-1uYYCahYV>cz1w&KInAx45*p-ftQ9wYSwJ*Q;T%m)ttyc3*P8rMiDQ5yjtxZ7G$>ehv20K*OjgPs3BTZo;v8WRt9o+D+76~ z)%80T@>v{G+nv-RzG!8#w~$&_?`~uO4KJH5M{4Vz`te*YTe?V*Oil4d7gHEp0uw&uRvc%#w?nX&_3bT#o?%#jhXkJ<6&qm4RP431i6`k3} zsg>QwjnmHb{cN0G!y7K2-yqA8XEfQLkY`@o;bm7Co9Mg}w_n$Cb^sMrw>Eh>2!_AiTcIKEZ?dv{aw(L#cOXjuVk1$_B8Z9ti zX`=m1B?Zdgf1s3sZ)1q|7{YeDhn&XxiJbq3G>4axl**F2_)H=>DO1IbPmX6P5W{z> zMUv7HlDp2fN9=4*b6J-6=WFI~NzHOZ&kAC$C6?WdqpEa)L_|DBGW@PXKoLsB6GALQ z>|z*VH~TlX_L#k~7-2594@jz%HP?!idauJ0yT7h7O1}|Tj3KVB*Yz1%_uBKHXr2D= zBriEWH9^%+W;K{(D`wj@%J$ol46*ww#6JN4T@7SvH;{5NV-w?3lr+ImUK{LB)Y(QJ zbjWZVQyp-_q#qsx&FNSEkax0re;g5w(VQSp)|!xNkBep37=!Ts*ldg#|K_ZJE%3i5 z3lgzRE(`ik^ZNf0i~g^?)&H|sNpfJUL~Kh*wvP}4$mCL8MQG@x@R9v*mb8d)WW`P4 zrBEm)GY=~h6p9>$SAxPqs%_qrr}j zBbb5DM1OWZK8J06*oJYfPz14!R|GrN?kk+w`MemHiOwd z7S3fFt-tYFCrl8G^!w8e*}V_!Gb6*@Qsev*6iU`!3(us(NW}Yj=+vp~JdkbrQ`d&x zAJ05y%}b>WcT0nTPy__A`dX!& z_xO_owDo3+&B*)GD-_=B&@Y*-L3*UG$J`i~1x!s`StX{(Ic)uz5c>~>S@Y6bGLbXc zG5Zk;ublpK(gw>~F**uuCHsxEN%pU6Q;$gT&0=u0d9wD5f5wzOntn}?nx`u>=4ZvL thQy|)4^d`hq~(}r$7f_FrKOrXyAO39>Ld%21N{C6X9(suIp%lA{{vSbUhDt> literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet deleted file mode 100644 index 
cd2748f7d0eb609ee249493c7051b096c5ef985f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3413 zcmc&%cUV-%7C&>#vLFb^-gOsOMaqH-vJfGPh+Gg>U`512QN)!+1lb}Bf+d0)5>%p* zrx6=yFm{Oo<|#!5gS|v-#FyABK6{DLL^JoUs66$L_kHhsIUmf-nK|Wm&Rp*A62(S) zF%+Xw!!k6~@3}b)XQKBOuMq0)fV-oCr)4=h6nPMA&*F5u(>NVqC%d zyZG>R1q4A}1^GVeO$BaN8huihHd&XMs8MICty1wKRg;*cPtVfo_?lXf_&SnTk@Aa# z?7v9)!AcKUGfmbi<{`-!d)4gm6*Bb8Ns#6<8@4pF&|Ehj+*kL6E2k~t@q8D!)z}@9 zdWg{4b$XaLMg%TP8bN=_1;En|9<(Q*@J=ekSN;j7UTy$EiXW_tmBP38ZD86m7UgDG z5@}LDsNbr911pr^sP%vcLtdie<-MRtCPCe6oq@XT2||4Y1m;oDRPGGst`1-)3WO7B zHn1Sx7beaPhQZBgFgm9zm@l@2vyoe%tbI0g@vJ~cx5$uF-9`}29RyBwN5QbYH|Xw) zp|<^KSTtS+^Zxh&LW+}N>}E$emLP!A=56S*1%j$0?ckiP9W>N;18U_!P*gpF{4gho zEME=Mjb5;daD$bds^GI%iWt%pW2G>Gw*=Z@GffvTR{1G2sOJLMUf3Pvg#y@ItcHUZtYQC28cvodQ22(`;A}h$FQ$cp(@jq}HLeIQ zywpKXW(*YPm%*#a`(TN3J(PwAz~yBGdXakrdKPpB$1UD)|G_L+d)5`4bsf;-RZo$< z<{HX$LtxN3LFj@=$S5C9R881M6>fM5zm@k!yF7~92^}L~ZMU`5So>0NPH>^3${wTY zu*1;YXCbvQv8i1T&tv3QVco&&%r($bhl#qyJJ9d0M+k3yi&@TzW6(JB7j$U#ZRFk2 zK$Pe;@O$+d(%db8-njGxl<#?t96?3+{ir}2!mQBHhFYkTXP~`#O~lM<66WWOBASMU zz#ZR5gk-alEUmW$-^v(NwWbj+6fA~Ug&AZw&s@U4PhUE&Uj_0?-A$F`3k1Vv z46rCRbh1eAa-Lq%a1TTth%S8|PFVIP$SkG?lupm!p!Zdao3}otMPLPertgMl8^gdV zE(FrMco6r-RKuJ@yUEkT0-&INJn^W8MkfdF2Y-JTYJ1m5P$im7Y{?%8T?R%#?eWf} zX#NZ2GP@G2Lten8#Bw106{MaC1YuSIwR7wu!SjVixZXGmmhIa{FE1NKlvH`48rN#_ zlXZWRqmSGm19o>abIPAgZ1X&8HY358-eY!`PFKW{2fzq^OOqh?)*!k=-V|`(Q7f=L zi_oK|B!K!mPz>l#+E1~BVSmg)HzM|;%ns&Av)iBEnK6Q(-4Y1bfC@Tke>`1OF%oq@ zWlwK7_zG?6c#w48^fg%%p+wV~mGF(Rfb5v?Gnr$30clfzB_`edg19|Cm)br!3JGuc z69(&@sH0X-;5S!;8B;89XY$P6zH0f$KtgeU6j&~i4kA2}FMmQPfQI{bu##s+2`2%5 z>r%A9OlWS=&eE#AwT*2DI}s!9D3RKCl6CIV)uEfCle3H5)vbGvp6(vKJQZHvpY-kX$XhcKyb>^+5)Jo3w$7s<$<$ZsBMxINI%#|HjeoaMA7}gY2FO{C#AA2ebhm z;=qqEZrh6>w>4Nk#%OW8Cr^2-oiEzltof~OS7!q}!wF@rkmLoOWej1WxIt|R!d#oG zdCC zVrhOozfXGDju(lr)s`c@_jj?;Thdzt@~sXZBR%Uk6quf@OUX1*`8gy$kK`p?A{k$qC7eAVS`wf9g2TBiD21mHYkM+W!V%qIECK} z!Ulu6RWQqjaWmFiXRPr{BNu@S#15I-)6=sLUW7yr;`HEbZVcfD9GYcS+z=GUsr)&s 
z;Pi>(yd%BF#f;3-g!Y{n7aKi+JJ&{Gd{ff!z{d;bgJ2L?O!4?7ch<(=7$4)K(i1&gr)iFN$xY$U2d@Nr&&NtLlU-BBXme$s@>_Bd*Oy_!Fh!5?XHqJW-_oO#x zgqxhr^&dktT?*a{;yp2PTpd1-V}r6y_r!zcys!<+chcm<7irq(OOMlrX5l=1ot!_8 zk6pQ|FfG(=;Iv7oW*J7_uT{nyN{fBXJ@&YJy=lgQbGNASFO zevUjY55@o(H#@%FRrYIjmd%K#?Xi>L&tew-bhaqo+Du9a#WgNb6yAD~oX4)=XLV1rqWIfjO|` z1$>T=cbMPGK<}tP9;Pg-dLjA*{SNeHJd&>mTgCM_^zTqJ;pOm0(F`z(^6E$zrhWaS*s4`2d$cRsbttCLDAXK*>{iShr7v5+k`-uwW*%)*8dv zJTYj78p5+hAJN5{$&fDOqlwMhKs+)6D%=4qHW1KXqYa!18lb3T0e!*pu+x1itWL9n z1%tuhu~r>8nTl|AVKtP>#(|nqJ-SdWL|QGCpp<3{S}hkKqR|Y(eo=;ISr5oqC4{Z- zkAii!KX@I|gkDb;6b{y*VJQR+=cVDgf+DoFjt3%tJ{UKQK(d_{EUYO8L8S>4VS121 zt^wjd)xcet0|R0nHvH`@*s(|n1c$|t{P0H<9HRiWgf@gGb0FP-gpMOez#!fRoZhma zaoa@D-Nu4L+1}87OAfj&k#MQN7}-}8gLc$8c(=|5wC)?hWuJ7o^)U?AM!P_Eaw&X@ z?Sw3GITYH@h2eb|dY5<)CZ(!^X0<6ieU=0Vu1)~$urcUG(XU9w?=FhgLl6p&jWP_2S#@ zCW6VpT?i%4VJ(^8q3AIjDUOYAlwh$oY;?Z^yZS4Iek!b z`YqA~M{MR9V^m=$gBG_nLyK-CYTwY0Z9IX)bt7Z+cC(sK&GvSco#wywfPnn-kM zK^LUgFGG#$Bha7}i&ZDj2etVQ(0ox9*V%Lg$l3?Ox6*9MUIabhMf|@3rz$>~xA&>L-@VTePv$c|Av8u7%c!K9t(x7S=9%Af< zpM@xxnd1wI4{XUX8@>jE@0wW(R}mWd6^C_$VlbXJ2Uq!;3p>^>$BsuX#gq&Y+2kNb z#m80=?|HrCu_Fse@468AlB#MYht;m=VC)S{O;?j$5Tu9R_X=>)l=VbqxF;6(N{lYO z8&Y_Z)Wu2Z^Wq#vk7Y8(b+GLdhpdDt|v3_)A}-~wci!%m-AJWvxN$picFGg5 z&D#L_$yF@JXb&o4B!T1faEA05%|uyp6BlH00`IOGV83($(#vut{M-=E?%Q<*nx=JQ zExY^h+PNBXI@_}d)5X{Ek_onW@B8oMXEyGTFI2W5+im<<0v21_n0Kw%l1$#GE@PN~C^3|6;l8IA9|oBuF2ANv2*3#8VyJ z**zP}fjh60@S8st8xrfGhJ7}2hKn~6)W`r%T~pz(hmYV_t9vo!x^POhybOfTWx&1U zJl=E7h!usOCoSxCh(i|%{M`&v;ac$=&X$N2eT}5T(#g=#ABTCJxlS#xi;>d({wCZ}i=7a$qydZGT+Lp! 
zSDmz8CC0pCy@|<9*-As7x>%eKS20`I1o^bH*sSV=h$|$eT;Gt`;9$^o=9>RhlE`F}9j*u;Q2I@;J zv1^{X#5YO{SfxtVD8M;^q^}yCvb?v_2D~EfA2%g}DLtQHe#B@gWou~B9scoO2z^7s z3o~Z5h)Zx10i*a9MH@Ea)EGV2(?uIMvE?;wvn`5~lBM~kfrF=uH*J>FuuSEPO1|Bq zsOMbn+FFvlb*!<^*=$j1%Jy;7BkvEkmTumuZjnR@Ecd7G*0N34b8FkbWzPhs65AZh zvb4-euJwV}+RCNMVJv%HRTL+WBJ@lFlvbh9n#TkQ8vt~i}xcS(0){_J>+XMb7Gxy&)&GSmKw z{<3P1mL#6TfPs5s$j!D)hpUrUIDPZxEoqQl>wP9oFe}9%r{4ekz8ULShWi?VE?2rW zr5NTO4gJ4I&mD~}lVA{oOCk?3k=qi0x>f1MNq8E6yKT!jaZTRwsQdjD%e!jwo8q1h zcjt-^7qqN<{rtiAU55)#B?2xbw5=^_-9#zsdz`H;Zr?1gWuIqTSJJVSKPC9a*}BrM zof=kYLc99?-Fx(wRC;vRmz~Qp_U*~DJ96Mc?)0dKH@c4;yjWpdX`Qc-z~9qDN2>soAly|+2di@TR7fD2?NI`V1&P{qX~LYw{dU#M zkDDffc9q2Mi=L4xGLa%38m1-!m8R$*_<$0%J8kmCM+;i&SB=OdY6sBCr|u2DzA0EW zk??RpiDZIscQu`G-BH!a3BYwYX6%bTP*oR>(Hn|27Yo;HlFAe$Jlz;wv*bj~Hp4-T(-uyXMP!_eXg5rD&%3KjKKEG8#E_0D2!nT{!v6oP*gPAKg6H8 zUDH?TBChhFq4M5Ix&u18NoW-oFOjnU z$dmkEdz6&#Oi`BArIa_AC;Mt*F0vGfM2qP!BMu@_oX9dj zWFryM0LCdH= z$&lIm!amB~#XQVU93b*$>?Pr8IJCHZo;xpH%FOMVW)!+4_0fD*(`%N|-Y(|hZmtW% z-CZTn>3EAr@!5a%Wj=JVX!%n{-L)Fy_GBf-ho)Bb5w=4eJWfv*wa9l0jPKibeYBErx+GBP|)Bi27M dDkwZm!({4YlgY*cM0k%vy)U9N>K}8?oasHu+wOkvJMWjo z@^QW#!?|$oT0DUxfJ2bBbIn&zYd3^~Ef8W^Z>Pc8a3i zL0QAR4BpdDZ%oO}PBmC&>U9>KT?Src=x3%FvvRWyLOQpQ^fpqK(aH;y;=geD)+7t< zxfS+W?l3LnS=j4I0UiG7T*&la3bpM#w71L#@2$h(`|GywWR(XzX!C-UVG>le!w4&9 zNWf!58yLUy08lx>V|xlp9%ew|zJI~Bu4)ja1;UQ0a`^I*11#Lcqk%7-3TaE$dFgFJ1{?~Kx~YLv4sp=sdERLp)TMg84H&)9bk200L)nt z3gg=|VS4^xuvzZ}H)CpHn|&S(Qq`mLwF>0cQUj6|5#ZKx9!&d3g5mE{XttjY>t-up z<*(mBSXnB}+U*J#l0;C^z8Br?jiBLNAGp=e3635a0?d|ipl;}ZqDVK0soM(j8egcS zJYma#23YjG4(`DixS`>wyUX5(RTCs2|5^h@KYougbNj&_#vQVYY@pPeg|>bBz>3SKL#mir&K2OE-H5xD&a8mI^$sM%9saXyVakXi?^%V})0!#f>!FQmN6yfDTHwTSHeIv4w{1ZE&Y} zJv=YTp@*mnsGw1!*^JTwq90GxBj1cxro2cbVm*_np~3ZR%E?4_^U{3cu7uf?YxwWDWA6exA_?%g?mZ*Cz!-@sZh7 zM-z*#jz0;d^;6LP!5z>bnMc(Yje|ksqM`ZXK-y#JKIj|v48EIL2Xv5{HgaP@oLkHs zoV8BWxz-GSZCe6f*PPiewoRkT8+=jI&_=rY^m8=*+H%a(3uO$su$cZvx%9;o-E|3}f$XD)cwNad;>mz0nqft~ z#5TSA*xK3mb?E2lB;lm}Wpd{Mih+X$y9{x4bN5gV_4FDx+}me_O6}|SyOI6@?~NKA z_2x 
zE8Ac+&CM}ea_8mc&tK4I;YW);E+|~QWa+Y^Pd+VPzG7v`XRB7PSzG$~I_3J`Zx~j# zanl!@x0F{@ZvArG_OGgT?5wWYwY#?N>pgqx_ciSQ=Gy}Y8=IPATMo4zK63Qf@wO8u zPn|w<_T2dk7cX7Da`oEv8#mkkaO;m&e+x_?zO7s@H^HA+6a2Z+1d2N^1eAy53XZbE zKt#`%)FwN_Kp-L2%w%PlExl5+QpqcepVUfLdQ>8FNCB|SA;$1xZVJ*d}E)kC4(?KBnxBluIYo>k^VR^gtw zMdV)9pQAXDHi&35GNBg)c`59nyC)vb*uD#~FF5|sWI-yDE9Bb$G35V7ApC#1vA@|( zk}i`=MM|->v%dsDu8`UCad8vk1FaWJR>`Oc`ST$>&rif(LeV^*$A_fx;XI$lho%W< zAy{BMS%va^B$@HPd&WCqX(kqkA%4Pw;VRWAyao91~f8A1}sR^an$ zvY(Ue7vq~8KP6WmK6*~V)VOrQ&yL0M&CkSxU@ufKf`h0nCe-p5UzO!iS^u%<4AZjEOSJ`1tv;{A;YQ)7&YQ-##AzlqlTqCV^y4=xs_ z*Ww|i3^D07Q!IqRM{0?@Ae7hbe`2IiZ(M9l-KFRGab&5Ay}e?Wq6^IRwW@x>$#A6LUwpA_9Ns;^s#-c zN?K!;1euFTmoqmv)#8(4GWqCoa*TN{^HOun8AgMPZ@>uO5o)P%$Vdir5fE zrHC{^nu%hgs$d0IBUaX+f^MSG>{_C6-y4F7o8*uEzWp}$o4NO%dr$kFdp_oehuqVW zAsIWy!JVNQ7z|*za(T^**yrnePfWR&bgVbyQOb~p1?LXQf}d`xo03}~*;L8^qs>Un z7$Kv{@ED$+4VcgaIud%o3IK#7F*hSj^nk1r=`o~qJn%H75z>n14`(;w z!Tv!pkqJ@JDqwGs_!*L2AY{+5-hUDE58*b5nRc-|bA?hN&J+E(pK{8a0dbYHaa~w7 znNj*PysR9DPL0_mqK8j|gToGz3w8Fx$oWR3N8yjC-t|1(n3zSD`FDy;%`Xv~Tt|WB z^#>3|p2sJ597V5^$PrUqGtpCIx%=-lcMzo`UumMD2g$ zhSkPlB7r#$&Ldtyn^ljnw{HTa4eBs){r=5r^QXOEZ!!$T?S)b6}{1Y--TMHVFKf&MqD}k`JAYvFh zpcB%_V+*(Q-ef7^VQUKPK2uHYDV&S<)mftB#`T1C@jGJPg-68H)5G{9QWJ5t`8WLJ z`8HGo|9dLZ!k0J)N|>Az1grX7sKKk3f!Wa$yg}a}^x|J^vjr4^#b;9pon>m^^)?;t zRHhL_=64ZB*{$$sS~FUzaSz207NDr`J9x?W>u{!@4G4Bno%$$Dh8Lfy+`Fk+6c4qErhRDj_@sWiTb*BIu%eIsaa57 zp*=0a2NfsY#zSSp`To)c!nc>Cgxq>1Sr#)NC%tq=SAV)YsKD5TX#3QKp&aHJY}ak& zyCrL=?S9{l2ASj_OYJP4(EAA;b(a>byndW6^BRU8G-^_Fb47xl(8Ws4~T_e0- zY)8e4s&H0Ii>CJ$1r;eN03)-d{6s1ds^9O#b5}I-*J)5Z-SKHCsd^c@lr~OX^N}ee zP4eU|x$lI0_U%CaU+#g;T?U|3iiHLq!)WuYOrfaxI9}}c1L}60K##oh2!#pFgcoXT z_;a?b(ToxuCF&lGMz2mqk4UpBF8suP z@MfIn73OWzJI) z7brV7Bl6H?lCXHOU8gI5iXc699cm@&`KB|!M_)X-346CY5&8x{126h1f6eeSP!SP_ zt1>@D^*c4yZ99%ogFYV%t({3YW|lvi?wTN!-MRxkLlcc-z3OoM`U?J{od%TSB4-?s z7(iBj$*5f(F@^6C;!0i<6!BwUccJUYR%5;F25psB z5fW+FqZIXUxSD^T*vT_cPu12zJL0zyD-Ya3?aFN4=p&wBxA0qDzFM8=z=IqxE1e)} 
zu+;-=;T+WW!*d+Ge2YQzs`W08LM9%SI9z+LlYh0I&N8PiSw`ddkIrI(OgBe7#IWhiAA;^!*}c~bCfBP(>%q+k_b zT4I;CdJRDjHt}i5TeFt0IoxHJU4BZcNMaRsqalCodJRKGnnYgkWxAG$XNgZ^LF&dK z7J=t>$qUzQ(w!LBd!w;%eWrn33N2L}NXr`Gvem@5=|K86V~+xt97WNF9ix3}!n&G@ zHfEbIY)X^bAI!+H4(lxOZ9ceZ&jjdhk=o^u6Xsl@7CGF`%kX|La66bmDjkI)-s)Ut}4HI zz2*qFT$k#C){PSD#kbE@7oN>DwBMlTT63Unn~7K1ytbO6^Vt?b?YXYC2QTiKsC<08 zt+x2`e!I0oeYd)jjzX7hrt{nDO0O1s6uRwoJ5qM7+^2T&o%SQ;-yB}pyg`53SBJW4 z0Y86*)HK+qQLI2+( z878hWkk#wUE~)6(Z(HgQ$%7p;@dLl3LoiSXZm>Kl5OFOC@U(y7c+yAAPD0;MH7S6E z9?;=<((g}M2fwgAtYUkWqyCI0X?!3}ks;Bc@%;?&F%mySvRo6{Ib)V6{oehb1pOnd z(v0yS;VFZ{LZSi~k|l~p_&0phNYDL{mK{qOVLX<6TyTchulzBOSzuNpeM~+(|AEmD`7^&J?V|bZ%!a zm%DN+D-W!!q}nPu51b?U%;lra%_p)O#~Ci1Y@fuSq-mHFnZnHnmhb8q5I2I9|hRn?m9tT)RmOqDU#XCl8*Ik1Duc6%$8uJ_3CTY}>P)rtWk^cu3wR8q z@h2U?^C2>1jf%wGQNwN_Dx@BbLh7-VldyZP5`ZURV;Xid`!wuuU!%RE#%i*;%8Uk= z!PiU%W@A}}-BM$^lt?KiyYE}Vm7OTI&=KfH-zgB-ZoZiM@7TNqh*GPixh z1=!W`01X5kPW&PoZOigS`A6E}pt>3zU3?KQX=7kz-Bf(BFb#gwbYjKM3})kDAJFV6 zMy*?p!Ij2!@VKd(35{BWlj0LN^V)ur8^>EvjQJ4T+#rz@E*$5*-qFw768r=A!I9e# z7>T%z4<=xrQJAUW4?xDCdvGfDC+}<5M{|MT3-JpM!M*KS;HysqOK>ES zU6>4whiBr>104D)=L94rg|T}FbV6(3T>NQ6Dg>uyL;IKgnZT6~QP|Qw;FtCgek|Js zOj0ys=f{Jzrjgx0<88@3H3l6R(#DMZ_-|&~*{e+Qp-`_u z4Ri7Cs7qc8OEp}F*DcNxt!GYx6B0L6z@qD!T)^Ua5V5aa()$uZoqv+6X7~uA$BkwB z&hvr7zgD1koQ+IS>9}iTuK*7?(B61gnc$1EKbRHoY_xmz3?qFDH0wBz=oWx5i^X}#2`$5n$I?A%(; z0d@Ty{_}{F`N$FdI`y&OjfJBZElwPxae!h;c0EB}$~%hQB$y2Zb6LgmE|}_j0V*E5 zIVQ9FW0hIWHp(|9Cu=b~U0x=Oi9TfVYC9PDdSCgkKI(g~shdPD>sqiEKFx0AXrgpP zyq zXF+Kl6Azv<3F}MnLnyxJ`1xc(&MH+(-GBUh@``x=f4Q;hS9O!*Cwt3TwN&2PI}kvr zQuqi31rv)C-4{!-iH)KXCujvh$R}Sa*@93jXe)(uL8uf`D#bM|G035#6hX+MJ(+uY zG82bRia>#ciSvg?Ma7drT5cwtr_|CmjWz^Y5OlQB>giO}3(<5wyF4~Gro4D^jWIo8 zwth-M71dkwNPP2Vkw(Nz5kW{0MBI4dkm|baca{{#*^C*Lf`RTA-IH{P=soj-Nt5V& z0&U!>i=n&YnN8NTE<>3)Jww$h_S}v!$Xrvk(`>he#ArsujEGk1Q~*yu(9Qznk$|AS5P$f(rqWT|uQ_Nl?~+2r5f!l}$j_1_Wz0EEbB2xD*7d zsGuk=c$MO<1*{70A_^*kMas2Ww_00q7xm5wm$rUw|M;Hgd%io#nVEOyeShyVlPA9< 
zP!Z%Hrp1BcnJTf47={8^ZgM&lWX{bGo_gdJ#2!z}nfWUJ;ZCpEnCX+;UitY5M9Oh* z&TsJPSsf|{u|#ZSE!HD1j+kTO43@mXJt=Qc8vq2BnDlyxqM%Y=eiNC9X_*sn^kh2PFhRvp4%?4vTLPQ`HTgJj$4pKpTmbs4xtwoE!Z*T9(Pe&E1bybMrSMTBgbK> zcymfLyr|z!@hzt?yEnAKzN0VEC{SUSc~fD;R6l6EFq|?h zdWFoFeFeJHUctZS?gPrzo=OqB18-pg{mr~$&darF@Z*VG`10fdX5+RG@a8%P)L>Ch zjoe8BE1S2M?%Im-Q+Nwg26PD2{FY>0xayWY&u>`%b@P$NhG zDnflv$z_YE0{i#fD5Fn=5Zu2UnU|h`+kVX`eJCGAA8}=_+1aDj;m*J>F#yJL4z`$5 z%S4?DXM7u_u&#DCT4&9`*vm%Dt}}1Zo?&Mw>pg3zPdmNf8}%&Q;OB)ZzMfC_i(8qm zzw&1y4kYST?x`{KO9(|fGrI8z`6%w@cuUmZDx(zRm(aUY=HSd{Ug*-R9{q>8r}+78 z^Z5Tj4@AYrC%C>EUE!B+dy&572IOG4mZKALpO@6b<1guG;O!0`i5kz@;}13o_-fm+ zoI2ea{9TVZUOLrKKUJ_Fum9*SFJz69Nt9NCx%DD$29p5?-mb?Ve0q$Vugh@ip1hzl z4}5}J3nqy4Zd*a-B#_6DqPl8l(Im zUhEw!xPSYYPV>-Ha3pXOw;&>NMD=?XF#hHP#$fzfRKaz`#>T5DZvI2?-joeid3!jj z^bdKdeHwn?y~%>(qq2rD%Y-n5qNy``I=D|lfSIRsfuHJ9xNU2$K*Pi{`0&~`YX1~7 zUGr6?wBw9xRHcPC)!KhZ&*fl|-WG{F)9e||F?K3I9n)qpDI2|IGw+sCv2lQUSI!&a zF|-KdFOslLHwMpiZ{d66z4$llD|P;bE^@-n{cv9*1A>1%VG1tAQ3a}S?%AAb5Wats zj-EOU_jp;Nx-UI-ZD!=qyuJ>K*JZ@#-uQ~TTGNUp2U2*$tE<4|kqE*oTd1>LcAPY- zg>m;ar}tiu~z#HQf2@O_^!)y>LWE z1ikMwvGBrZH}2FJANmTvikteP3w3;xjZI3A8aiKI1s}KXH}f?gkE9z;Qs2(3qQX|( zK{JQbyeY54m^F>}P=8q}on$Zq;yiYugy{#U*yBKz{$LB|7nkEuhfd)?hpBn*hZ~`= zL_W0Z?m|lq3Muz}2d>mbp<>IrsGQkQXS4L-R0oQEZoFPp& z=Ock(J<1iu!=;KF)OwDoD9_Lcm85?`E!ln@olPs{j5`s=EC zM_o-|yv}TN_t!@_>7@WNr-&p=ip8kw*?s)>qkO)9kC6KHR4N+YyAhcND{ zpR>(y8j20lFy!m6MNfQM$6A~{3;UFKeqnOGUfJywWGb)S%x)jU;c$fV5Rp4Wz~w3aiO(w zT}@RhOYP=0705hx6mA$F*IpggykphIiO|b>=WB!o&F)2f4ehbhQv=eXa^k5wOsp}< zu~@y?_*3;Ar3T;^qPCP^iZu4m3EsIa1P_c1s`WK`o@)qHCDHlJQ3JH_OGXpW5BaQ& z`jO_BrFp{P%3|-fQJ$+z+;R#z<#n0URZ#P6!@M~9cZ#u8PhedLM4;U4Q%U5@k5Z*p5E@xL}|q!i>vTy{0z%CZ#tmn*N>Z}FI|`-E8KmZ2<0V`6=Sg#6NHs;vfub<3~p zMXz0ZomY@Zjivi;Oj1kEmp8~XzvSrt&{$|RQT@}MUq_!_GsuY7_51 zGH)q!jBVc=-zRU~LMbJ-GkOuJ>pz*T6RM zx&4jfZ_=ozX_P#FoQW6@5K8ZN$rnEi)(y6JwHN(+7BTA(r$b`5_zmMQg$^|BU}P zUjHg>S)mh1IcZUGF-Z|(n$T@?{0G)+l;0)5tlsaZBF18l z%9XXIrP>o>nLQ>R22aY|h~S&C_*%#RGg*-6$xLLb|Kpz&|0Op6|8kH2Yd1+!Y%kH1 
z^CUlyH3X2ENQH`^py?ql0~brWm9}G>lR~LbC}xm{ZGH+xra~F3@Kh*b6&|tLH6>B- zXJ;M?g%3Ltd2l3>cACZ_ut19Gi^tj7O(K(N0p6_ak;(Sc*p5Ie6e_k;hOwe{Ut!P6 z36YKg4v`_V7e;$dP6!JPievYa0!e(IEFhg0&qE7Bf*@@GPdjDz-my;$4{=J4_KHC3fD0FP literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet deleted file mode 100644 index ce4390a418031143882c26e15a5cf27824bf0a3c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2745 zcmc&$drVVT82`?>4+?@H^wwLfib@MQ9t8!_RrHpZKt&!R7G+py1+=u+wuq?nITu{q zreNYjTxGdSGbI(Mx`Pg*l+**{We=J$HFYR|8-*>*} zIls18DoqhsAwp1F1S~)#fJ&Eks8k>Hd|9>tLZqNc5O^Zy1zwj9@v4AK?41EXa117# zM~DYQJJJcPHVya)tr%QmbJ?rwZQgnTYk18%q}hZF83xspENK%+C?H@UEP@OGWNj3Z zEW|wxO=KAeMkbaBC~^LQYO4kDL7bv0AQ4r}eH{&n{n1?_42e}@$C!x)LmS&0e0-SQ zUE9>?aQUk3HlHo5flM0g)wOPCQ=>~s=OWTvLWWkx@E+Fv1DB6Ya;Hkzud)a)_*?J_ zdm{?xzFY*Jj%oN{{XTZ>uDkGi#~Acp;--G8;f1hc*ba7f^j3(i(zC_eZlKQmPvJtw zdUkj9*?yyvPcbj%kA#$S-$En%DSmh3A#^M5Bu;f-3tD*Q6!fgPgg$xW2b4Nc##`KW zxYfCx369U?cE5H8I^MsHhJyu<|GjrJd0g+K^K{HU) zMi0^r9x^m$*zggtdP7|N$Wc!vBqk*rQ&LBdNlPD_F>d_RrU{u7CrRe4?3~=Z$@v9^ zmMKM3i%Uw&rj<{xm|?A)IjicK>e;rMTDxOTU46scdCo?c8)QLN%@6f(GVAqa%Ajsq zKx{V)a>T-xMPnD6Wq>7|49^O3!ct;j88NWDc7-EMhLvqZrcWYl6;a6OltK3#HM-K} zHe8O>3D^%Wll#8V;Kl|Qg&K#KbvAf?K@O*bzFzWF+d0j({+e^4hD-eotqP1iT(QT# z#~2~jLa~T|PK3!p5rWW#`y=T>A6yhw0g0#z=u*6Sbe9;pkChlPyy2$sB)AFh1s|uw zRp;~L=f_BM8X2yt4Lj8Ya2~b(1fP#fP`fq_MBZCl?`X6MEP+`)eh97gI?LXuNhEhj z!(!hm3%p;sFCJ382lXdEB)r8$PqU)Z3ySENcnI`FA5HAXQ2UDGzb6YCUaQku{^g&N z{|VgxlN#vBY*ZnNpf$mL#c!d?^)5QnO^P zlX4}gPRgoN_RK_K3Y}$1Qa&BYIxv!zGWAjm)Q~iF(NoFE8Dx@EG?}imnkmnrjF^@r z3uR_2T`F?PNZ0dfQj1b*DyBEtbH~lI&M2*?^2TBkU&~y=6nj~U5fTI;zddD2WqrKJQc99hYtaGpexQr| K`zt1YTK@tgIEX(0 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909 GIT binary patch literal 4341 zcmc&&X;f3!7T)I$$xQ%>l3)TT$Pkek1XMsuf-RpYYQp@b*jap*jBCfX$9ro8;Y&3?T@$CTd#X1XWGNr-##a~U*ak8G8Yg6 z2f+*v0VRMj09whpi^mVWOwzBOKJL|&T>m{MUw@Ije0LtNw8Ca?-;#A#&j>)k7YL1L zd!B$LU};!`J~g5fQX|#?z}3a*mO>cb3*ru>As|Fnz~YI7oB&y5;F1toY={6;rBqvx z>SZJj5mSE`N@PUm2m%b~hDUb*@|>rB$eBRELBQg$9YBfJGQu0~7?+Q-6;dM_A%%Xk zQKb?5d6icFh5(Zz=p;TM&{qnKS{5G>6dAW9P!=as3Z)03flHP}g~vxk$}#mL)$ftG z6&F9pH2#Y%{}Sl~bwR(nhu{rMen8KaUOi(uWvql1RvV$JO9EX@%fYC`5Wc&m2(Py2 z!IQR0u*`su4jqnyoW*?5+t~(D-|7LFs=$j;807vK3VtWv!Oi#OzzVj3!#*M?ea?e5 zyCi5`j3Tx|WCP8|%;4f4XV8u?h8J_*qpzw>AYVg>CN=5;@zfNkD0gs3C7`2P7dR8O zK!xuB?JIb&)z2EjvK?Vo*9usaqzRk?6}UaS3J#1)gz=^|=vtKq(rKyyezptfG+l$} zI!lQBNfjDLErRXKH6Um3B+M)ff~7~b;krKy_H`Xc54i|xuPQ*#XccH_o&ZGg3^1#G z32Cl6FuS@0L>1<+AJd0o^;$?7s)k1}9Xg!_*uzb>uw@P(M4vlDTK{)wMf_+uM(9FB z8VB-?NN7880*q3e!1FB&>T)N6IF|)S3uSQWzA{|6LBfr_X2`9)1axDsz`z|jI4C0YE|%1O9op94L6=LOB8#!n z*xsl>Sou>r#?iMYD~j5o`s`bz4IY?Hs~IYHRYG%H8lg!XgU+RPVC(B~*pjpm`_cL( zCOqnl?`u|s+JkLyKeGUaa%1obrt7e$%hwU1`Rc6x^EJpk^bD~#jm0AM{jrJmHRQ4j ze&n8(XHYlG6H$G3n4%?y#|s+3S?3L0vgqYLelmsR)1S+!$!FkAg)1oe&V=ys#@MsP z^^kS>41Q~#J!Cd7$6hv&=*FxIkYBR^)oH$jXysF=Ds2XgpWzOTUyZ}{Hl6_GnFH|c zl4`*1%%P$R~MOXW9`!f^R zI%x@5<=9L3T7PTOsB#+~Zsv<$gjldCS_bQ$xR7H~SAo%&jjYkP5qkMA96(kEW}i&Q z)mABjuSyYjP;j06;>0Xcc08O{T3N++5A#8X6YgN+#oFv8qQ%_7>mpn-Ihm-4^2ZWi zJEI!|_ePgabips#x(LV_i!`^Zx3S&UDl6_9dVogj7a?Yr z3g;TI6@D+!1Gg!Z?%h5V!ZqSvt+8e=*tVV*q3|VM`^XTzIUPZWzPbaR)6-!7lb!gL z@9I&z>@@_306J@+gqk~ium?FyM>&q!f|We7z_`C$gP>opqc;!32&`<4=$`#GVwC>YoCXzuo&#vR$ElRgQsNIUPq4p)CAf(60xPNJ=84Mc&_-H zyU;s6VPf>WS}ZQ3ioJY?COLDtGbT%r5he|V{Ch(eSe$paIkz}v?C4*+QRk^tOk?L+ zzV*$`u(IQrR(P)}CM-IS-;c{DeX}2<`Qr%6{+%zmt+5{s7Df{hD&t|lZ3Y=O=QtkR z3iv?EIXKhjjP~v>R6aR2hN`Ud$9=;&(4kz8QdDwrhiZgze-M-E7i+LN?Uj7+KF76s 
z#b+(6@jwle-l3e9EGm2c5iH?c80$W%h6QkS@M~#t&{*>( z6cJi{y2IapjgUPFuZEb_R5rm`2^b}G*q^!{r^e{}oZY{EBb%q~Qs_{!Ax%MO5z=+G zWMhW1mNZi+DczK%qVILYr@1sOXRKMk#X`w}^v&v1V;*-kAIR9M>9BzkNe^b`>A2+U z`?eg+`fQ?SsY{WxEW5zarzWJkr7UNs>C%==k>jDJ@wyOszi-Q;QblrE^ zWm%M6E-+eHpUjzj=xUK=SWjctS=q;osD!C`_rob1Lp>PgBm6&+j!Q54?S+ zF!k7j=1mIf_PJA!KWxiYw_U3^t)}+>SRy<$HjW(>9>mZ& zn!Ba4RR?ud-{^|J))u$Q+3g>%*^m@`7;@% zt>q%5Mr;@|`sY>J=p!=v+~n`bsPYGD8XgoG95+m{X*$}E#P^KF>Bcl2{d4%g+4Qf_ zigK+$$ckMS5)>g55Hu6l!~Q_%^)x(=syNWUBftt1eq3?^YuNZ>w#?Z5ei#Bw!1AzT zya}QFU>MSm(L0PBke(VHUyF$^-}pblf+{7_5PAHMe{G2W8S4LkxxV6m>L#h?8>muZ zitgxdZIOmhQR3w_$J=K3VhQ&Xri@rgq!Ni_F8yZ2T_Q=8NP{I#5=pScF<8DPr442= zBS(qEmFemENKen?!&oK)6G$>=wV|o$RC+LTwhJ?NOk~=bOhbp3NIaNE>dQ>!dWjh` z4-2rEZ64sgAU@D(T9~hoR|unz@TBuwwSsQs@f_tr=p<+vjwc^7x{v&0{k*Lr1D%5< zGR9xNJq?Ez-;uf3r=K!&H>MeeE>C?ppD=n&8YlC%it_cD9p&dEhfeP|cNm}B{EzdY zi$%*H@?lCTjSh5P5XUGat{#lN+$eEm|GBR6deg?8*{ye^0C8{rgB z?<2>__|x%eSLSGjEkURiEsI$h9~5W2EIQg)784VdsFe^D6B`;8sby|$Vs2t4^3VYI N@PbAHz?1&N>Rji%PXJ zTG1+MZBcP|YXh-MXq_I-X@yFY6y^KBT??a_d$%#}Muh|Z$YkMf;FxhU6!!dGGAsT4M8 z0CpZ69R)&oPP>(8L_r+_xszIDm!&i0Oqi@QO%^$&pj<&k^lHUVDbq{1+ZynchDf#Jdhj;x68D2@m9tjC{<8Q&A8y z7xk?YTAL@qZ*5ut z+zWuhn23m30xmRo!>*SHTwEh?zQ75KGD0zBeiVka7GPXi57;er!PVq?tm|Bg?g87# z>3R+EYTg9b`El@SK8>Os{b9JTL{sN+STae21;6e@bWJWMZt=vKOgUD!enoC{BxpR< z3D>%~;NXFt5LOLEU}GC9;=PdEuomh~L0H52VpZ2h%zV~>n;3#Cu_AZl{Wr02xGU6O z#-ifx_oSet3$_a0&{x=@+E0d-?c3ouGX^P-<=9cx8`>&4w$$iw{JJBKUXrV|RGpC`ZVjZ4MJA!4g8?ibe95+^QI3#@{JaI%E0^L~RWaW)pq#rmsa(jf?kc}~WE%;}-zU^o z$mKHMOs-e>Hd*%34B5(qKVrwQ6r#8_n6vNC@g?G3#CrXLB%(nJXMH@ZEj* zWm7yH(xXw>-Jkn${2t6Zv5&tzG8~l$CUI?hW#rD%d=5_Cr;)(YdZXPHOaeN zC8X^q9?*S(z_*6*?$hlVh}`GJ2fUxSpZ^`|1a@UAfjv-4KNF9RbmE_$Bg9Vj;1*r) z=LN+e5RMmQ@(v2Sj-BisIy*XbadvSPm2N7vdsj`j?mawudU|>LXnXng?$g)Lzh6LL zQ1Bc5LqZ1(92EBE;345}MM#lRL!)D2Y0N+NGZ?>r=CQ#iuJ*)vjK%_Oo@Lf3beU#=1?Lx70U$x%I1U+Z%W6{Cd~!J$su{ zn!nk%|G>dRhg*&uJ$C%W$y2A#oIQ8`!o^FMuUu{Y_S#>y=TsVzv*LMNdtkKF=NlNd z=bD7Ve3Q9DZecD1O^T5FdF3u=cNn6HGvc8zGSJu5lT%31)Wmr>p( 
zp;Fjn7|OqVN|j$k`MX_Thw_fkF~2a^kY~0wEGy1mbJwnLr#Szz_1E}(Rf6iO5UAv) z?8&)$ohZ=0=EJ?v9(^=PTU;V&6GjN9YF}khZWV6dhM3&%^%pr&o-~+g3o6T9uolWu z54Jn;ai{h@n0<@mKPC&Mpw_69{>Q!hPwJ`vmlv!3vtE+&wo)l*70N0%S3s>%*-NRZ z!_&g74@Zs5nJ2|aQl1o*XE}?c0>juUN|NH)O!u~#?kr0a zv%m~V!{6x}5HOG~qLbs;KB|;%=}=0y?NdjMV*3O( zSW~xzZjEOOJ&QE!(n5^sW0H*-V=SptelTizbbn zW=HwyQe>(1o-DyKEh-~fDrr@+Bxyajq@J#iDWQHWc`|)!pQ^H}u}VVaQKT!LT9Rw_ p&n_zR*A*8VOFgFL7Mt>o29Kc7enI^L)kzxQ`3Fq`kV5}0{0G#3JWBuo diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1cc7b2c26517de57e1a2b4fe464578f4f87a314a GIT binary patch literal 4035 zcmc&%d011&7N41WZ*oJ3AQDIb1<`~>kVV0zh$QTbia~{75!r;u7Fo3_1r&`tAd0wy z8>neokpnB z0fkXQ^6y;nEvOs;7XfO~q&5JQIZytOGmZlm1Z)=52@Ip8G-;?d+T^2bdE}6UN8-Qh z=uj2>^_CL;UI9{8(1U-dz?2gnAC?>&5tkGk9+DKokx>U(cyL&JOmb|TvZev3a}Nre zG2v5W@Lz=dWTn}<+?~2U+&hGFKD9CRCy_0kB``%5^+^Q4X(cbuQ@TTS<#xivzN(;Kc(fYU082F&6&DOBQlLKcr)`4$? 
z39u=>8)ejB;Hs51+!AOH9@`HhUb#EAuR$HKwv7PQ2V208if!Qi#sqA%)tAU&@)Sar ztIrr{t%5ePv-sXD27@pSLMDz?L|9uOvAgjJ@T%j1eQ&*x`b2~!b8CV7nAf1)_Kw&6(US!=U|q*q>^nb4plDcx25Sj;c}5$^t(pf<8V-VL z?G#jzYsmy= zSM-Qo2j-)_)i$u!qy{T#e-A%De;;!^JDNErD+QJ5wqt2Q_Jl=QHWp(o#m<35Fts2I zeA(|oXr(U$7N_eN!@q}c@D~g$?{x>()2Cs2%hW;PsYUoZZYOc-#0(;&GDfqgtU}v6 zIshI?xsFB&jhTC8rtn=S9}`VjftSVyq10FI@ba%WhCj({W3B00$T|iebMo|Cm|kfb z>O0@xgu_h>ppAA8L(Tso8GD0dt>~#GOZ`T}x^Jz~{M~F;!gw=AwMGTHc0+(xx@ZrV zvyY?g3;Id_Y%vkTD*^(GrOXr}1ysJ@j^;0KV6M?17}YP|;Hj0%V5edtNAtcpNSz{K zEV<fpfwSlv~ zyP1lR&{4&onScp*9up%b4fq7OL!zbx;Qa+f*@*SAMZZ`gM z7_mwn48w4&y{w1%(jO2D#Z$qf>Ud^X_IIFmQak!4rwcpoXrv+7l#koazKZ<>FEIiI z-l)jL0~?GXKt!{?3%E#0Xoes(aN;hC%^ zy!HdQGb+VI?pKYH)>beVZ8s!b7rCR5lo0&rYOealG1HhX5uW&WtV2xsn?Bg{bviQ0 zuh+J}vI#8dK5pbCm;iYNt=O44hcM~-2XM|voOJw`l*q0dfbVw5@z@cgK$Ps^(qGS@g-oXG&P@4z)pFKyhZ`mN#k)yLBj|=-=Jw)#p ztYOW#p^iOklf#iWcR@oxG3pBv;$LXbU>wkPg^`jp>Jyz?MRK!A3vN2|kFQglZaojD zhfh%fZlq0Z*l(90bR@uwVUXg|nK)|+KqSv;e|iQ+YMBPq@6TAp)HL?k;Z&HJ#pc;Y z_0|`zTB~6sR`5hctJjS%l@tdw6lHDDwGKVELv&!xCjH3?4|*F8tleVhlu7c%2NgME zJaSE?jR)6lHSsO-C=ef7zinJVRa9T&p$+*~3mX-D=ffKdCPZ}?OPdaF+BHdj>s*2J zkLE%Pg?DIOl+Swz20*S74uIg^j~c$%Rd%epUKj9E8kHWb}rX!ertKb ziO5Sup1a&CcAkvxt&&}9t=M%cUS8kMa#uws($Y^)PjAi9pK~;ObmDz2GjSUgJ86K5 zUA;M|*8Arh4cCUJsjqi0&Eup!J>|8Rig~YV*W6>xY2H&Fcq-xXz{_#H$3xq4Eu(+2xLUpV`>NQ7zgS*BE$kHLQ6`mj6`M!{Y77t!T0niA5sOZ0esTZ;>cf zB#sohiA0ei=Sbz9m{OQQpPWS^PkN{Lp`GHC$3)r#?MO81i*Z&~lc|I2Ob?oOPNi2@ zdZAp4L_YK)meQL%d|}T$=Zy7gg!X)K&RJQ+MK|5P!RPX~4|* zzyM|GRJ_@$`n=|RoDbD3ivE-j-Ab`M+vZJ(c~YnoZGA>qR0L z`sf}*_dp)#Hf3?B?NX{I6KN%?$*FYzsT5U1q3*;=ODc1!I!Z1TiBnZ`QYK4VQ92@J zCsjtuB31Xw(xtI($y6L=owPmWpE9LKqtXOCBY8-|lH`aa%P_gzG9)1(KGi5CA|X*0 cA7^A^Ki+1%HQ&bofDeD5j{5cLOZ|NRH{sIBfdBvi literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet deleted file mode 100644 index 
d0c68b087a629133e6561c3090078ae686a5e13e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b*Q7FE6@^QNEq`gZ>R`3g|aCYT4}MgtPeo9DQ+{t zg+(-)VKl;`G2&~^u>=7v>X@RNuf5o^M7C^G=hP*bP1&Nm_v^|-<=-+-+I!DE?{n@w zCtp5asN+~JgbOopn1fINiOSfGOTsm~-{61?<>VSd58=d|Sd{=#k^oOJi~)c&0+SUZ zghAbkR2-{}2XTl}AuTgG%%xV7+sa`%mNy}JGg2q2CF5CAYDg%+VIWHg*#I64jU_Y+ z4~C|Y1UMs@oP_%@l_>ukE~eq$V}`?xl9`?5-YSd3Q))JOOfnlmZ06E(r`=oS5EAG> z@*bqViPgujYQ)b7Hjc+frd1!ry%S{M29Qwn9_F?WWo*cUg@3O{?bZ+3#Bscx@*G-^FUthLQxhkuZoA`!kuAG&-FT+LKcD{^vwChmU%uc4Q?>>6vvnx!BZ(7idbv1b3(2!3i!U(YD}(8yyo$J+NJDRRxMf&+ zg$kAylYr-Uw}@=ay$c9Y!T@4}rqtIjn-`^cs^RUjQY*p ztZB2i=Iz;B*;{#J>7izO#=dVI9orqJuHRkV(^{RS3Eyyfm#fd7)N)~uC+9%P2mSkN z&fZw%8#q*Fh}m}jYWu3s9bK2z=O(B)>~G7gvF}`3WpmILwYXWk&Fu-Y*e$dN$b%n1 z@=H?rk5YBBO#PdPHtpzK9{H-r%#nWD!2}*|)&NH#31GB!0{94(D35N|{?PH7b;5A# z+AWR>kAL3y+x7HBb@N2ht`A!u$LFIGQ#QtfLhLTLTB=MOOU6MPxX!A3^O4jvq`G|F9ITY%GO%UeEJ<0r^wN<@s7( zU%_Yad~i#^ORNU5ULIMFUS_^O^Qt*Pq3H!p5;@6^@Qu8_)_+ezuvCl4@Iod1NO7+s83X;1+S|Dc3yDK>YaWw^T+S=c|Y%aInTXb_w~KM*ZrK2`+Ho(t_A`^;3Eid z5=aYR6o7t1MnIyG^~8rC6&vGskz)kU);Sbfn--)Ve;z&LH#i0zHP_R@#J^rV`m*)S z^ZY9d*9btMAW+m5@TfnAfT3&*+N>`zgsd-K03fA}Q6&%Ic-mBB&38yyK!}Wh!4nDD ze!dX_OM`u5f(4iYPoW1XJV%;oNR#zX!9=>$u0enSRq&_^fa<3pfrk^-rD<3Y2-pa4 zHj@C7%K`@VNK+4&%QE9p?zG8`RCJ7Q2nbnUW`r#ChmQh9>aU~J_+AvXq5Lr(rx0&-8#5=?E4n1awx62ey_P$n!0rV>u7Dc^O0KW-FM} zwHy{DssekbJlt|FheI+6prTieu9Pby^~Mt5Z?*^Z#w!q2I~5{cDnNtGBG|D^8MX}l z2sYV);8QjZ+7~n6Kvxxd%t26dSqg5;$wPC~SRnRUgMQ6hNOe#L=ZZWKl^DQ5OdIwo z)xhe{70?Sa;ijDcd%WHhGUoC@bkq(~2cDqiadJ>ejEC@4He~6N&~oe;=&rT}myZmn zP1gaL#_sfC@!x4}e>kiqehv0L3 zE9|x_f&;V7;qe{}4JJQ@iD_eCT)83q@_H>CzNH1@BSxWj2Y*G10lg?z8$ncrI@oS= zhUkj_`R+GXKS02 zB{>~Xapof$2TqvDIek>@z(w<#8=z4$8l6q)#Mac|kdf$(b-LNW3*)z#u*?oW&?F7U z$J|j(K?^)c+XK~xZu!qyvKaJ!A7pAW|4IfGzfa_@kVJefbS4-+()5X*H z4L5U0Yg&fAttZj7Ip@L5OoOOZeG4`Gc&t3t8dR(uq2cNnoS!j>G}avhS(`z4w6p?n zGkrW#U;(VSG~%Ss4#vmK82GtmE$nHnB6AOUWBY3iP`y?iKIO!(4RFESNu*I 
z+=e!xr;cY)>?k%0IBiCrj9!3|+KVwQ^J>!nycd~Oy%6bKS0szuKBLmnZMbgfcD&xv z4z296gIzIc_~^ymc%tkB6drO9Tk&!O_H0=)Q9H*4v7VY?QL-n|=ZD43(mxm<0`T)2G=k6rJE)$~q8A5Mi6qN{hoWo9bO@86AIdQyive1C(WU_fUk za8XmI2ljA_kBrr*3=I1r9OJyd0)fA`qYsZm39N9X=)U+J>7>v18lsO@Rrir3|Oo zMXac|No<#a8B#rU`5bbcG|1ot-d1{(`Q9B!A4?N>UK7de*nR`*r?p{?nH_kgxtgrT z*4>2RyxX{^d=6)+XbGpy)LuYZFH)VIBrDz9#3Q};O0hG~o#Vxm@sLpU9g2xj;HbKc zWra(f!1H2qj~2Z^r*yZMY+{ru^Y>o%XX=fHp}cV-*{hQa(y=c{^#=y_NQCy8|*@6 z)bHKs=E)RHdG{H<@%62+qO(#htVaP8=CtAuVzWrk&3$P87=mT~$&=jPFo1@#qlj>M z6*y?Rkqn(%g$JDjd~o$yIQ`TP?a$4Y{c&_OtF(48?it30PT69#T0R}Os6ZI!xhAP} zp&FapQOXDRvmB#$e1?Cu6ROwwgtjzqVr`yZgvFl?Wjr7iF+YwvekC;)Ztg2V9U*>D z_{l~3_RkZr2gc{nHPKq^;#!f^W<@iSb1;pmmbF->xn~)Y;@6?I+z_~y_ZZ*BQ01ob z717SvAMjNL_t3?d-HeH~u23Z4vGb&BqzZa-fK~QcuGY1PHHEVf^}l|DIZbSa1aq#! zsvQE<{o6Ba;QRp0xi5`hYOW$=_P+*IH%qL0u_p07e-7gi-v$M_CXv)NPu35+A zjkC|T$XlB#B{U4~I+M3Tr6_l;F-H z&$EZO=1z-xd?Cl`NX9z3DD#lr_Hhn-tc$Y> z9kUHXZl5du;mF*BmYWo9N_G^x6}c9BwUq2Eb+7bm+hbF@tK9oUbpP#^(%nawo>|LQ zvMtN5^1qO!s-amIv0wV6HaT35I3KP0e(k~lm z0>8g|@#NOk2eVQTDKp;Vr-$P8Y{BG!uAd%RrO9bH7kG3$wn-NmMf7xaJh9t8)+S@T zXJ?PY4xI%R0o|Rwj@kYOf$asJ*M6R}cShXHp6+W;=N=TN%4m39@0(ZPuyb?H6ia`JRy@|IXju(R2wGvF_4=qUANl1 z110kxQJ@AdQ8IB+gPY81FT8`2$t1XCND+A@YRx)J0^OcN}qA;=PP?nR}M>?)bw=}IOdvfHK`rA zQRZEr#5Mi#d3R0dol{#)kH5UvuySyiYj)z7N9VRE&B`!4`KI@Bu4%H&%({2|*Gs)m zXUwes^~K%#m7itI8$P`5xs*F=n|b5M_ff;|u{ZBA&6+970&GNI*nfK!XNQMG&{`uf zh6oFZiDd?c1=81a>Q0^_qp(X~p-5LVTT?TgNsEs^fBpXU;_V}F{wkeV_0m8hA`l}f z)zQUe5&nqHGxPgHL-f&KV9Nd~H>hP26SBVe&~oFiqm}_W%3r3 z3h8btCjZY~C50Me1)e5LVXq+{K%^{`7Q4F6bvGG#vxEl;J=%=KmSVAZ z9`(@XC>AG(ErZ0iVsVhzDo8SBNjc1+TUKJR16@=7FE!PZG-K!pbRhBE?)l8_u3mr(Z%w~eE4Bs}T$RD5bFeKaGUAXJO;jb0HK7(2;7 pDr%B%baZ5bT6|!1Oh{yent}0TgUR|LCuM-IA7~^1T&Ul@{sjY`8d(4U literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet deleted file mode 100644 index 
d10d236054191cfdbad484e674d3f07a75c5e0ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3413 zcmc&%cUV-%7C&>#?xG+ddzW2Y5Ge~N$U=lz5V=&56%h-?N|Pe&u5@DoH6*A6BhP{j zG}u5PikK)x1cSXqY{Zw?D?VG~8Ka42?p;xN>L2g>-uH4on3*$k%I}=H+}|aH4Rv8C zM#&6NF#-m90!Swo#%8 zr>#`fX$1h?YzRDx5J7GPMKh8!l*|nTMlz{hbd)-FNNWMT&5R=wqQ| zju;y-1mM94PXJi=Vg{-(1`ewwg+LDhsVDRT8lQ3^&`q367qn7Q=ZRE=UfW1;1@G_T z!`BrM1bG7#2dOs|*c-%ZVlqxPTW$guf-9k{M;2H(SECb^a%A050pdl&!MfoDXm<|)^#cji z_m~LFrpaOPAK!sraU4wDZUrYN381uXC%V=RLCx{*aM9QVnw$CnwayomHBTTf&>DiP zHh`?c1=bVxu&!4P%z0G>H(@B84`7IEpAUh;QDTsN8vuFj*HKc2G3=mhASF)^iX3TZ z*|iHC=lDa&a{=rwZ~#St0JaxL!I8^`aOeyTXG)Z4MEM4=NjnBFX8D8l9cMTjT?Ch3 zsv$c)42tv0;8o^9SQ)StN=JCYwbcZAk#igR<(q?5r7JvoJRde+umc-)Pt>vgDKd+_ ziPG&6Xw}x>za$t^tHu&F2|KBR@|W;i)c~~5skpnyDhM|B*-T9}D+QZLwp2)22dWJ` z25keEQWaBLyZ3cINq!aR050cmLJD<^Xjr}n{cd-haMiR6vrnIdmU%y;qZ{raS5qxf zqKSpyYd4a5_MUXbs?$)l|2eV(6*2e+B`OazK%<-Mp+S*~4&=5H^J+;b%$`8Bj`4&0 z?oSBm_5iZ9Ngv#Ig`t{_EpR!1IlL-JCHpw%5FP^u(TPR91nmc_kxODDRgxzVjF~&E zTd}rRw`7Y;^xEc!Aa+7@>GKhU{s4l^VCo>i`WYN?z0vK?-A`#T7{I{Ujqt1@5DX&x zAlbr+csRKh79MRR&yDef{HAHdlR6rm8F>giJZ!1my`MmhIFqQ%^92jvAgDiOPKpa( zAln7Iz|ijnT%A${q=%B!FkT?a$fx#BT_$+GG!1UG%!kznchYOhCJ-exE~w6~mi%PP zpX9{jw@J^&K0@ofOk$_=1>xLD?(}})13Fn5K^}%Q7`!S5a_$bNd*;pr$367|;|mBq zc}fCAZ35-6p`_VNeHin{LUcRm07~zvhhiH&=)I}q2-k*XhqGpM z`H@#>o9PkKaog8qT~Gj;)fNCN((*~uNk5Um@h0=xy1@%G$n8);+&>*S zXz&n^p`OFMSZ^O+KmUN?fg^%cBZEhUj1C<$cHH=|@QBC>6DLicGBqkXCN^$bd_rQ< z^yCz^Mmr-lEj=SMD|_Z=vu4kko0B_l{({f*zW6eK;iAO_UllI-dTG%=mMvegvUt_% zHEY+El&;_KP1(jxo40H&uh_P|vg+F%JF9oq?EdciJ$q~G>Kpbo?r&;7@I%YNLx+zX zJ$C%W$x}a`Zas7M-1!S_KVAIi&%a!{ti8hJqqX|oXJ4I*wBl>mxn!MoAfj-%Z>^Di9jJ{Ro)(#jv|z&}D-cuZ^IR~+~W z#%+8B6vkTp4vZGZ`}b2GtN9h9J9WP`>@(NGGn`P~2}zM}E@ucG#SQOD5PB(zYL4#W z(x~Lbv~*!yavVnAM&FK{`+}$~Pn}s?!G=dBky?h^6iI z{66VnJ6@!sPFs%j-rvPWZ%Gf+_72OBk)Gij3QUeu$EWM4{34QELW+{!idG9ud+&$; zNYW40DJyUTsUR&TAuc6~p)ljy67PIC*veJgO}sGogBv03+H6LkGw#aDoZZ`NUcKC|M;usZ~DaezdX2q+mj?|mrFzn 
zk)*X3epAZiQhhcwbX3@2-Ge1vPdRhq#(J|XI~xCT62!7utam)?&$98XPdvZpjSWU} zD<75(py6EM0jWdcb*c0@y$%a10T2Wz&B0BZZapO3B1UKkP=b`KHOviiP=}}>BnuzdVO=LJV}Jc9Mk(%lmemh-|kEZ<3;6JMlmpD#Tk#Xke*;p^o5aeVB` zU5(BXq?X#K)EOCZ=}s|Pty5HLswT@aGcGkPQKPnWaqsWaUnx_`0p9#TJp~Yg|DpN| DjS{cLo_o*sJNJA! zC*gdA8%J|~+!!s#a?l6BwjgJ6)wJs~Pv+Az>f*J1r#!^yP;%e0z;n$_KNtRC5M zz*%w%7f!~Rb0SV;;|We|uZ)82l>z|iU`%Ei!qgf{)sAd9TIm5IbEQHOr%O+mk*v#3 z=CE8QZ$|PnNR_N&-=s?ELVN@c1DVXp0zl|kE=Ws!5PVkC6i5)z5-|;oDGMg1tdERv z3B$s;2%>VX-|t>4*xnljOZ~^eU(K&uBMs zc5S7W2fqz&9oHd^K8qVFj-khn=dgRmZSkD8RyeV+3!T}12f15i;w>2o@VIs-C2{g) z_OEY)Lyb?6J!rA_=W4Vs*c44@u7?Iy7HV10ju+NaP&Q{OZl5?7?s`7JijM-RtxYoU zJQ|5=cAkLmN-E%aX%=PYT8MqTMlmU?tVOp^RwK8R*JyDRuI!?!CY`TRxh&n2B8N(lVsEF0`9398#j-;Ab9XMzi zIy(yO$?L)cRQBR6DNd-TRY~!~3+VkB({cWzK=k#~D;D<_osukRiD=FxxWA{(AL-l9XcVhCP12S= zr{H+_CUJ=_ZouAm9AWsiyG-BVTsE zK~!tcF>~*a%gndR{g{@31d+8z3F;U-naS7~q?~lCl1fSjbah$mTmF5@FkYR8hjw8I z$O)3YGdxgot+q-^e(oA&+%~Zb*C_q{cbXR5SI`188QVm*1IR)>`#VB28MwcU5RFn= zQV5t;_-$LTkYf8dMKx|)SR^*L4_fE9eNnNA!acdOaeL8+X11CV1z)vzX6DuzT= zCtqydwXD)Lwz)*_>Wb#{ zHVYkBY_Xn~byK4UxKG&T6P>1dbvMy>b}P{b{rc{;(3=OWB&wXW&Jg4g4*l@mfMV8WEtE(IHba5dh?p#uT^22edc2>%DEaE~tTw*>>z1&<8`pvNS8VY{Fa&4zN?nG&S|FznS?lZsKG`dJW?$U@^7e1Qa zvUc3Jp6QqC=TlZ;U0zw&&u#Xv47==|ceksd$ZGr*pAWnVq=?bPXy`t8NSS#3t z^fu1l5n~VU7Fp=&?<8Ry0Bq| z@GKwyoR3u-2HJ4gSYv};E=-cNlysrqCT7#ADcL#V#MDHgXpsulD3xF8FaKPjsfBHljOB!$dB`S(Jiov7#IYdq~hw3kW>KizpW$%U+NPSY^NdZ02FL5uTXJcJtazh z>m9J6cQ|AJrh;qsqWq>NrYGeX`-$5~{sU4Svr}EOB_;P&^=~lxI;_f44^W7*+z|+Tzdr?sIg!>!kfs%eXAwFaOf2+1 zk3&DA-&W9XF#MljK`v9;D7F9N*6L4cZ~iaItNODtNxo{3T&7~>KlZl-P}(SDd_=_f zNN;0eDYntB!tmfVJkL)cUtxsud_J#9;sba-iT6)3oM{NdIAQ0{^TEPO=-!o3hHbW> zK~Thx|6qu#s~6df4GR)z|9oK{D@;T+&ufKA6D^R2dtNQjv*X;u+~Ojq1P&R|y>!In$pRfB zOh)Ji*Ny(nCTE%)U8F}wbW~VIOq2mSu{XhpFJ#io{*Yji@dtl|P--$00;l8%7ksc* z;5RVk_u8KjY=}1rHW|I4=lLB^I|iJBJkd`?g{Lucg2Faam-T*b sV$RU`%*>&>tgMWD+q}fA?39djTQ|>PZo||{tqs7757g5D;pE@luYn)u8UO$Q literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet deleted file mode 100644 index d9326808f106cc0c8551356f9e11397011237c29..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2753 zcmc&$d2mxj82@(P(X_Ns+Pv^m0u*RMfgVY1OTkchDZNAKK`AMek~X0&q{$;mDFx(| znc^S_10$dVRlKOk$l(l>VmT~_Iw+{XIF61pDuM{kz$k~;?h64b@P{*we3R_%x8MEk z?k_2pN=*VQ#0qg%GC>qTq}gBo>h$ktooYUGY0Ds!{ak+mgeXCiAn-yY`SQASh*t%5 zL{(4=077Fi8F_>l-taEc39L2^_(-i7T4Q%Rs_X5(dI4*AO&ij@g$#KH)#E&A6KF01 z1~NpF34riMA<0U@(-cLPfly>-iGUIpn5ec|5bx(Hs)7cC3lRO zI54!c_mYo~aCmB)8l7%`wZrbWM>LQ{gQL3E<7#SjE9HENG+!aZCdP0Y>;6T^M;7^( zO4y^a3eRDs&JFC1Xq>aQ2)w%{<9+qJ+0{G#gx_|hqW2Rw^->Kfgf|Ai!Oo1?260t- zws`vu)RF%YT*z3*wpX9+H6r;m^L+krFrE7z8rhHVp^f{|tzjSI6wl?5`Dac;_p-0h zhp+sEQu@hwtH%MiI$mQ!<1@MT&1YcO;oE2kSn-$-jA%!G1S&b&35N_`bZqHaysU$P zHS?$8vt`-vbNY2G?#yMj9SMi@y%lKRYu)fo>qhu%wU-&3yabywMsp48`}3EN??I-9 zE_Ulmo-bQGE^L$BKg>1oYwqQv*Fc?!xNWx!aCjIhWDKoCF_?*<1@tF6D7i zOIsjee<$DfJVMuhC3nsKIvB@|Wnx;wq3rinNdICtTr50>{86FE(PieoOg5smmFW<= zF&en|Dm*N64_AAlk}K#GVf~&dXng_)!#|JVcAWYPz1{B=lkoNn%)$$~aKKxR4;JPk za?>wxLI;x|v~l+zrE=^dOAm<96&%}#Ow6C3A~b;erG>29I962-2s13lhp0lsdWA>y z?i1NJDq1aQ`ia_@{<;AJ2gME^GBi$a7#2T##K?riq-3Kh<*C%P^idh3$2@Hwn>lX0 zWXYP4os*lFUr=bBSTw1)q_k{u`IL&Ow#sSKt7cTswAa)+oU`ic8)naOHM%_@3$ki% zxR;YzpFdLub<;fJyIGJU=eH~vwa_91EaGH%MvxO069-F(gQd00oDniC-$G>iBr>cZ z3K<u@Y0PY}GstZu;B< zAD7cz=MUiLNu>D{8E&c#`_%++9`^p2n2#z@yE+X--d9`iY_tn3fmuC%0Il^p>(1!$ z1aKh@i#@w6@B!tXcu4i`_g}yQZ#7eER#eV{B044!f@h+SA^u~jeug(ysRI>}D$mGGn-Lbf1ZT3$~10y+drS5gn;Gl%S1{Pv18k8Ns^r*f)NI*C^j zsIOph&wNO;2tAMwZKXwaFn&qC(r?miLc!Kc(geE9b+|0Ko0p^!a`Vo5S5)e*T(vNQ+INlBJk5gkeCWWY!%GO(|dZfneG zB5{;DsXg&eOzF`CG(n7&?cTXfPJd#pEGOE%UQcuEJg3*!;Bm*A(vwU{My*u`aPI@1 Libh}`HL9r20P(g|)7Kl=! 
zQlyCm5Cd4iC>SiL#2Cdq6;R*3Gr2Ozgg&OlOp5Jrz)9m1N)uYmgYUFOYN`5U7^|RV|ul$X>sWNr$?SF zyRp0UY-uka_=pIt zvHg6*{Z|J0MhEdRMV{g#r1%RGijk1@PtF9oRMf!7fNJEZ4nUHH-RCR_1Z)HhHj@Au z85WF_VVTQPTC~VXJKFS%xRqFAJ4!6{hmInZ%U@Ti%|CL%q`9P%pL5Zc@sC&)8x{~A zv(n!;##bho8U*{VTon--8x}6fE*(>RhlKBN;XYjTe}Vb8Dc7s>ThyKSzcD3e_dxbd zAJZmvEvzxy1|{7h=&oH2y7@Zr@D>l=?9zf~&C_Ak3~ zcr^}#^cTV4UH$=Xemnt;z&UW-Qve0O$-#z$BD5unhpiFJfrc}NaP_b)ObOG2SFRt? zjZ%HcP*pm{m4v-Tt?Y_|i=+UpQmX$s-LDniY;Ww39xD(o2j0TyHhfY0eE z(7v1jN4n3Vehz|)HZI&9FAt3klYz*w2E&RWNVeAmm(qL?oHT+wOdE1kDqz#6Qg{sW z;I=Iv>;HBx>~d8A!S}Y1{QMzW6FVNx5K|#6nGG4bBs7iElZ7i?By{B(B8L>6Zb0AHyLb#sjjF3*l4TWjJ743`ZR1 zL;pbxjcgeJ9q}ZXQeq6hy-I>(cc#JA@CoQm-oKEt|6>%RjUX~y6Kr<7z@pJKv_D#m zr=R!;Z%a+lMZGL8YlO*zS9^&8$0n2@+lO}APY0vhk0Fd` z#cDIpqnHV76c&6ROXz5a=EQrbC4UeZPmIKJBm80QuPGQ?dp>zGy8}w=htU*p!sc8u zL?`TJ(89(Vs1-(`rmdY=VigX%5|&~wW)5LWr)}{g4Lqng+6;Z-O!$-@g-!JqN@q938ZgIrrWhW}uLfJqcW}-4 z5$EZ%S)>BUz*igV;N3}kknvgop=x^A%N13S)>4PxS~MTT4Xd%CY7%uhUxAFWC8$z; z2qI-Kpps;3P_uS~nj4dFt!?EXyI=%`>uSEVsf4RZpwJPh9(imbDGw^Xn!vHIw78QxmV@o0rcdbx-ZVLk+#~s}K!yvRA>D zXLjU-t?NMdd<|p#9fXGdg#-8&f#KKlaOHJ8@RHBwjPl#bbLGyY@7Ykff>R|5jv=1t zc-%cqO*n5K~566~j5E$oP*1sTab zgYCU^NzP-BB^jzz09v{U%s4U*&VJgDE#1(-OpzrSDrRC7e|8;e7n{n+4a|V}+3t+B z{Wi!mHxsS=_7LpqQHSVgMUJ}LWR}6+-5hRHHMYk46?)(>i#7HB01D#ha@xvfGMDU0 zlndjY$15J|pm!I;2*Hhe;5IKA7C$?Hw?3>w9lmcNFbGin3>nnW>4`nq;WN%^!Y(ZT zu`$N^{W=7^Z%6O?LkO&BgP?c*bz+a3CdoaZiu;ZFV<^R)75H=p3=TAK^zLC$=XRJW z_VrUMo;wXp9=#ytP4=RErZJ|XvKwcnyac<$TVY1>DTY(bGFIf!X13e&+1y7}1srl4 z560mHe(lt4=IWOP!ifl&&zSY|D6s^RZ)Q!aAvULSmt0F?G3;OQjK%x< zGSOt?&+xaNi1pfPquUF3SB`OZu$g-;!VR{P_3D(eU9Q!T|WA!Mrnf>LsO^hAy zPDwkacs7DHskjJ!sfb{zzLo*4od*c-f;POR$AA%yw~-bOTEyua1a3IAPq`<59y={E z1vTSU%o(n~qD_N$A!nZrK1ux}Fv4Fl6Q^8;l8`9u% z_n_MswqmLW>J?_*+zD$t&uBPknINU?%lO5`MYz}Yr)cpcf;Inx7rCeAIU3E1B*Ns? 
zAjs-C3UNJ)2VMgFz(akwzCH)@H0tL4FfoetwYM_z3S~p5>&Y>2JLe?o2=)WB z8&1UCCo{0VnU_$PAPH+p5^%RGo0FV8F;gRB`MAbMs}WbB3MI(|Lsx!3zMr8kldPbO zGGo5OHy*u@TA~jybSmA!!sj6)pI5;>`ZybOPt4-ho2!BeX9;@t>NOTN%!T;*GKw4b z@lns)LG06t6t;6O4}Wzf5>0w=2&pf!#CnzsiS-K3j6#J4DA0W~IWn6Z?`z3q`&_!q z(yz`^=>2qs!TxZE9r{QS>x;=C^X8_KBb%C_Zon4h9?FtERk<8@?Kq_X9!(sxHwuiR z4{p3-z8sr%`qTJI_hOclP9r?O@*G+l&l2NuU(wf=dLBQQr^~PB{O86+&_}C8&_9Tj zAl5tRX zeg3voSq)3ElBnR@GSyyP+_3x4K0VOJI3ayjRmM zL))wIXxewv+zRZnEsM5i>Ufp~^)wdkIAGw@C>B^9OV2h5>MZtZI=1uBtjPYW*;dDQ z<;+cZ{nYDBs;m0fTY#KkDt4P}l`5y@ULjtnX}eovL568sMw-KZ{S)>3?7M_6Ip&)% zzQavz_adhp%S?w`GnSMrt4-oN-q!W5487l&>3Bzfh1>e~!=!H38Q)9W1#?q$v&#b7 z4$j`d&^uTWeDkDNb&B4hAHwccts7zJA3neKQA_sRRQ;T)*kA5m*uXT%ty%Zxc~5n! z!I6ubMu*>zFb(tS6A7-ex!5qjF`1__XCup~;8L1`j(3gNsPOVm)w%0OS;j}N?w#s* z&^*n!s3lW(Y1Kxy$+5O<(~!G0X(q?pbIdmkkFrgRf6TYup*(N9>50xFr$g(bIW)R! zs|0;c@xmgPO}1vaH7A!0E^ppWqkB~jkDSX4xs$kpyLsLp=t(l%F?8*?(=>X<2Pt~1 z1l^&0485E}x87k=(w?(s!|*b$1ignsuPFKHT;#)+9LIB|_p1|zp5Jdh_kCa8R-CKm zbpFiGO=5YS6>aCwKD@F=(=pfSLfPZiER&GlwhQG0H}b4@s5w_vJnPaaUUX{3^{UFj zJ7-q4<~mpZ@ZzWQ(SyC$tIxgctKY^|cd0r5>fzPzbXK<4To@YovC#2|OKsKfgWcsJ zeeJc?Z(q{z+>536fBOPn_u?3Qg8BQ`f55+Q9gxBYzR9yX?ooHqbfim)UOV#kPv}j3 ze??cTyAb_;sL=7aD}_GX{i08bE^ISVJUW4F{pjB%;I`)1f)YUsoxmFFDA*S^vaAWIr>K(17s37%9i1Sja~t+&WXZ-;81* zfA#+lsDGEYAl(d<7}2YO0>XUx1jWR(us<<6Emfz}@)i`c1T#$O%i{AHQspm660P^2 zW8q_bhLbsMO(;o@h$ej)g)nwhTFR7v4cfm1_*ZVLZ9l<)uLNf!RM z9wo&m#)>>4OR>UO0YIRt#1px@yL!x#K3Ph6gaIvPB1@4-w2=DJ;wTcui!1|0HX>1= z$SP2>W=SbH(<3X9$e!-0`B+cQB*SRh1MNuUx?ab?;45mdz{QT9TgB7u0=l7Gi$qRz zW9datCHo>ndLH6u>|*5Su_V^tW_E~|r+W~+ALd5Iw{8v9Nc>qzoKQhfBK0R3(tBUn zM|*pih5OqEihOB%2|Se!C0=85_iw(T=MHouO&Q<6G` zw@{j&!{RUFp^8PxpW>lQX&LEnyCjC*5ZOD?`Vyu1G5m%0l6q6crev)2B9S#cv<;Lt5&&-0e_;OxMuIRh literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet deleted file mode 100644 
index c7bee26c70d25045c9d71709e8ed0d492f391026..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3122 zcmc&%c~nzZ8vot*USd!X1YSrmDzXGrkflPED)oVEvWQqka1DzHk_1T*thKn57L{sc zw4zneS{GEPRqVJxU2)%6>dYLcb{w};tD~p(Xb0!sq_I-!@+{FVLTDDLDfmvCSH$jHZhG!^-w zb5Y+ap|yDu{MYos_m}MPXrT{oxAa0*Z&$K@gAohHyTWH#3yk0S0D&%e(3wN!4|&Mg z{%>4ttAji@92-Wf@%eox%vd3jvLbsfUmcG9TZ3?PWh^}P0eCR1jht!dizmv9@ywx(99} zr|LDtt9cV#=f}aT`4kFw^n>A^5>1`QV#y>87X0!xqN{T-af>HTkCUUe^(%6{BSGWI zPPp2|1qb%`gs^G|f*K#9Jl+e*4Qrs@6pYoJA69j3#LQ<6xPd{q94m6yKX?lZhq*%i zWh~0?d{6R=yI`x}4Sl&Cs{CbW*}fhAGh>kQM2;Pmy`Zg>V@tIT$F4cz$ORcL)C7^C zb!*^lI)SG%V&HWv5EnD6aIMXNQga%r%h%%Bw8K~yyAicRBXE5MN1m45M4yUo@T?EP z{ReZf{)#WW4bJ4z>Ys>v_6=h8BPcX@A!boBM*K35d}^xTE9=_uxS=2UCZM{L!ZQKu zd#)EIy4S*O+_!wn+DD|d|6<{0!a-8%c#V7<-wVN)Z$K}c;F_22BxYwjqR;yqS9A`$|AmJU>SFJQkjrf)m47y@HP^hw^yhs zm&;{-<2c`lZL+K*8M2iJe#DNX6r#8}khAZ{@x|gE#CrXVVq(`HmdjR+2_}!Ryd@p}#L=sqL? zO=r6CK6AIjG5RUKo6rD$U=VK<2SZU@A?%vCME+#43IA-F12?^q`)utPuBI`V?D5^r z*ByIC#-6;%N9^s{!K-{4x7qC&|Ngizng8Zb_=2Ex{wPccU!H}s+i^1IS<~Uavq|3N z3Ly`F;sM<{1id|ocb{(0K;#}T-skfOgbpl@JM zaL60|Lc`wdKOp?AfrBF69xO#h4T+A4jf)?ekeHM_EM<7=h<8SgN*kS?F=p(z@e?NM zGPAOCCgo1f%b!x9Hy8`27MaY&(@ILGzuRfXdo$lJn>BmR+z-k>{HS8y`~{UCFI@D= z;;KI_(JuXTS?}uQD?VGfs-|}Jn$OpMv2Oi_jdhzgZ>ewia_d*ywm0th`p-Le?cURr z()`Wdeftj_Jk)ad$kAiRPn{-cyR@MS2y^QiU z36;Vo!%+U^Q>y$T%HQewI+S;Oj`;;ShFr6?VOeqhs=IcDJH`2zt-r?Ss}fXKhC(Gb zWlhe}>qLR}H6QMU_UNNY+~P8rHerNvs`gbDR`mj{1g+NB3QlunF!|9h%f+Uqlk-1WgB;`s`xt6m?Dv-ohQIZtTX1ce{bZ1$b zm<47?8uo6Vz`*`=5uF^z_E9BljAjEhElG)Nh)ic&3ttLi`ze_r$-$Xvql&X*224pG zojRHE^(i#I>G^c9*o(3lp+Qix+OsSfubsXrBQ4aB9h)oZn7+k5O@~stZJ#=FB-;;V zgEe(a=+=0q(6dOhE-loUK04W$G1`(k^*7v_-_Up3$3u%n>4kV$DI*KBV@H`8LyAvi z@)n^IoB!eQmU`1-Q?iv_l7_HlYyqo5VMfe={LGM2T9bZEh!(ko)t`oFy%c&DY0;#S z)9ffeU5YHR-jgL*rbT5WOC_yJmL#p`mekYrF~!u6B~PYL?Ne2DHC9QeJPLJ1Q;T!V q0a=BG0lK0hV~NMKoFY@6(clpr);G9skUCKVJpZ6c08;3`h5rB{LO@#p diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..151008dc482639c3df72873480ca796f9ef31d42 GIT binary patch literal 3553 zcmc&%c~leE8o&3>OlA^-1kJ<*QK*)P>`GBVOPQd68WnJX@Ser62*@e~sE9eBfN>W$ zK-7l_O5LgyZ3Ul2#HB7M+M-xXU1(LbuUhM~ty+6$Qmj1nk9W>Huiqhe`+nbk&wQ6) zE_5hMvVQCsEz7W=Hvsmzt*PGE9rjKoHH$vr_uF{cbxzIf$k#s3By=9 zt8~G587pE%w!?uV(~VU!-6#Nn ziYBNy1=aCt=1r%(ID%JCG4qcIkPtx|`62>G%b3*Y%;eY^pL{%#((g->Rqos^pO6?1 zw`W~N1JncQ%?Xb1X^V>BJafq%sWVXa<3M=vNr!di;2@%?b*A_u_`T)Ye#hvRr=Q9; zJii959n0WQd9g?mc9%)+V8pp?4a|-yZ^1*SJ&@1-6|!`~qz(yxac;rGjy0&y?FaO>v z5L%OrB#+L5*k4-U&o>fDR6S4C;d_=`6+DQtD6u7?p2i?p6w1Wib^>>A9g(Bu04#ii0-h0jI*9BCX-?Oxi=EIJnE(qO-nI zLM@bnUIa;ezO#*f6b7i78Xs`4E|qRw^(AN+@j3cuaVxRc*WSuuc?mfzeo)<+p zOh6BlC}8o?Bg%9!o-k=6=u`PMK>FTsGGox>fxrXGE%)>THy}#+fzBUz9PCa^M|)R}g7xcVmZMwuQ`Xbmz_AP2C^aM!P8^ph zQGfL{=p2wWIBiNDGA!Cn&suLsjhz*U^jUgxPXQ}EH|Sk@Ozb%FOK~-w_Def#JD7)T zOB&_FzgQ0DT-a+r-oXIc)CHfGrjf~g27q|~?J#lj zJ|eCe5G6O=z}cKlD17K;>4CoK%zF_wP?sbI7py*n^ZP6#{PsX3xu>T3ovcOkRd;%u zT)z{y*2bdfS}klC@(eC*TFfk&T7$BVCW^kHY*3VB5OKE90NOU~fUOBp!0Viryz-qB z`et}Dyr^1$PAyPbEU_6)Nh(aV{puOLnmT7e3wb?UV3`0eZn;6M7ui`B%5C5}!x|#D z>T7r^qeSGqKNR@QyeZlut+S~5whXv@IKrZFv@P(IOow+K{D_i&wE)?^mWtf9ENp*# z7d?N!NIbDaN<27`2K!$thjvpmsC|Z-%#lwNZIzFOaiK=)i8p1f*U;jb%~u$=hEjRQ z^AjTRvrFQnP6hhLu$rnEwTOB$|0wwERv_G5UTU>-{|sVs>rOccJ1X(|NiM2>7A&n{ zWnyO6bL;(~HH_A|33Q$40;ikyk-auQ;J<>k44X&0;NPF0ZIUmolH@NwrZUoAjwj-` z^AP${;Qms)$)ic}9Do?5UqxO%!Sr?vZ>-2)NXrHWmHKU2P-vkX7Js>M%fdxg_8OCt zt1MXD$1$`f{77Zt(!L&1Cri1lMa%n*Ouv2k$ks(G?fe!nDovHCcu>%4M_p6Z;x&VV zD}%~3)l1enhu6lpH&rh!ai7^_Qu%LNR^}Ohp+ zYbbIpKahO2e(n>I+s04lbe<|3wa9H#edfI@2j|i5n-9%}M)?@`;>w(5Zbm@HxLX@# zT>}p1R*8+$!_%HbP(~LQW8LgW42h(GXA&L|QVVf8g&BrrYM}HH{Zz1yVBI7{2kodWu0k8t=)0 zZ)&_&FWPWYY)YKL{G2ei-UDx`Z`kA3`>Ogk7=2x=s%2h4DaweBk4@IIBz|=`pqCym 
z4z}7|ef)4s23|%IJDQg!l|otg2M)p(y36BWgZ10<`US)P z87xpRDqEHIfBdpi|25B?|I6i7|5Z0hvD!|-s2N39fBc12*(#-6Xz1iHAM?dhR*>#I zd2t$!<3jL{rwJUF&1vGe0FH~}{Nsc(4F*i)SN*buoZa0= z;>FlWLHyo7n;*yW16Iv(T7J;z_^rU_JotTL)UZiIqr#?V#sqjL>cT_gd46&*_HXWN zJP7*y1tr)COlEz;lIQi1&xi=~N{I=K=8qN!Ps(oi>f%kLL$-=z{BJe-iPT z#-I=LO4Wr=N{t8?LdW((%<+w%+S4DLEKD!?!>3Y{785w#z%#gUS{^Tevb*hvj1%&W zlZ~l6^&B^bUj`=eIY^5L@SYtt>;s&WcX$v^ayFlT?4r3S_)H^6V$X4On6KkB+2%S4 z&hkfF>8aWF hS+VIE38^XeLx;N!b@NbZZ2@@k2M&<{1ml11eg_+(_4xn* literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet deleted file mode 100644 index 4748c07ab9f5c77e8fc8c141ac748fc7bb24c158..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1354 zcmb_cU2fAr5MEp>P^lFXV9Q1p%1bN|>Q5?yDB^)}XwnpuCJ}AoyhKiHC28UxVh0x< zH~}X>Ja7OGz!^9R$6#hHm$(v@P+8gQot>F)zWF9f3p!;UJ!bC)Wuz~!2PxM7nZj#IL}>w9Itl$ zb1es`ey%X35RE3TXGj`fH73Z{La9DH*~53H0XyBSIh`IR5x}XXhCn z4w{E#R}Mq#*RFBU;keD5xzY2X*jPq)k4yQ--%cJZ(ck+By3i~a_-7eW*O diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b485d3882928aa5ca25ac11f280ec415a3432930 GIT binary patch literal 4311 zcmc&&d0fp|8-LDkxl8M9`Ca#_AuYEeS}5z7?k(*|p$+ZYyJ?Scg^40jBgztGX`zU+ zm8A%cZDflv%otu`3`UmJ@SfjIhS$s=@8|P=-Z}L<=RD^;=lMS0=ltq^%TwxQ%}`7b z6XwCFF`x}#)p&aDzJ-&YT<8v{NM4#^?y$LRI6rGhcKO+pf#1nW0_PWQbvXU@*XEzE zTr@P?r}08}IRlK4>2JXZ@E3>S=-YuAuP=s(*XIQQs%8Z41&AcGC3ccdq|Z=dTi^)9 zB7R74d{|&)a8e{g2nE8sNcby~WFrah?}CXfaMr*OfI9&m0I;2j7+();vNY@|1bhS} zpGyIy&I1}h9Q>%N^Tw%TceZKu4s4812#9!nJ0c!>?<2&dK3~O~?@1Au)X6uJ68Zxv 
zGxaceXi8jod~#q|aB{GE6rMzd1%}FFQ{v*4G;0XqGD5P9lB_26{|n5&!Jvq;X7Tv3D0JQL_>F^AB>x@dd39G1`01(UTc zAph9}z)}aEX%LY6I12pt{sEWYRsttt43ztdq3o#u=B<;W1&L}zw0I0O?Y4r`8=S!~ zZYVsP{1#oP83qOVB4plZ4Adh_;K|*=K9hpB8e`y_8i0E|pjzx5`p5PC5hK^PIrwfhAlHDS+E=;~_oS2a2+`z`N9w zu-3T(Ho1+5JL?G4z2H6!$^HTit8L)vvqiA|nkg8^YopH1FH!%ndnnlqK|;I{IIWxn zi8WJ+y2x58xAHB#su_Wf3@uXS8M?#vLEEXI{+qyfrU~V_r4t=;Jp=8d@~Nu8Hr2tF z=gAeW=3ssG9>h^+h{J^kP?zah!bbj(n||>;v@E=dzAJr%Z2Bb-CGs%nI<%GKn~kTd ziZ4RVu{X#NJcuzTtWc$^I-1hl2!|z!=y+xuvG5QHdFcT}oA(5GZ1ch;dm>)QYlK%Q;8jLi{8-u z1ayZYdea*>LTv;=rZ5fQZ1fshZSJZ(=om@sf;x=)`Y61vas_q22@pGADDh<0A;|ge zD0#(uJY+Y`CZ0FY=+eYfFm9X)RX^}K)aj-Y)me@(z|kEVFML7j=5-^J#d|?>LO1*z zSOerZD^kwb11}|;IvBK?^CmwD{?)Pw)}5@S*KY|RO6sgpgXtl1c=;dXjI;O2@ka-7 zjj~dSTFYx(#Y{W;822|i*2<4O4M{MjI20CixX{{}8DMdsk)w4Dq3172fZ%Ol_0^YT z{|q(o?#V&--H)SWZ9WP+I*z_!X@zLBnS|;1J#^?Pf7-241OB`o3Xh*+R z(qhLda$dVL988=>G`KsXHA&fIznRy`bj{l+F6u{O&TmVIN3$1D^%Ff2@BTOo46 z<~4DbXwsaz7hP0ZZ3a4@Jwjb@e=wwt_TtRBml#@K-ivVK@IdA#Iwv_ zsiJzkfr$2hhJJDz$us`(K8jSaP&vEDjyrwTLIJd2FycYHIT(+B8FTH2Gyj={*}l<;6J&?>diO--)4!ZS%yp#-F2B zc^c8GYxT*Ho-hI}@!~~17z~f@H>n)gJ_ScTH*m9qLk3igHHG1KAJaO+^HC|+hS1Yn zNphDw0hbM#FgR-m$0KIuUI8uVf{&>$(Yb;nii3m>3^WMkmhIyv{^ zsw>bivXwZTf05ih-ayl2#ahZ{$_>&_r&uLWJWHj_)P-yv>%!2EGX~yk640B3_H_P< z69PTkY?PEFREbkPKu+ygOGZQj`e}KPhJ$tE6 z!1SnCFqs;Eek@aLSP{kkvczpW&%g{15^vM*hfr7E(CY(G}Ic&Vm=EL$Wk`zA-n%&Wq;sVr-GKdX?_Mba%xR_KjN ze9+#sW$DU+_KSF8+1BiQBbNd*zvitu-zGD5zggd8;9m?}Ij!pml!S&r?WN$~qSR;(&@$ksOZKT(mt#d(#X z>pI8Ef^F_aHc>ZDRDN4Nd9y61|Aea5mEILzmHsVNg*$w9hqSJnuwzYiz=6b$8!bE5 z?g~7%h_C0gv#2)obb)!m$(_Y}Bfc+lUGG%Au0E!HPt*@5tJm+BCmd`2fbMt9bb}II zdt}XL=s6p5FnVP>qcEO!{^mjN2Q2Ow1z2tSK?&xm1fvy^l^E*!Aq-W!{%TDh)VYl; z)H(&|R~GcfQCCoh^=JiDv?&0diWO=t>mAAm&5gXeEmUFZ9*ZNq+!UxXbPjfDSx;x`bua{rbi8SDHcg3rW6kM7khYmZd7TOMWc~ zam>*%pI(=)73#D?Z**$sYppPsd?VL2BQmtZ-3o{J?Mv>`p6yXQJnCv)hEBw!jbk%j zc6RARdX_rm2yN%;MtN;>FSbj0qZ>W7(z|kc{ahwydUe3T)E94I%5eD0potU0RA!e*IXC$1giG`bBy+X4oiz zH&s5B9e(U)kiwt~2i&(Sus-4K3*z?|grsw@K11~CN0(2f7C$a3p4F1!NmOi9QZhF@ 
zHk_4VSaW4+2y4uR4~!(c2TF3ethoI6N8&Gql_%rq!Kf%zl^L%=4QND#6Es$p?N~_$ zzn6z0@5j%TBtD^RB3_>_t0X^P#ZDh7$*tCsJ;VM^Nos!3m9gRR5y`!hxd#b5k)&j( z zf9zEfHd+b=5}vTlTo*vBFH)0wc}?~i)B9$LHdB^tvz5xEQt1@@u+3d6O_RzZq)t+4 zgw!EIxhBI76WN)ARO-r(*zOy#rJN?Q5!gV|$#aKTT8_e#36oq{+aZnZC$Jq3EtPt( zoy?ClmHSdF){Y6WnPeT}Gd(5DX>^RAuU90yALohl%ZSFEGMyZ6cY zB!3^<_%P=PX)t?U36JUE=GSL?O`FErZfw^}U731sJ~4PrmK^M3EBEuAB=`4KQpb2x zdil9c{WKq3EZjcIhb^TnAK|1uk=!>BRh4DWowY& z?=(6(#3mirWCZJki=4*RAE(%RDR@n$Jc%>M?C`#yRF>9zPs(K3Q`kqUtYq&YCCT1( zC3U|zrxc8%I7|U};(uxU8z)yG4gdfE literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet deleted file mode 100644 index 9d247f8cae7a335a4ecb364c654ae9a66b3a8c00..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3450 zcmc&%cUV-%7C&>#vP%)vz3VQnf&vRD$bv%DpmIS#UxQHe&M zMpOh_L@Y!FeM+$gD{907@g<5awy5~<8B3y>dsi%p{_(!=eJ}ID%$%9ierJaJ8^YNL zcZOnon1LEb%s@K;+iAzA?3oqz= z##q10ST1hek&DqA9T`UP*Hye}<|LpXNHQ+~R87=d7)}=P`nXJ8LRv;_yf#B?k&GwF z@v(9G)J$EP0PHs;YeuR%Qq@c-|BIL(h1+P!T({IPPe}psFD6g!lL7N4LW<`c*xtxO zW6gMQS=SwITr`I#iyYu?tuw@RlcG%}dRRC{3Jxo3L4VBwz^y$zZbLx+gJg&<{~a#8 zDg{wuUnm)=fc4E*Fm)A+a?;I-6h&V+v_lQY3j?96&J`Yqy+UWpdcabp9649p19jgG z#QG5MnMpx?nLS7xZK1u?2hOEf!QyByn6SVP1~;a_=xiI1EN>4Dq1$0Yn=I(;whNu! 
zu0(bNcZc*?1)^{A~~T7bL*gZC&9^j0lPwccSa22rBB# z;BwpcaQILcpw_L|+fu$}q)Rvcn%k%&Ue<6b1`Ocuq7s0jyEu6S&3CGUUaK1>5 zf=kzdz2OwRoEiXjciiB@xTSFQRT^YxL_$IC26#Q`D69zF48_6y;rc29z0A1{-Sav@ z*Xu8$V`3hZl_z_555ncQum@w}}keN&s1lm1= z6CO8B@7(>2mVyQJo^}A9Z3zO4D1S)p>`FWuQwj4=9w09c?GJf}#uF`7G&(={7a(3l>7tj&VNN+%`oDy0v1LGdt4TfM1LDj)YTwvpq8CdH z@N?~KSaozKy?VnaqNu_hRXJ9YJxhKkN7vmZ`yc3HVwXFK*y+|_G9$)|K4|igPE|*d z$H4%7SH?lk-9dD_nUld~Z?&jx141oNNdWCPpdQeVw4Q7ZLw}o(ZigH}8SNw}{(v`q z*-edTrx?Pq|1LW2STr46Er+GMMj+=4)^zEK*Jx{p6Qs-5ugIy5fv_)qI8hZ6h`u)D zksV@wAhRv6B3<%N#KebR5ckLDP`d|*Bk^r-g0tL*I_UHS{)ClDn7twwrb+znyDMA= zImQ2R;MPFdD)2zQ`W~SO8k!$qp-D`Ma{zyWDOzMAmYABETePvXYTK^8l#z9iE37*z zJ9W0P?b6lG-a+N)hi)#-zwYx{pUY=f`_3qR6bMJor2l%kQegpjj0tW>JhiC?e zhJ_D_7&>hDh{%yq(W6GkjENnq9Tyj$Fg`IUIb}ksE=|u(OgCg?PRh!jJZ0*%=`(U> z&YC@EZtfTJ^5!pCnE&OX#a}I1`j2JHzg|(Wa@Fd>HATg1*R9{M@taL0n@hKB-L}2# z+Z{W1l~?TE^UuBeDyyn%_8&NS= zx^k6k;^Wbrx!JU9GY}`eeuEFzg9GJF6%NvEb)f}kb?fKbC3k!|fV(tDg&%6msqbND z_p!5I&Of+#Ny5RyRd{zfHiJhr2b(K7FmJ{YB%JcG5zfC+6SUAx@Pux%|Ib?_ojCKS z7y->GEq{F`0O&Ww>TcEZn*E)w!Ha-vAjT=G>N_bJ!pMJvJ|qr_E;)_o!UO}Anrz50 zNk~oLNd+@VlUUZQkj?F&s+XzG2t@n-fq#@H)Qnn#S(#@=xwy(4Xyl#1^A~Q7Z$^BT zKHzUbPtcHyTXlIh#MXaW#o`-ICkS#Qp?YEaDKfNtOO2@sX^9y|p1+J_SCHzCjjGTF z)8Sv;ejRcU!bpd*mE+7-f5peSM z*grPqkH)A8a$8QDJ~1;P!!?fMT(#-x`YhW?3F(GpeVVPiR}c3dYK2A#@b(9)DS&YN GU)LYZD$;EL diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0da33db3c308003647b082db9d88177f6ae33a22 GIT binary patch literal 3765 zcmc&%c~nzZ8vpKn?bU}~p)$}v-dD@r~;I>zdxmpQ$?rsM0{*lD+e z(|h;ap34EJy2kk9?_JNV33TAdP}_0AxOc3kfGl2+3<{8YBor5-|-{ zau#HAc9fiG3DT{z6HH%8Tg7(ms95?J9VLPC`YPG{1qvomPGYBa`Xv;NJi!>BZAeVZ znx3G~(#uoGASGdXyfHP~kY)k50xO>(r`wd%BFg^1i21vauiJ1x*o1S3NHGZNN9U-llfcAHuQEaV>f;RE{E#Ho_-P8R*!;3%H=3g0J1QXmzX?Jn(A6s@=iV 
z=4Kg4*7bn8?I+;dl6CN6MF!Q+Egz2>>CL2+*@=EQxgWWw9HBQAibRZNDjqa?KNEi{ zmZ>`W5W*T&u(>@9%Z6hrn`?mJ0ngyH$4%+odm|WIki*CYN8njaD9CkzklM!;Kb%$% zOV1pkzMME3N}6Zlwgv`W9DfSR_D@C!`nExx?OeR4FaY`ljDyC{dsAgD3^*?S5Uc}# zg72p91!|O*GID;vW|z<(&sZyJFU^G8ClM*b`6!P)lr{9Cis-cL;k%Gru)KNa5>Z{viS<3wjU|TyAllA zF>g@4a%HeSvxMq7^(r;b`deg3`3BGVM-g5%ZwkIYGoL;%J_50KM&Wepk5Ny95rcjQ zZ2f2^ZRT2-k3JmF==Y^6s%!Vyj+-5W-ksZu`#AL%Z%WZ1bBmhdhb^FMj8k#W(_nP* zr)xG37oCz6ou46jA3cz-wL2jWGg-?jUR*~unsVfBTPl)9-)9ZiSjmFR4Q$QCeyH)R z7Eh_Nl4J}WEUL5Kga3BT5&tE?)+XKR1H69PJvMrcKa;Ae21l29;{;= ztO3u8{U^E}Hi8}a%^j31b&;Oi?Aw(U##GL)U z9-Y@eg_y%@sU~MRYQ7MIFIdO-u$ENgt+za|^vP$C_`C%@yD^)_JMz@mMt?@Hi5S3G zZLp`}%n2A3MX^bDo#FnSX6do+r{G9LmAFJ7*JqdSAQ*P@0n=kxDcUObz;<@4DRI$5 z2&q~K&V{w2@T|Al^fr?u;sS;A5!=nSf$vk^ka?%C5Y@iOVH)ONsO^FME%yi z4OCJxpzF(LbQ{}!6~^xvuxl%ZpsWy?@33IW&H8Gou1C3ax_X-Qv~LK<1ian%RZ-tPWB4R`_$yhmR^0!vfgv&=;au$8oqVl>w{wpD-mClc|#MSn-*HT_78C zl1>QdiLV7~P+dikwQFPn&9+^p*sHxH;u{}QU+rnZ%6&$*_pTja|1UYjR-dED&uEj3 z`;m3wnxwxz0JM;XiWU+jvPI<3hji4>Um!G-fk(^9q)AFk@&U6dzby+3D7L#MrfEyT zVzHus$VR`diwdn&9?4glwk|HVcJMDz@zqO~_RvJ_ifOJcT;5X~_vuD{ThU6pkr{Wd zG;b?j-PdmstM=bsQaT`{Ortxxed#-cBC12m{dX)|Hza0%^3|g|mTz#IakNA|cIS%n zVaXSE>5lDOSvdl5liF_!GQkbT1XYZQ*1k)cFRu1B!79$gTKQ^iOl;1hyJ04v%ii@e z0orCGCuUm{ieEYXx!2d?PlJ)a(2wmo#Mv#x&0;CFV8uky)l zr=8dCirgMn?o;Tzu4dBi*pKHkL)P!s)y=wEUpQpL-l>iApSLqE8$Zw=U#9jgaw*@R zc&@@bPvlxrmvX5_*HGkIdC<^WpZk+&Xw{)PH_w#&77zVveYTGYR5lGH858@xc<4|9 zsl&z$KQm^9O+!A|ab}unL}pn|6+K^Wh8vujl`mkBY2Xkc;Xo}(Se@_H0g`;KL!Pgd zR`?dTTyM@^plAP zH8RnWW{(b_mz{IW%i27AIYD}{_(AnSgQ_9-Ec+7agl+XJo%Z*)=Wr)Y2zm8wl^9ae z1j>+@NvEb{W{DG16NPt|JhDwv<+8rYGj>joRZe#;gygsP@poU+2rl3y^I@-GU=XcG z451J4ggV3_;T8i-|D}JOIuHwssMszYfjYdtN|b)34t|6MJ#zdb>R|nnNTepFC1rIo zBQsJyMoz5*oy-FWd3fFbA4L5<$m$iIpb}-qCnp;898Cz6BmNB!bF>fN-NTQN3h>0L zSL@D+I+b7LHiF)-C&FP)6h2C@rd5{H5x5#A5xP#vu@~|0Ao#Zg{!g%=l&S62;s0Zj z_$Lvy|H~cz({7UT_CTe~iB*>MvjtGwt7Lps)SJ;hofk{Bg?1CfllSL&K9YO|8OQTE zynhlO#Pdn~*d)uEKT#MjjK=bOsL+#nS5Ib^VW!|gaKyhke~6pgNHPeV5F*US<_K+| 
z&=A)=A1*ZhI$>(r=e5FocAUoq_qgcE*$F}3vvo01$-=%Ng5;Mwi!>I0V=Ycd5=1)v zS%$*iEB2YO(Vl4u!AZPcu(yOKaER!-=24R-3G*4a`cy)&QLWX(TIAMI(>#Y`~9 z##o?}c#)m>!rpo{A5ttLzsW}^rGI)t@Z>CEgAWZC^esv`UGXDBE%heFCel@Uo(~X) z!Kp$G(qn_XXT^EUBQ+T=D3K!P2=yl^b{+~@^S4-%%n3TOujBo5I?u_HtYAfScuOTa zjVvHL*DcU>hM;T`$AVL^C;o}4a5SBopmIppXUxe?%yNxSPj}U4WEgWC<|bxjrWn&4 Z+`Wdn57nx}?EzkXpppDNiy(hk{|!(~IQIYm literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet deleted file mode 100644 index 78e853246b125d8723fcba4db0657d7509e2c93c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2667 zcmc&$dr(wW82`?_cb8oSLEtV6ZHf<8G~Oc!3=wyE!-{|@EQGkrt~}hA>>_A}SlQqV z`5G-49H7x05mYj45KvG0D+o%dp<;bS0j^6~HBGL94j_aOJu=f|lF?R*;*>0_)03{IJ}G6|zEE82F0pGO5U} z4FF_YM+x#(G*JV`8IDZqmZ zo&X+Ck->X}gtmz&j!97}vP+~QyRHPM{p2oo@K4$lNxOj=P21N=Ybq!+=#18Ut=g(~ zD#lE)How56FEbeVLI#lJERwIM<)hy82Ei>9W+&M7;Y#a@jtJ)nL-X-Oe4f*f(0RyY3w9 z?Y)Plf|82-Gzc{(JE4rD-Ec@=ijGyCr`B}R@J(P4+K>|lH^N3JX=@_gc*GIJTiu|2 zM;{DVZ-oanrSz1*)l^h?grIoy3&Ly1_oLwA9;RWfP$=-tqh>_!7Zmj82(}%)2`SxD zXuOv~InJi&GPVm6r`>_UkRQigxjIMS4o(ohst4}uP6j7cJm@F+Q#TiN!nzYZ^q2Fa zq58-YYNShmKA+nUoA)n72PTg|yZdr#&)PVc6qgF!XI`W?`w773t&hPu{!jQJe=pEc zL9~gD0Z~~sb8zt%;k`{3__^;*nE8d5pniKU)zBV{x@L6J&4UlnYo{*L(LGZfrmbC0 zwRjBDm3d(Tzm|=(K1f9mfCVD!3SjlsB!O$y3h?Xb7Eb&cp^@Kdczq}lf?j!<_FCZx z?^&v8kGu zIL4lW~BMeK>&Q8xu96 z-Suf=4f9KzSmRjUZR`)xfP0;ZICIi4l_$eAAo{**Sblz@jHPUVkTmvXi4Da@0uDM0 zqc65t9dvpfd3E@S_0p0mN6GQ=^4baV8XkBa?Z#8zkqhgtWGVg;iDW|rJ1{*74|a)E zWakJx*!hRM*mx8k7?59de-=DAkAj6>XDqbZkgfwsdXfB;n|zxa28buUKO^ST3iGT9 z1*y-JeB`3=szbg{$FnDbGu2BCKm}K7fJ3ry90R2q>fx> z=De&(+r^SLF@c0aIR(dY8Td~qmE+1eMIo2KafMuLA-|`<3Udh;%W=tM#%ueGS3GMW z9*84u-pZE(1H&;BpO!?{vE^ipCj)lPaY`~MRAkA^xgfG$stHL8)?_U#(ELU-g za@#ra$r3B9!||Q88SzEh_W9CPhJ-R4hp&^^WB=Hc9F0vAq~2zA>9R7N)xW@O_E(pd 
jn##SG>q;%fCZl(7SU_+d@*WeNT5o*FZrT)|lKlw%4$~03z@hH9_#iisa z7R6FE2VM5KEuTGZ1pv}o7(H?kh8IZ*2azg82+V*bFW_^6gW^L2B7>45DNLEG+>4Y? zAz`GD{db`RhIEcVVL%V^^aMcbfs~UQodc7zF@b=C0Ov>%K+3W~PL^#YPg^mfFg7u+ z%9P5VS@YTBCww;gw~aDg$zONrLm6F3B^Zup4`9Lv#QRcoWkVA}QsTnmlLJD7l7nQU z=tWd$KuAJtN?g38xLi#6cO<-w3!h@D{{`kJOWr?`>YeCDJs~Ake<$^*!!)gP#eT54zwraz}emw}$=b_#839xxN5421AA>k)&03#)MHUWd&U!uT& z?_0S1t`S%fmQe32fL%}JVbu;1%1Gp5(E>|2y2lt!eB}rlans9d%IRS;NI>dC2oOhnP*a;64-$pRH5_PO%bPThIzM6VhOc zQ3pEPs){rZHvw;x6KEbj3*xSs5dTCO4o&zBio#W4^B>=UeQ6j3wrIe)Wh|&1YDc%E z5pPTU?zMKPF zJ$NAa+7YrI-A2(V3b2RJg19UW6d01wzjrSfu5kd**DUDD)dOKJ3tCEp;N(p?IDUzQ zOFNB`OJg-?C7p&hs~kY{fe~B|E`Xcw;$da750qxrz-a0KlsGm(rOQ0HwF5(MGVX&x z_GHj#HHD|oGGX^M9ngwbL@%m-Ln@(nQL-)qal9rt72n#6CO#eym$`!*L{ahRX;|i6UErhgiv_ZU5j&c=aEfi7on}^HPQe#%(BlIH9E_p zg}sO1urLuFTRVuY>&9X0%B9$#w>|t~{v6}CIO3H@xnRE62X)r=!_Dks7|l(@ry6Bo zRu;3#sDeqXM*|(mBpV&Wex0a}Y-gtlI;hi)&!~Tke|F|RKJUm>Db1`@tlj9ERQfV=vPbF(8Efo^ zpMWH=EDM2*hfbv8+Rwpo{~?ybHH4o3h64oEf${vgxXR~T@cttQ-FH2Pk`*~9^oSLC zAaOB9>Mp}{=5>%E$NkBIjwML%vI^OFauhXBJc%1NZ^I9|I-*rWj&N$&h?Rt&A+24s ziI(#OZv4DRWw?4SCr7*i_2b=A`W{cvn#b3nqR0WCto9CA@gq{}GzOqGCK20{HwSfZ zmzTA=*hMNVo(}zkX;{L-0OanRA}ze}J=~d+sw4L9#F96*N`-G%BkjW-v7po-qU}qH zdtP&{lwFuJafMSSC4M!Gt{zy6sh0Hc%r9?&6@z=!U9@K)e%S#2?V>u|Z_@*`XfnZ` z_tuZxcIXlMqf|`9DNTV$+jb*sqdvn;E7Byh0$6{DFNfj(u zS`$B;l?+!an$X3lU@$xHMqIzAkKHuyLze`Z*r`l`%qA5pQo1TzO1)s&gx)*hNQT#q zGG(LSQuQr-J4;PAi>HE$lMC_Hwcn#tNhK_UE-$bSyv?fScFNS=Ed#^GSu#CVsxU)( z33~YKS1j(e45ZDIRbE|0q2ZU0vC-ulIPO1i@n^@yX!4KcNX^>@8(tk@#Tf zXwAkvHS0{az@|1kU(=~T*RQuWr%=apms6Qd-KJs#-;T)P-nz{tMuENA0^8lWWiuiN z8~l##-cmkGeCtG+ZT;4YIV*pCFnp{&uX6rcjH6=Lu&vrQTT#!yuOYw2ahrzo4!g#J zI@eOusOx==h4mg)HaRNxO+}5~4PK4@{Y}NqK6`>s?yzs(-nw*u;=}9x%_Uz4^ki}- zIkc3vhny(TTRPBEwm0I!F6VNG)*W3jLmg2!23pJaC5U@YzDM`#(fLY(t~jgh6X-cK zdgG96x?%pL#pk{|FotZI4#w3tBygS*I5PcdZ5@NwO{2Az546!}zc6TLEA}Of{McVO zD{uMSRWG03y|gbcV1C9Zuy_{R0!8bk6oiZRZws>BC^yxi$Ra=3e)B|=@XP!2Lmcze zL>qXs3qqX>br$Ykbf6&2wZtfBplEhsxO@4m756S5D2(u^oWG7mSr$butZ~d2F77Uh 
z@~-!&b||(ij$YjC(;j}MyEw+@YX<#g@$$KQB{%yu|&QSl6k3riwmqmq)P!eYZ1CPdSo9%JG$Q3(l&ooT3uYMq-|H~Wx$6k_hi@q{f$X2e^;Q(wrR|0&kS^Ek;si1Z2TBiaxOAv^q64N1t!5hOHx7|X28bWdh7Ms$!NtW@VZA6kzjyXw+9NU+a?ic5fLgPs4Wc=y)v@3HpW0t^I z7Y8M-NC``x79ti;3rb8(NK;P@OH7JNh*vi;pKdbUSm34#@csjb2mnv|KL`H;bMo-^ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet deleted file mode 100644 index 3704000ef5a4c06ce7bce030a206d86afe278c9c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3289 zcmc&%c~nzZ8vpKFLO?blFD4ifSpq1SK!qYj^nnnTh=>b<3x-7&A&Ddi)?Lfgg`!fe zxPrx^pimX;C`%C)_Z2tlI67*rTiuSHj@4G@zL%m>^^ZAc&h$IU-M;<%?zz7tmW}ga zD8`9#RpSZl0BnQ$Z7B=TMm;?6LyYz5Ne_3mGr-s}VlPI(*y681<_{jiPM$>ADGdPT zP6QqW2$2H$b|hmcsUHYzr4sXGjW#tYLu1Hb2(dt1j>KD#JVGw~kR_=X7YjoG9&GUh zfcHwKzZ$#eLX=Pl%ux`D86453iIWyFk~eWmgq<9T5d9$|#<9P9ikTL!eCz~4x`Mo# zdL6unRjMu}FDFfFOiI-lHCCB;k(rv5qRY<9(emkBLgLFvUP{VO60-lo<$IHSX3fmE zQ8QOaKF=nzr;TLz=TjleZvoVFu+Y&w8N4?3gB#~9;c2lO+-dWKl)fTVxmgEGCW^pq zRU7E8xdABb;juLVMfWoyvF>*`_p%BE>4C6$tQ5X}WD7IauxL)MC6Of!gx2j!IJ`~; z&N<%jc=$_nsnLt$7) z7K|_G4dyHD;bKe;R9NSOqhcpIQ6obx&D9`U904xPCqTbz0B9eGp~-qYte7l=CBJX`3^goFIU*jveT_1%mqH-QZGpd)U|72dMQ!L0SI<3L{-0rgkGpt9_uH@PPF_ z>tWWbTDS>=;DU-Fu7CCk6b~1H^jj4a-o1gc^18!z$`x`7&7s7LhPJvo@R}74vCjpt ztH=}NMFQAXs(~X{ZQ#&Z8qRJ|qNu8k;A%JqFJ^>;%WVamOD=({FSSr$jEB;~3V1c` zAgoetfwHJzxW0x!FXr5WehYhnbB!-NdORO0FS>)P)($-_{{=au-b6+Z1bVFtgfESO z+}cq@eZ~%|sOlyByLJHj-n+D$&^a0^`&3es9Lm6Tf*TcE@f0;g9)pg65t}badlAbB|!U zdi5Eo-TNFlgPI6DphQ)XR%pb&CTN!DqWyE*iFpkq6c>ym+DC@Leg7wfWSfdCYqbRb zx_DH-sSU0!TnVp=a>+i5IYdyvKsvLer{M0voyaG%h1yUk5R9BX*rHV5(<0mP3cYUM zLlAi*y6kxrVL5;x^O#0Zxjcg-zCT*rzVk6H0xJlZ*#gh1Bf%;m46+@)iH8##VA0VQ z^8CnPSlBw5c+yCtv%?NSP>>t7tM?PA7fmB-3WtK@&}e8n)r%Arzd&va>cA%K1zbz2 z1u{rU>X;!Q%v(t9nY2Rie7OOBYMT#h4(_1WR*WMy)cc@D_XhH#&A*f5kKZDLTl$!} 
z6iy>{C@z}Kp5RaKHG4p3D-+1WV1U5YDKO_w1Z_8WI(Y4F5_G?a(34;AyQ299l!FJ6 z4%01R(a>dH8R+X+^ zvv%G34Q1sUzpmKyP37h-Rn=R!)zp5weaFta`d#1sefOS*#-`@)TlTi@+kc?#;Gx4u zjvhOH;^e8*XWGx6JAdI~#}AkOarvKDuIm5A`J?rg-DX~!jr5}HH#ld$dXUJmkG^Zy z(R#^`6*qsnwRua39^f{ur(#>X>qp(e*6w0!_s;%|t=;FWJ-BjJzO1K=AxwA^(e=qT z&&kws*pp_UvNH`vv$X6q?m5SAcR4L?V8qWv@+L-J%H!JGt$FVQtz?w1QApTm2{Z~y z1aO*fiw6$8-~B(Llxza+yz z^sf02IDKE9(jq^Q2n;D1X*n8(!tm=xyoKU!GW9n5AsC3kkC1dNHX|?zcg0Cg?oAsQ zf)S{LIBiP8w}eA_0(U`*?c&6a95ThpEPnR$@Mwd92PaXRkVahLRRM&WLu`s|o%5+Dy8KYX5?KDLil zxz?B@LE@y>D?TwT7?w6t79rcUeRzf!7}0lfY|69o{9|J3{k D8Fh$h diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e9af93b02ee386e9ac98f1c9aea4130450e7a99 GIT binary patch literal 3781 zcmc&%d03Ry8h_9EzHjE6VMd1K8)t+RR0c$lO%|1uZ`c(LL1l4)VG#%rVnAGK02Km3 z7R}W3qTrgAhT#&JY@(S)38myU^L8^tE4`0(UBc{qLu6j-kNZ6LxgD5u&U?=L{@&%B z;rs~aBiuNebLA$6aUu@P0aOdToonMS#q#;GaF27~Dhl0~HoxCs)t=7*XUVCYI0g9; zaUyFkaA1e_RP2y807%DT(kl?AV4S8MS#z}76GRHNN|K;kk`$M!)2DJ+sZbt9$|j_V z*06sDrFJ5*0f&Kf3Nio?xe#&DVMG*y^`j|}AfP2;8W=eXBJwdIADNu>l@oQLnGFsC zhLRicBzzS+w4-9FUv-ourDs>krip_wB}`GJrypu=(!b^8AWzCl%*#w(k{g$#%hk!# z$RsT(E-@=3FLQ}8%Y#_C7Hck2nrqnlzwr5ox%ZlJUzvq*M@dyk_TREUq|-rFvmoc- z9Nd`NKv%r^Gu%5k5uJ42Y{HJ70efus(D5U-!`Ov(bokC6P*dnR==G_g_s4acIJ$IE z>q5tY+r{gUNuR^-Z#<43*q_JlSvSSYE_6X>(G~Pz-7VxkJR8?$CBcKHU6jOODzksf z1vuFH5RHZ~Jo%I>dNWjxX1AY!_cb}Fqp%wnHBnHxY(DOe4218z?qbzj!PNFP8F;-L zg&KEt!k48R;c-O{W$RLaeSIb|Y3r>-H%}i%ZfPxaZLvrcX_zM6oNXn|7I2ZjzB>7cK{<<0nLBmk_%9Z*tjmyTNtZ^VEpt zGKd@~Lw1`wp>IY9$~Bjuq!wT1vWqLKiuD4?MhjpZ7GnFUhnd7PvCQ-nDp-Gb9$N3j zz_?FGFmIlHj1CMxOF12=rdIX_!+SY%ar2B|^t!&38ouyzYMJSmC^PK~{L=R&_}1bA z`pC?1#P<2(Y}5D9@XRb+cWf~|z;!Xl-kr(l8Zs312i~@vksgiqF1>F;;SIus(IwL8$5HC4|zLURZ)hV8CFqOUW_!=^E*n-?FYedqh 
zTWsc4R`T-2W_Ew1Ejn?|70=&hBFULBPSj}nHvaonJN!z3rCGMgJNWfQ{cKdVmdQ}n zgPqed@ls|fG(3JC&tKUlE-__9jXyr1^Bb0dwuHA^@)@+5! z&qu+YN+)_$xGn2aQzbR&XvQzb{tI23?#+(6*td_wril)vUxG4TH z3Yd89Wcao*OMIdFGidfci{G!gK)o~7#?)@zCfa@WWvbpjgz6eNu9$qJQn6j>$8-cG ziL5+J(Z#^IOxD&A^_>1qR7xtKYvu97{LL#d-jIo%uV4tu4UtXp2$p=^R4=#b5Rb*qX^ zRPL$0t#zwkHMP-}s`&afWfl$*`=Zt)N%$V+LenFSpiB zTEQ6YqLUvhICl8zu|@?Q@w(wkz%wRoMitpI8=QH)E4)a(81{r2Pxq z{ff;!*UlLixi@@w#F4dg$44EC?_BR$KF@jnvFz*JN6MpI;#vwB3$Kc3x5UIV)pn6b zD`GrSKHd?y!E0TtXZodgl6#J>o9~_VWoxm8cjbaf`o2$gM;@zOI3@q~wayLRRg0cq zdGF5ko?}&U(+Y_D2?e0)nMT|qOS1x%^f>WYbO;q#r^geAtnyQ{0@}j|6~GzJ;4~~~ zLrHb)ku}oci+i3kfQ~UZq#y6n8If1|2-Q*8${*2YZs?JsXIhyB&po1N&q0r5MTZM6EW^4awwCSQ4OQKyFaEe$#WzWoOXOdg`H`(sYlu=I|ho z2zmO=m1L$Z5jaJ%p3X?q=Zcdvl7;t}Jidj)l$B$Y53Mv;RGMBRVfp=C{lgbFoC^r! zq{ipfflbE1An28|v<@+*evpWS8xD~0>-a3C=rw)^2#aEMzv>8-;@MRK@`O@^)SGFl zHGf7aOn)O78OcjhatEo?08;*dGy`Kb+r|=7@htwIc>SZ4)fJwg66q6DlQVT3O&FFP z{vA=XvkrU9!jF&*@WiU8>&}S=m7nH3g5Hx8;V>r(^A)UVmGN{0GKYzWp;NLOLGWz^ ze52$4Ocs<1wY56zf6OBPA_Dh+xx;_iO;X-;P%1R6^02KXfZAFm<0B$oh?+cju~a)~ z7omCbTAt@;laJ75@O(b6P2q!hK85#BF|KKe!c1Z2&-01c|J_&v@wEc+~-{dIX%IBmRmy9yu75KN$D}s5vjs{W;ls& z`HQ48;`tjvND!nA;u)vH-c$Da*eK5>Nx><+POvw+C+UzDGbBgMoh!)Gg>Eo)W9WnN zq?0vmt}eeUKV1Co*JRJ`y7HPkWM<}H>J1KZxuCT#}h6(ycrTiiP zvqO#bCdDRgsPsG^AWVZZgc@YW22FY~!F?I2$wWbk6ggk0KM8U0P{^9rXh|X`*^&Ji zUYkF7PR3vbE26_2D>-OnOmcADn0icRP#*DP%u}!@_{3B=nn6ua*<|Z-Udl_(bxzFA ocGl(OWaZl|P0rD$Wi7FB^P1o`!Brh*4e;A9oS*^1$v?2a0DmGVAOHXW literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet deleted file mode 100644 index 0d95d679984cbe1a72e4410ce8b15f3ba58a4945..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2845 zcmc&$e{@sT9sl0@-b-FeN=wO0lM*lm5{kC;hqROuN?%G_+NLFKfs$e^X%pH&(maxs 
zw#uO3a77WdM>cdj#T^+N=r{v$M@qGbLq$DK*$>#xHgU(n#546AJR9Ps``#Av-a_{}#?~nWaar3!x%Z>RGFQrNbmqaB<0}w6qa)wu*X_0#+kkTY$wj`ohBWWx} zkR>K$rfVwz&{-*tA|WE!pC%TGHy45?)ojwWd%JyYUEXMyM2({HF=E_J?8SERcAm`H z7z#)fa7e`o0QoYh*o7gm(5yTMIs&{-TLuD38EN7ZGcib}>%p4Dgv2CrS9Oe-z|FfT zOq!`CKq>bj*!S|+GRR8yg*$pe{_a?t&l~e52XGPawRMDpJ)v$jr;EgRp4f*t`^(hw zUmSVMOfQSlR-H>4j`v}d|9k<#&c0RjbW;WVt>_XpJy*r;c|{LJ&o`4Jdr!jqgS+7C9T6@wcQbVq&ldti zQ#2P}J52HeWBl+ojizbSqNLr*l%(LaKM6lL@hKQ`h_L5M4b{)3T#s}Ns%Bh`Hf&r4Dc;gscK2QzGo(c#~%b}m%I|AF^9OHh|R0@Nytf7~V3FKFGuY<#3<6pSr z5*#sXpg-NV7^W?*h0$M5)k~>Bnm}^Dor7iFm<5m6W3-Jru@U zB%RPoKjE{b0qyl)uzEdAJZmJ<2b|S_6Zh4-gp>;KSwAuZT3%cagmS!~(TcjH3Hs!T zDXEjvCL1JUy2+d|#WHo;9oEe0GiKWCv$AI2c~^GMoZP(pg1hGy7R@W3U$Vfluyj$G z>?~hgQCYR5x~A4uSHINV(AczW`HJS1o|aYjwBFmc+S}gY^RMab3anih40VS=krZ*g zJ|ZZ5G*+sBp=Sg7?v<3(zW)2>-S1QYHVO)SOH#5op@YrnV4!1*KUsmTPoc~<2Zskx zp?E|A{kPGnPT{w4Ha9#-&cSyw-F$RVI0xGox+O*b5C$tK`iFOvEGTl5)Qz=|)YVTt z2Rkdzfrz2l_Zch_jYHUy@c^I;1-coW`J;R=5RGa5K|ccrM1ldWaqlGK6BF$}Pq7cF zP~LXUT$n$+FTO9Sf;1>Tt!(6ubP*x>XqR+YSRgl#{ zX{nsQw!NSaYciK9VUc@T{V~LNQ*h6zT4Lmw9m+kjvp3!+HCSeaIF7vcAV3%jr;*9B(8N?zL|4N1}mnw>7_LPX3%cv&#Z- N?E|AcfE)i9{~HhQtA+po diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d112e179eb9bbb8372b524bcc40eea2e5cae3f76 GIT binary patch literal 3997 zcmc&%cUV*B8h^iYPLh+bA|z=7Q79t}QBcIK%?UC^K^zRV$dXMEP`tIa2%@NniUSk} zLnQ2NTSv^_2fT!~I8vS#hZwu`H>4z^DdPw!R9u|U7AQamQNg z+o!bh&k8UMzXP&%;&la9I-$yt<*{LLNr9n3NkKZ1^dd4eFhm)%JT^{UQ$G@Yk7NzF z>=q{dFCgDpX_H8JTI4Cbz)}~Rul3{r%VqsCh;mGW${q@OjxPn>f4I|CTd#US&?NC>EZ0bjo?1$OvkDDjg(;X^*G+C!mF z6ST1?$z*7*v4>N8UBNVV6byL1MCYqUL$*|mtdE%k@xTs9$%1dNC1jZQGDCkhs=GzRWY1GqA~5{h(_VT4^B>a3I^v*TqT*ysjk$2%duaU8_` 
zB!XkQi(to6DQtRv7@Tv%U`d53oLkI>13k5lCm+7k6FP!qXt<0stUe?Y0&K|#QN4wfh;cpNUB{Sz5h0fT0RVF2y=){=R&qE z3)<@I!FII^c>m6Z#!PFFWwN0nHwZdz>cPoNEV#7a9(j}&fO+B>c(%#~%q4vseKZGqJ+*O2N!%KUA{fIyORP%Gn=jg`S_2;FQD1 zM456imi((Ly7cVEutG~Wd|lTPZZ-N*=Wz@R&OQH}zvtC0q*eV3Ym8tUo9Fw0%(xVR zXLts1%I3b0j-9c`CO2mB6GmIJ8}ur%Yd0*g10p%=^cofH`tUyKyH&x85f_5?2kk_w zODH>Z#(UVpRn43mu2#s%F#~lw&u5*>7^lO(YXixXeAvtST#(=XohWc^9%S_z!=Wr& zqG6AVp53-BJgwHFSX96Oy5%u}H2>xHKx{kqD-g`NmzIt3W#S$Fu{KztJ*oIt^ z+_a61w%}JP&tYzRQ@|#@oGq$VlJSo};d=K^(i*NR23DFj=;8$4QQpn@#TQsh6qDhH z1|_F!>t#4Pp#wXx`Z?6{vfYfRgrGCBdP*)jzk*~QnH+SSM!Hh(maf+4`}!H9zUOw;g<&Cep6ICc3$&?a1G#aY3R}?{&A!Pp#DaNdcxQSNbnh!eU6H}y zc;1t^`n?TybNmT(Ns@}4PL*hFG@Q!fP+P9$6^=)-rhjdjum zhG=KfcKqXm-=NcpIqZ>*J|JImn_ZyYpmp%O-C$cfL91!16dZU9(EWi&SnTgwkUU*S z^zjZM>izWr_UglR+*voY@qv@^X!xx>WIR`a^)8kX9|>l$iv-Rn+~*V4vq`LBK?)9c z$%(7v=%cxU8?R2Xxi7A8WA2Esn@QQM{3+{M&sMj>S9e{}{=8hh^2WuuS68_Jd|P>r zPXz4Z7v9<`A)iZDyc*W%qar;=wm|>MemK)oOX%(&V7_(gNZ~=!R+z&1>uZu=u2xJi zA854fCQJnMgulFi&~z3&Q8A|}9D=(HFj6efPg#SL`c{5T`D@ZRd{ehvc|mHrmUwJL zPg6nKdOZ_GhL|c`yTQQ9N9EUCn7-*f``}Z#RMEQ4MiUe6^)wf)-(oCJB_)c38QaX< zvaS4E4sO_P>0RiyTT#4m=SaW0h~AdsO*wW;S~4U~hcb6NL|jz)w;tM@HzB_7)NZGe ztbJ2fKDyW2TC(N92PqiWaE5AYfm4RQb-)SLwj$T9rtW)Ylx7#t%pDtf^+f6R60dy4 z21Dnv9i?+sKBWO|Wjo7#Yl1uWIG69LT=;py{i|)|In{wpsazwMirm_eQ`y!F+bee0 zhhHdk&vU8V(-_@T7kRC{GVe3xt(FZ&u2p-#NW6Ekbm7UWeMgfY_I2dBR_{N)>gl8V z*G^U+`07)@c~ZBU{N^;$z-G~@nu6B#d^3-|ZncGNo5T)L*H6_Jo!nyLv{C9_cd%o- z)!ed09d*TLa_mD+?{%*~)R{LivHyBUeaZO)@-*IXj|Np&k=u5g#itueFO@LpoW{Lh zsD_|hHV&dIZ7!G5=;)J&K9|#cHRw(ie6yvOk2IQI`4heTO7$D)sf*u6mmDtfstrlw zS)bl`YtUtxGN};LkR-RE6oVTZ8OIQ&utXvzGBJr078AzM51MPFa75RQMLou{ z{Nb_&H5+^LnZ1)x$c0m1Qz!DMHn0#8h*&gv`k8r`wA|B7?;uaKwOYjFpbbNw-ae&` zhLEQw>%SvUdatQeOjulal7RDkRr#U1zV&b7K7qT_RL)jf;_t#|zF(KP?D&tLv)fdB%RE$O#ydVoh zI=%@LUk&`9#e#^GNF|>CV|M%xQPuy;^OgNmFG!Xo&b`$A^^K3U}@084p&dflErYKKlQ1~-jwVtwP_R+y(XO9W?U9ddVWm2@ipHBp% zkM*YW`&Se_sN*@QgV0IPqKT(oGP)uDi2=TjaiOl^R1o8@hNsJ+#eZ<`Gk-p__h1H1 z>FU%q`9#xailiW4N2R~tY-NC-x^x 
zHcB4EpX09XH{EPn23t>2Gnl1o4AX=70GCNo!DCm_JsHOs(M?Wf`cJ2*xfJ?Lq4uOR zr>mp&{*)qFb5H7I886y~Qg>40q%Kl(t}fj_)@3=3qpp+jr{mME%++WtL2MEql(1}h tSkkDF`1nyl2?@$%lNDhJiIK`UlQH8*j~Q(*@st9*{)1x#fH(ca{0|mk!UF&R literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet deleted file mode 100644 index 81083b2ee90f8208676313b318811207bed04c6d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2537 zcmc&$drVVT82_D1DeVTxqqk{Ybxfe5A_XT#*=D#!TfhoPraa=*LMu?{ZKcIBjgxF6 zb4t`07dxjD=i+OCY~tdgPThi!I3H6t+5VAX%ydqpnOQd7aOd2vJQV(0<_Vnpo$vYk zzH`1WEf5ORco`qXt1LXnBL-0D-W9nQ@kizf9()Y1PNw`QUe3#PQ;?(x$~1ix0A(CU zca9Ja`evfzW!iMeqqG`jnN_qcth9P6c}~r#*Aum$=%?xx!+FvsQz+m$&@GC7fXAT` z%46$sXc|RO(od-=+$Xuz_+PjjgWDU@2RA9g<|?mq+C}d|o7HQLsG>xbZDF~qrp_r! z2^=KqV?=+4(|^h7`s4Id!$cf1t%fjub(n?kCQ{Llh1@?%Oz$j#BmFJ3b_wV@vKYzR zC*az5;kdsx5jQ)>p?tiG?A+zTnguE(Zt8^Vr$oRIi+hnAnr>I2bpPMD+`9+ziVW;3 z*5aePQCPlNAS-LbIfpg_$J`zn7eEOTh-6hKxI$0NE`A6s}pwY>?q% zTLP40;}EOL#J7$ptSg;@>eX48-Q~di`sbm1FBVtwTCpS2htY=5$k|pMi9fOzs@2(u zKXMlC1Ct@%QR8srd~95-!-v=RJVVg&&9nG1Iu>6Ye*vc)_uIKHSur^->?LIRaf4N2+b- zCBWIJ#D?Thbnf4eUmxHguD0N#Lg>Yks!v9 z`^~?Tk+$o^n?&Fisg#DiPJ zlehkS<5;shJ9*@~Qyz zbUYVm%k07LT61f4>Em3D_iOr<56X~30aCLckKzZ;qVS4>_rvp}Qd6`+u zG}0Zg3CPh1!MjKtN&Uw$|5D(`lZBeo>a>>sc=SG{(fGf-RQ=O>N$QIcYEG|EKZsBP zTAe0bC@jn`$_P9xO|#6vR=Qvm1Yr*SXKR)q_yl8xU=oB1A*({-8L7f-Mr8>?4*Sw` z@Jmk;>0ut2BO!m;1cPBJC1&PjvvZb@?K9a%T?>MRZN?IIl;lDxJ6D%Y%1bLNnpbBt zO{*>`F05p7X930cx`S?#zbwfK1wmK9pF}d*AbU?~QMza|R|r;SFNLS+&{YyV7tWo_ z&be$0q%J`Z;HjoOqt{xL?kXwHbCniLsnd9K0{P{>Iv5Wv7F`4Iuu>Y`HuF3$69_pL zrY|Y^g7N3%NcE=0rYl%_L72fva}BG3yVNwzQ8uZb)?_kMqDA(x`csI3OQAfYWJ!_J z?5Mm%F!}=bB!OjCR7a3188DKP4DhAYOPr=U8b`{L*;D`2lwD0g6Ph@;wRTCJ-8-?| n?Vf0@t#$e0mfCAQRW30uZAwa7N~+eP1N#1OSOyf(A9?=(tu<#2 diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f3f7d2a7d0cc4ecb1608dee4e76b94cde19d2b83 GIT binary patch literal 4018 zcmc&%d011&7N41WZ*r4>M9obTKtL%0abXkOP)XPk6~PLq6vHAQTS{28?obpJ72E(p zR*_9m5LyM5T?My-&jq#0vseLN>-wI1efK8C>a+dhec$_D&zH>1nK|eD&N(yRmtQ=j zo)#2N*-;Me6iI;*02r)Z78|!}x1Rp2qp7})1!Ygd(-hlXZ{B>jHe|0&-nH;)HWZ)) zlxRG~rT7$!V(Hod6LQE#L=IU20JxtcW`qdq1Cnm2OW|T`z~YNVoB(-LkbkIL8A>4` zSJ(oDolp`gA^$9tcs!jWpb(%3K0N_I>mHPyJDmfQvk8s?4g{JUHV(8jNn1W`!N_p+ z#JCA5?cXyNkwYgU68p_YNLTUcE`7)|QB{F3EX4^(dhz#Vm}mw?2gXGPN5%RF$z$c3 zVe}#_$UiVTA}%sYRoELSd;}%uG07vO`(I%GVYy90s#EAreNU(=&gVVLBb;{31u<0% zP;F>6o?7-RcwRLLo|=%!Bgf1J2S*>omulyM&;5*WkD_O=-t`=~J|!J5_wVMJnq9=U zxsC-ESMPyH{2V&D=NNoxd>&aw53*NuT?FkLZo$ri2hj2}1uW~M0r*h zsA_rz#{hRUt<4;kxoX1sEhoT9NepaV-;FlZV_^G=#i-lM5&UfP6p0QwV|mRQz~-no ztUb^U?xgGi?^0v1(PrzA?UboR*iIeR;OQD@5!Q(3CbL*xYiDsY6*}Aqz1u`i%R?ZT z0ExU;Zb)Mi!s4g~;B5E;bXfLr`v)cy0-y<|tZoD^%3Xn`uOo=in}8mESr0aMHey%2 zW`dOFW$0-G0WZ%v18i-L@H+jcpjHr%Dw7?6p2KW#;?gKgu>CbON;(R(9AAUK`&R+X z)*Oqb>;M^;f*)VHoAoMP34Ur%1p7``6WK+JQEsgTY%s3JtV-TuOU~cNW;Tvy8z#r2 zYO`zXwSG246Z;n;!rT`-3zT45Rv=h6;6jX8zY>f;c7iqh8iY^(ML#xqF)*Js1Jhoq z0enYhao`E&{KAS#!R&A!SQ38==}E@0{l#B$-&_=9QmfT?d9)u&c$J1o)@o_&d)Etxn`A)?K{|`;{eX$OWZ6M3NM#Fo} zd}479j}tR-EUQ+l65YIQgz|)Tgo1YjrMI>5z0+hwgs2c0jbFizC*nc%yS-@fs%G{^ zErO+EodOf8SHg=alQjAFO+ms`Pu5p=ouE(d9_ar~HrU>y50pwFSKnhaX_mf&%WG{w z%NIR{y>64q&wscNL%HL*=WA@(3)436BYDTL+I!>Ri&K%f_|h%lF(Vl)7|6iR{a6pX z}cc7E$ zU6|>q7Czs~fwY`|1MBQDV=1xogq@oae&`a8nLpjF-BU1wvstkbwqx~d(|NzZH4kop z{kxs8QTo3ER@6iGhB2o>Wq1ravST`|-^JyI3oRK_Uj#qs{XYPFGDZ z3aoL54P)QJtu33$EelFfd}}!C4xx<#xQ5t;v<53jo%a-0>SfnW8&3ZSb-< z5p^btd0Vt?32uH0+hC{Pu$I1MkSC~ziJD>Ha=~3}FH2uDS)dK~#O}gYANT=wDl=H) z>O6tn(jQp`8nwIw_p-qFvdO$ATU}tqT?hvrKS7bNcpzb>rf~Ic3idpIfZlz%ku&GE 
z2KM-j0*>mD>X*R+Dh`_3>oJ5#Ho&JUZ{RUrW0tz7G80#@-`4~yv?WDse=s%6Dzv-QxAow*?8%C|XS3`}w(p<5;z@r`YsrqhS?du; z+o3eAz&>Tf*hOun=|#?IV_f$+lH;)*R^AW#U!7q@$+zT!wjIXO8R9 zgBP-=C~aZ%&aTTjUiuNbaorGlwCR;{8Xc~E zLrwXE77HpnlZV2Of@)>VNncw3RdC~Vaix`a6~XeLuk&?OA7l&3;XpY^`j&gRn+Xm z=k|xh!j7_i&;7_?1PF)+K!R-t9y0HfmVWibJm-&ipNO$miHIDsVOZ3syR^}7To9?a z;NaLlBT-uKDOE&pR7k9v9}Ys{Ae3A;k*qhNY1F6T{{-}psuibN0})FZ7#bWYr*N7^ z8lex^sFAMwp^?EXKd%6}1M{l7e4$zS!7gsH|tu7nh(8VLXp>xwj_o}Tl(r>P&7 zC?7Xt#9AtoN~QDZFC(+1(gdk2MCv4!hDhy0RC_YoU=Fjgmr7lknbwD9T2n2ROavy7 zbl%tF%*>|H3&*)GjBcO6jE>Aehn7m+nL*~usH$_RIirUMSkAQw@Lm`f*gEt#{A{FRHmt)qgRL!@%XUjLyPZ_?)k+RjPAw^ z>e5xItMdt`_hhkhZ|i7ZpSjVCd{m{=c=Ofuxh?oOAG%qze2@>*N|_?ad0{MbAa!+T z>{UhyL-_MuRsE)$P0LX0rBVlG=^VlIK(WYa>hb`~6?9J~F-CNg6PW(fDXK4p-jk_3 z>CEZs=yPAGEJ1xws$>~2+D5ACq}oYUqXRB5JH;yq1KJA@nY9ToLdAizzPb%7V-8whusS6f0+_ zAs>y-U^+shnVOO&%^;wlhVOjPls3~OH9dT!HiJ`&ll6V~wmek+HF*ZUd%p8N=R4=j z=JNSEj^zTlAOnXv2n3L8mU_IeZw#3*g99#*Q$`Sa04L?7ns^A8`*^AW3;_H>F*z|p z7_?1D!?Ef(kOrt#ekCTmx!7uQTRE)6%JoRufV2r(`AC-35hN7gFpwjFd;lH|jUhA& zkA|j_`#6J`l7#y>l_>ukE~eo=WCp?wSC}28-b#zzQ*1VQOp0=Xl$(o79X4;JT}Yr6 zDZ7yNHr5`&nn6E<*a#k*icuz}rE+%tM=^m*pJtQSD{&CgKDp2deZ zwW9k|&f!?c4Oz{l^U%HeJ9Os5n<(}Gbei4Iq zXDLwLu?{$_b)n<6J$Q8+18Zv*;-0x_a4WtKtM+CvjYs_<{$K%W+0hMG>o&pP^)6;& z^eUW`FkMpq{y6E46Q3bnc_+JVjZ`{!<&04c&T*q`A>T{39=iiUkx0_mpN;*eVaCg~ zL&oHLa60y*EB*iceZYIdRb?l)f z8>Rgl-0(~HoAAMj1ClM<7vgO#I@CU;jd|(gznE9g{mLYFPLxevQ-u#iUzV+06ffzL z-Imy5jLa!;L*nLASam&95?H$&B3e77V=p7r_XlY&(?<|9;}s@&xj${nYlypzBIv6Q zjEwL^DwYI{kA4W&VhYwPT^3Y+u5;)SZ zju{wTo5fz{5%-NauBXOo8^)4xecbv8KA)7Bx;_q6Qg^A&8 zgP7)d13mOcdKLJ5481QaiOtcK6fE$X)2A;p7Uo+izcQD^x4eQJg1r>M2nm9KXiq>X zZFEz=f88vc-tXauN0lK?s72p#_5j#fpN80*V`F zuvS0?>#B+y(dVdEtJYezxZ&x$H;C3}d(M03yz}~<+2Q3U1ysi7;p<8Z=bKfX zyK(Fd4YYvnZbR$ShBSxfn7e|6>e30Rt|$PYE5VpiAK|!A<1?15=yy)6`)9QasVLLY1d44yAf8RZ8&7>=utbr&xuKjM+-MJuB+ z6fr5PsnK$kTrYuHBt%b*QYL38QnU!yV1issoJ)w8$ z1fJ<*u;_6Dgzx<)TzaX~4o*KfeRZ^6b#>WyxL!oh(1Dsgn4Sf{0@NCRG 
zbasawEHf9Pe)U#BJhBH$=@0I?1YFo*1-!l%U}EYH=MxQKNw_N{E%1cVt%)!xs~7M} zOyEjj4XoGCgr4?$(3u)@WZAG4Oc(foWy2Xrt#yW!-vm&vKM9t{n#015eK4vt1|qii zf#%5^Sl7B6-O)jyIjsxVj7*@Zu{RLqBf&w_2Km005V)ffL|Yx90+T?wnFi*(-vRev zIJ9`v*qyJ3LD3jf5bgAa{9k@Ti5W()i?9MkJ`a}JkZ^SGUa*wx^WRDxCdDQKVN1(px&;Zo!>xb-dtvQ!~Zn!g_2Pd^4Ly*I-; zzY%a}6^7b#?!y3e59m|l1W%vMg^gGGf>nw!dS3Aob&tM>R1yTKDVE^1BoM}R6e3Oh zZer1vckpV5GdgHns!R3phmE~A5)s|kfz@PdLbm=nY8_lm-1k3%`r9|-i+%fnW6M2I z5T~$)lKn_!%tO(K+{mo+&2TjDI{KmV5pwF5img>f!^S@u^??9v z==Tn2i?1FU+f)w?;xu$5_X3u80EcV#4(PFK8z$WDjjwCugJ#1~xTP+E_lwf--u5|| z+u$K&!ZI_?FUR&E$Am+~+I$X&luX9@j@Uy+9SodLgWP>xc23F+78=18>XM zaMJ0n&Vz@8NK?>*!Ltv+>#e?^7d8r#d)i`8rW}C69}eM{$Bh7WV=UHIN1}_PkHfM( z!Kk)Z8)!_YV>S6Bq31|{s6X2Sx1P5b3`VuX?WsEecXPm%v^!84YT~G1Q> zxzO)YcXIXmiP&0=BdY6r0N--*J(_g-K0e}5Z?0wjbZlFJ1KG4ad9B;6qW?MC`k?37DxZBJ2FU(Ta36 z-fi+#Jj>t~QY75Kru|lcJ&MgCYDdcub>9t3HTVv7Qz$WzS3~8uu|x;mOn$p}G%4Sm zY*@9e#?(J46m6P*9qTFX!(E#oK^@H^Tsm+zu~j)4%Y5aHF1Ft^dNTJoui$(HZzp=J zx7_R~*Du?Ezv}&u$VgI#98HTkIw6lJ#Z8JgyQPlWI<7aWKjnZ;T%*fNv+Kvv7}Q|@ zxM_{e8EI;is=EtYG36l@vebi27FL0^O%``LIURPtUx7`W)yOR{AUT>BZ-~s@Gf=bI zSa`uMa8+(b?-D8=enA`!12Eema28%U?n0!R7ej{(7kK zLMV2@Aj-smSA|vHbHa39oPn4(&FJ-=Bm%3RCAvA{46#&ZN$RdN$0IwUF;oyl#XaZ` zkM1|>95Fr)hh%HGYI$VO&BOY_z`Ku0lYzyklIw(-nJvM&1y8_dO)m7$-^K}0O`=lU zvU##!hUk9ZCr6*0#|M3!z)x;#;l2m~GQwjh{HjrM&o8|Ub%Rb~4aMj2T_Y?EtQW5& zoW@?mtNQxj%^mv;Tp^(cX`K05JEl*s(zT&0 z_@BO&3APBFIa3gIPF1e?Q|#g;zHBwMZU)UUW zWaHx1gHrFDDD&J@R6Z>0_XlCS=8x(1RSuBG&T3h2&-sSdK^pZ~OYbEXqn0@rE-Um~ zVYlV*a^H*Mz;d@a80~jS5>y>f?or}*xqooYq=vb)e~V3cZSsw#68|f9Q)DyWyd`Z) zcgeqBAR1O+Q?@7O^r|7VIJT=a3758p)fL#T-lzD_v#;*Vu9t0-fon70WHUEq19fX_ zm{B%AceUpx)FEqn$beWfxD!N?7DVy59v!fn?w~fNl^b;sKQ{+v13y3Tik)2tNUA?o zChB-vFuHu7;!$gD`YZh@ZZp~*T;7{$Fm=S7_SY}kUKvEV=ixkobC#jpBVSkKs(Ni0 zIkM2CU$7?2D9UTG*^ueaUmHdH6nk6xu5g}V9OG9qAZ(xNjd5&1*}#Mr%?y*c!17@; zUOs8#q~IDheUBXBtzahYv=>G>!e3`H z8F%WU9P_71Ov1*3TB97ov>oM$vL(m&seA>or}xC3ZL!M+P>|V@(X+-Yv~ef54`sTu zFR5MhPn2zSJ?+ISL4^DMX*|7bt2Czk!LJ5PF3jkW4H{9l`R^%AxySEhFhPjsdR~W0 
zA!}f296d{=462@8=K8q(V~v-UEr-NQ2bj}XC!6&7oPF^W2`MbIj7cYw6Vg@OnB*9i znK3l1f-l(BS5Ran-qA}um&;PkKYl{~lxxXouK-#{OSB|Z9Un+Uv|=Qe0y*MQZc!g7 zp(Tb4jvSHcfm#2);CwA#mWh6dl_gC!F;3L;ej!B7Abu#DMJ^!Zq({ZaDC9K3@N8@B4}9I)JYc(tJHxoa1rvVWc$(8`{5kz) z?LJ)zjnSL{H`bdFYOjZ-^ca(&>zb^)Gx06h_}aw(4i*IZB6Cr||N5VazlqHMU!LN> z?NJhJw-xA%DZ$fzrT`*yAzvC4G$v$d=aVI@AnaLjk$Om_(y`2&6@RHTQ|b{X^^!{C zq@HowGY`gKG`sSYN`2Xx*>}y%PP5QjcvbxXw z)5AktQlh=%q;l3@o1THgNLbfCXuefx)Wf#YBQ@GPSj8%&z5%Sg)+n4`II#56gR?LSju=cO=b9$HVPat0lv50iRicHWb=Sk{ZNk!m~H>7)hO zd9H;Xrtr#O@@R3g{!DzvmA#ryOAuP5%G0K0#HeheQd4c^X=%z#i|H|G=?Tgd3rANw VM>_{mfH}a&7c>w6GUgYKe*tj26RQ9K literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet deleted file mode 100644 index 7b231f3e38c05597bb0f2244bd064df42bdc2274..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3314 zcmc&%c~nzZ8vpKlFCkGjAul8t6|3SipRS9G|(bi;!O)^d#uZs z1%aoaTP6{7YXd+pcLI+BM3Bot-i~B6B@G0=y;LGhSL-yBbJV6Bnh*=b^+?==nA>UeQr!9TeRz;sDxngUq9&aO~7fge^zJqCuI0igR; z3@x@3VR@De7C-wAqE=A#F9eN8d&9*(j?lh$AW-W@fTHmcl*YJ0 zeBDNnZuNr=geR=)+X%B?)WHoH4(F9LadrL&uw=9dq~9u`^zJp3SI`HxQ|_QG6+(qK z4>}qezxHu1b5RBcseT@+;02A+4Ksy z{9Fe`<|J5ES_Lnr?}wGjZ=fxYqi^gNJir^Myg+u5&<-H~fyA zH8+si6M;eJ2GL97VeGSos4-^;RbKrZ{!tfz_V}#oEpUy4%>y@6Q=BWoJ-Eyff*$Q}3ZX^f#789Yt!+5zBeff9y*CW5&Hfnt-pU?A5BL;=l^D+*k^47Nh z2C*#?sC*JjSO*Yf0o@Erw?E*J|MgzC@4U|wfei$I+y;MajRBjKD9G>cL;NM2~ATnA*R zg4EODASfuKc28N(f3nO3H#_En*I8%YmsJyp^^Jb0c~BEsedq<6c=Q$-);7?}t#mrE z&FK*NNm>xkd)wz^z9NM@2qp+wodLynM)Dj=W`Osu7Ji=#2tE3pgjt@i zONu5CyN%-rkq_cE$0^YlrZUng?MJf6?lRKm{!C2!^;6NA3iwIBT^OFdL@!-LI%#>JBf@y$XPm0I4 z5(sx4v@2#;>Y2eZ6_hH`QCW)zsE~yM0G}L*vfx{=RE>Q*%o~>z=mn_qOl*p=1An zgNF_uIeP5)iIb;JpE-N({DsaRFaBhSH`4%uOY#*`1N_W0z%NY(5MAaJkU`dE$>$2{ zf~&kP_iJ1-*Ksm7U2usuSl_@Y@C=HZw_Z!Yrb~323*M(obcc)U-vxrZ7k`qU>?@-Q 
z3k;3y`G^a(xjGh-GEG!|uE}hbnV-o%@c1<^u@cXB60fD@yJ&eMhj*_Z-dmq+7usFL zVipD``_N`<_J{#GS>(cmFN6aveg&WsLAQoP(5=Hl(wn>3Sm?rmDuGB_`Q85SLL|GF z_>`Zi%QjnZ_8=0UMDmOM;X)Ktj^9%QWAj+!Hccy;uFG37W48WvwX*cNQ%26F5TmnRqy3|1PY5F7SUA3u1v( zCRP29e{%dq`2T;ov47c365sAC7RUwSa%cSfNo5jiCNXhzQi$bZNj6ZvECn(V48x4U zUzXw+rjUuqW}+D;n~BWk_9C#tD0UUeFfr_m`R*BW+|tB)U>z}|XAbf84aSS8_>n9h zS;&r2?0{V}jEWs1QdpLgGYXcUn(iO(m!34fKodP|YD#iq4lCCt;QVIf;eqoP$vMGE zV6yn*maMFYy(u**P^VF5GiuhJ3y;gel+w*7jvL4FvFuQj>A z^{L5R={VjPOMS5)^~?u13)5@)u&s list[tuple]: config = { "cluster_num_bands": 14, "cluster_num_segments": 2, - "cluster_jaccard_similarity_threshold": 0.0, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, } launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration()) fixtures = [ diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py index fca5485b4..8c4debed9 100644 --- a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py @@ -35,8 +35,8 @@ def get_test_transform_fixtures(self) -> list[tuple]: os.path.join( os.path.dirname(__file__), "..", - "output", - "docs_to_remove_consolidated", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet", ) ) @@ -45,5 +45,5 @@ def get_test_transform_fixtures(self) -> list[tuple]: duplicate_list_location_cli_param: duplicate_location, } launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration()) - fixtures = [(launcher, config, basedir + "/input/data_1", basedir + "/expected/data_cleaning/cleaned")] + fixtures = [(launcher, config, basedir + "/input", basedir + "/expected/data_cleaning/cleaned")] return fixtures diff --git a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py 
b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py index 07710b74d..9ad8a32d7 100644 --- a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -28,56 +28,13 @@ class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest): The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. """ - # # create parameters - # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) - # output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) - # local_conf = {"input_folder": input_folder, "output_folder": output_folder} - # code_location = {"github": "github", "commit_hash": "12345", "path": "path"} - # params = { - # # Data access. 
Only required parameters are specified - # "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # # execution info - # "runtime_pipeline_id": "pipeline_id", - # "runtime_job_id": "job_id", - # "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # "minhash_num_permutations": 112, - # "minhash_num_bands": 14, - # "minhash_num_segments": 2, - # } - print("====") - def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) config = { "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, - # # When running in ray, our Runtime's get_transform_config() method will load the domains using - # # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration. - # # columns used - # "fdedup_doc_column": "contents", - # "fdedup_id_column": "int_id_column", - # "fdedup_cluster_column": "cluster", - # # infrastructure - # "fdedup_bucket_cpu": 0.5, - # "fdedup_doc_cpu": 0.5, - # "fdedup_mhash_cpu": 0.5, - # "fdedup_num_doc_actors": 1, - # "fdedup_num_bucket_actors": 1, - # "fdedup_num_minhash_actors": 1, - # "fdedup_num_preprocessors": 1, - # # fuzzy parameters - # "fdedup_num_permutations": 64, - # "fdedup_threshold": 0.8, - # "fdedup_shingles_size": 5, - # "fdedup_delimiters": " ", - # # Random delay between reads - # "fdedup_random_delay_limit": 5, - # # snapshotting - # "fdedup_snapshot_delay": 1, - # "fdedup_use_doc_snapshot": False, - # "fdedup_use_bucket_snapshot": False, } launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration()) - fixtures = [(launcher, config, basedir + "/input/data_1/", basedir + "/expected/signature_calc/")] + fixtures = [(launcher, config, basedir + "/input/", basedir + "/expected/signature_calc/")] return fixtures From d07a23a47d3faf0e5bce744cd375b1c5ec1d5966 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:24:41 -0400 
Subject: [PATCH 40/91] Update versions in pyproject.toml Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/pyproject.toml | 4 ++-- transforms/universal/fdedup/spark/pyproject.toml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index f2b9d8268..fa815441c 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.3.0.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev0", + "data-prep-toolkit==0.2.2.dev1", "pyarrow==16.1.0", "pyyaml>=6.0.2", "boto3>=1.34.69", diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index dcf1f48e2..548f350c0 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.3.0.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk_fdedup_transform_python==0.3.0.dev0", - "data-prep-toolkit-spark==0.2.2.dev0", + "dpk_fdedup_transform_python==0.2.2.dev1", + "data-prep-toolkit-spark==0.2.2.dev1", ] [project.optional-dependencies] From ec2168c2d8f9b1bf9575689b05b08650bf91510d Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:27:39 -0400 Subject: [PATCH 41/91] Updated ray test data Signed-off-by: Constantin M Adam --- 
.../docs_to_remove/band_0_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_0_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_10_segment_0.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_10_segment_1.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_1.parquet | Bin 0 -> 1532 bytes .../docs_to_remove/band_13_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_13_segment_1.parquet | Bin 0 -> 1526 bytes .../docs_to_remove/band_1_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_1_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_3_segment_0.parquet | Bin 0 -> 1510 bytes .../docs_to_remove/band_3_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_7_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_7_segment_1.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_8_segment_0.parquet | Bin 0 -> 1530 bytes .../docs_to_remove/band_8_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/metadata.json | 58 ++++++++++++ .../data_cleaning/annotated/df1.parquet | Bin 0 -> 6923 bytes 
.../data_cleaning/annotated/metadata.json | 56 ++++++++++++ .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes .../data_cleaning/cleaned/metadata.json | 59 ++++++++++++ .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../expected/get_list_transform/metadata.json | 48 ++++++++++ .../ray/test-data/expected/metadata.json | 84 +++++------------- .../ray/test-data/expected/sample1.parquet | Bin 36941 -> 0 bytes .../bands/band=0/segment=0/df1.parquet | Bin 0 -> 3984 bytes .../bands/band=0/segment=1/df1.parquet | Bin 0 -> 4763 bytes .../bands/band=1/segment=0/df1.parquet | Bin 0 -> 3695 bytes .../bands/band=1/segment=1/df1.parquet | Bin 0 -> 3684 bytes .../bands/band=10/segment=0/df1.parquet | Bin 0 -> 3305 bytes .../bands/band=10/segment=1/df1.parquet | Bin 0 -> 4466 bytes .../bands/band=11/segment=0/df1.parquet | Bin 0 -> 4906 bytes .../bands/band=11/segment=1/df1.parquet | Bin 0 -> 3317 bytes .../bands/band=12/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=12/segment=1/df1.parquet | Bin 0 -> 5020 bytes .../bands/band=13/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=13/segment=1/df1.parquet | Bin 0 -> 5244 bytes .../bands/band=2/segment=0/df1.parquet | Bin 0 -> 4782 bytes .../bands/band=2/segment=1/df1.parquet | Bin 0 -> 3988 bytes .../bands/band=3/segment=0/df1.parquet | Bin 0 -> 4323 bytes .../bands/band=3/segment=1/df1.parquet | Bin 0 -> 4341 bytes .../bands/band=4/segment=0/df1.parquet | Bin 0 -> 4035 bytes .../bands/band=4/segment=1/df1.parquet | Bin 0 -> 4860 bytes .../bands/band=5/segment=0/df1.parquet | Bin 0 -> 3554 bytes .../bands/band=5/segment=1/df1.parquet | Bin 0 -> 4872 bytes .../bands/band=6/segment=0/df1.parquet | Bin 0 -> 3553 bytes .../bands/band=6/segment=1/df1.parquet | Bin 0 -> 4311 bytes .../bands/band=7/segment=0/df1.parquet | Bin 0 -> 3765 bytes 
.../bands/band=7/segment=1/df1.parquet | Bin 0 -> 4158 bytes .../bands/band=8/segment=0/df1.parquet | Bin 0 -> 3781 bytes .../bands/band=8/segment=1/df1.parquet | Bin 0 -> 3997 bytes .../bands/band=9/segment=0/df1.parquet | Bin 0 -> 4018 bytes .../bands/band=9/segment=1/df1.parquet | Bin 0 -> 4326 bytes .../expected/signature_calc/metadata.json | 48 ++++++++++ .../snapshot/buckets/buckets_collector_0 | Bin 263 -> 0 bytes .../expected/snapshot/docs/doc_collector_0 | Bin 31 -> 0 bytes .../snapshot/minhash/minhash_collector_0 | Bin 2840 -> 0 bytes 71 files changed, 292 insertions(+), 61 deletions(-) create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet create mode 100644 
transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet create mode 100644 
transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json create mode 100644 transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/sample1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet create mode 100644 
transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet create mode 100644 
transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79fe53b6203893a919b73fc9a93777fa66f579a6 GIT binary patch literal 1513 zcmcgsL2nXK5T0EY+$N2|#<$r`I1r^1mfJaXzCeFOwl`6}@OD8b%X5Q>K-?#5&tGsS8g=N`p1p{mVY<}AL zl}*0`NI+mp7#Xk)cK1HErCExlGP)Y==xWrf!DdVq0R-w=89>HFk)(ud{*W?EG4>#p zHuSaLi9eaPx}L+a~1#=*`?K>fS5n=LG6oMqM#0R$rRksW5qQa4H9oR_cm&bTwMh ztZ2Qc#^_SO{Gu*&vA=b-PTf&|IuPAEYJvDI#?y${L 
zc%Jv>w`yj#W5s9$T~Su+i(-jk9LHF8T*m!4Er;4%&H?*UEygAn}Jf z=Lf{suFqcQk9u9Z&~t{3X(!)2MSc%MKAa1t_)xJ%dTh=een8+z?vNr`i2o!=geO;` zsdjVhjE-`zc5%*}s(pGMsW^leTEG|d?6~c}J#(j9&Un1l_I+=jow@$x#2aO;2iw+m S&e+cY1V1>$PjeMN^1lIVM)pPk literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9df2f3bd54e13d5078be076585302c2d0f4e93af GIT binary patch literal 1497 zcmcgs&5qJg6h18sb|Nuk;%(Z51u@#pVt%M(jF{ORn3)D0N5IHvHv~$7F$2ZIK;{K} z0vA4laj)@Nj0+bodtj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(A|*aNsX9Z9<<${SOFtdE(DeZY6^=Bh4kp3 z@E3S6-c9-oj6cGI2T#Ty;Kjt5x2#m42QN%u=FRusoB2NGg%YpWOkoLjr^sXmn*dv% zc7LZd4xm724A>^SdtaPVnVK}TXlJ(4VQUGB_eHh!0)UK(B1sBZ-=!o|%zaQ3rm@~W z3x=b5%Xis^uIN{i{#CM$E$x4rm_!Yj4CoRV05DIny&}Pppu}5%c&G=|X&!&y!K9Ws%zO&4iPF=5B-UWib&PVCM{MVlq86>j`Kpc8ToVkIRUE3s(@ z;&_BI1rU7Dco^$sb8>9Kjw%N$)+<<#qXDs%Y%n%K0{xpj=0*PIRk^UctwEd9doL(o z1^x4~{(2edO&6cjauz);w-2h+x7v26MkE~S9dY8mc>X^p>;-4T14Yte+TjAOpT4j= zu<)rU-@}Te%gU#v4FD~sMrg74E>di-#A~ tc(3NZy0iy7R_BWU5^g~{&Bn|ZS{zu%je&GVwc!Ysk|b4+5e2C(^V z_h(#v3s4O~7;bOnswf(OdF)mp0@r=W zbJ?x>3920NSH$jW6gOwF`w*G1yNSF8m7!DV+Z|_QPf>gm)<1@j(7(wOUhHpPl{35B8n#_}?*-+n zpnqP{UoRoOtHtNEoJCK|?SU!{thU{$F$sryTb#Ntp8wAYd%@Z8K#{bVcCdi!r!VXd zEPN`;_pl=Avhpdi4xq)<6<#bpKH2w%q+W`Ke?DYsN6OQq3C^R^pwNK+$UN@UAs>ZN6*5_KbWQ9dz7jZUdq zH%sT`WYNuU=Usf5iEvOZ0>6-T@wy=XKC!XEGK4Xfu=zD}>Q9WNgyr9N= swf?Jfd$?`&dfT;r-xT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB 
zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5601f5cb07d71179df35855265cde6b0634c130f GIT binary patch literal 1532 zcmcgsy>HV{5I@_A>!N_ts?V~e3>m=c(2t~15velpl9nhCN)jN24CKT4Dr!DR(nKBE z85kH@85!9a8TcC*PzNT&j2HlM_iVR?qytM&$vfZ2yZim#Jv+w>QRZV2wv}ZngGGSl zck92x#v6c&0zL?UBQjKurHaIWEir3_X)K88YRJ&M1VdBYR2`OQcrhp{U&;f>XlRnG zyaMaD6k$_-kV|YIe_GXrh_2CQ(0tJ&(6<_7@C($J*U06 zSrDAhc=MAK)%vNO@?(GVD<3qvtx;Rh{3z+)B^>PkrP8UC?F$`m&Am-^pxT}LmF*QqeY?AL-OCJMU+Zqs5BZ$ z-WJXtt<=bsBI!~({?v}fBeiJ!sI(Uy*E_{#EnPe;#R?+7ofG&l6XBp7d3HV{@Cpzo z&^g~Gyy?RDacs9)Psf{fr#Py`N(YGVhKq-DffP3-^Ta33xy`q!ID*?DP2$$S?HFRE zB}HA+rM}(WjXc`GJ`?h`Xg@-+sh%qiUNFM_>fqU7W3+1b`>WN#pf?GR8-w9~uN#hD VUyH6q%tQ#l`N0l;sdM<<{{tHM{bm3F literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..02bedff1c8ca8f026eb6a3d7664517b0f74cbb2d GIT binary patch literal 1526 zcmcgsy>HV{5I-lW>q3Oms?V~e3>k{mp&xOhB2smENlO$6B?%%eS@PjHNm28GlP2nj z#2>&v!Op_MU%=|B zyB@0mR4=8fgnL$*F-dOOOVE^{_eJEUqqr!@-6{Cw0>L#fJ}g|byNA36m0_>q*qz?U z78Jip>Tk(-7cxEtk>qsQTbj)zIcs{vpG0yE9mEG8y`Rc8KlU%b z%9-754cilw{JLnoT|}DG#cydji>9921x+4UZM#!rG7k5CVB+4m{*9A%gR|j*D(g{w ze-77hes{r`_f*vHAyqaM^~=I4fF9MtyjXm8yyp%{y9{iTjCqQ4Uci^+3FjDS@kgAu zc)BHGDNJ}uUun*>WYPC6dbE^&LGkCLM5%O^N~4qFZ4vB|rbeEW$d)t7SC*NK*UjWb z`PdvaI;CbkQ#vmviXy+A7x*v};h7)1#llY#9vt+|M-6aMPB*y literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bf131f43cbf10180944b4906799b7d6288c54724 GIT binary patch literal 1523 zcmcgs%Wl(95FI;->!J$Ms@JlmEFx5_F8YXxib&PXO5R0x6Yjz-J{2+lO3pO3uGk4~WXU@5EHHxnkQ{^B*l|z220-JN9xan5DQ2-FpkpwZc 
z0s1c?#=_bjNKs8)Zk{=VVWrWt*oq=4Uj*fcpuf=N|LLM_Asb*Kpo*daSj27>A`tZ< zFJ!msC#Z77UlF_6C~nSU_aQQ2cN2LHD#KpIu{*tyJw@?NSpOJ8lCxzOG@DB@%5LYF zI2obL0R$g39>yBmm>yfOqsqaGdIjrAG$6K|4aO!&pnsDmyx8BoD(7~$HEg@|-V4fC zLI1p@zg|Lmv&9#*oJUW~?SU!{thU{$F$sryN1VDZp8wAYd%^keK#{bVc5n^XPrtT1 zaP3o3zK0b_mz7VEbpS1<)aaC& zb+dF)P8Qw#cHYH@nFt5vGVlvo7q1KA4|L8C39nfkJx?At8)m9$bxOl}vV4yC?)!K+ zcah>p=>qXFIk)&B6-RJKWJ&z`4?V}#q%zdiEcdPMapLqY_8C*OMc*S7i|YB};RQ9` suQ{(S?BTZ6?{C)}r#Fs|Y-e!R>&A_HJH}2zJBR^z4;m>l00$Gw|Fn&mB5!d%2k!`pB#MZqFOLbF6;GjE^zmb+hY|S^vySTFNV`XiDVqtjM2b7xVh8%y@~vdDSo6 zerMF3qxb%n@s;n!&E`u+E~2NO?1PpVI$gKd5D7ZRd-B{B3FB*DI7}{v2db>6^@A1M zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0mccu3lJd5>j+1w^dS%kl(~5*<87>|n7I zqt%AK}bR=(}@YNKY(z94)L zr+5$pq_n9}VLle3gNNLlz#Xw=nf8Ai27zW(6)1_&QGiUE}Ar~Vt0tMqdK literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2838dd9727770220dd6b3f3ae9f0d4bdaefd8ec2 GIT binary patch literal 1510 zcmcgsKX21e5I;MKaZ!cRs?V~e3=t|;hyIC)ib&PvC2g%hC`k}0WXT`LNs5{{IBBA8 zd=wTI1_mS~J_8doV(7p}Kx|0dd$yYdhk+$0`|jPndw0K|?uhZFiRp*LY0G^=}LjE1yMZDtF7b#M06xUkU;+-BuG@-2Pv+p zEA4Z4II6WAi>xY=@>x*63;KyJ|4$Qb8wmgr0aY9g0QxC%P(ZXdgw%Hc^)-k<%p-+( z>eoVDz%v^}pCz~EC#Z77UompwC~k6cw@U#zF|rBz+t`%d9pp8vjhvcm_nfiKDSi`_ zUxNN7s=tpQ$#B_!G+Rh=(bNLa;i&~X^Tfi@fY|)hzRFWxWS&>;!tQrQ-I)n~yrutS z7bRSr(F;dm5k0N252`e@x^}NYBpm8Japt~w{$F3%3oeESilim9!zEn5<=p{&*;7%z zL={Pwl@GBE04<@e(sKFf*?~7C^;+CxGND;YX$fB@k154Khn`T{p}7u^WijCiTjeOt zGmCy;(PLY==Pdq=DPAg4=OtVJ5RG6)rnOByPNi~hkRrSmmw|bR!)2v)p z(`BCDEpmLAiEvP^0>6~!csYn0=#(BYyk>d)GJV=^nVGiLtBjiI>ILF^=;NW3BgI46 z660f1YSANB9Kjtk%fzq$$aA=6D#yB-)q&MNO})5}eI_j0VdoKw#p?Ou;RQ80Xt=L0 o?a_`k80<7$*O?^8wmUp``bp!#uCbfa4if<01IPGNE8;)=AFJv2-2eap literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7cb2cbac4ca976304da8c64e8db39c678260db2c GIT binary patch literal 1513 zcmcgsPj3=Y5T9L^b(=H>8{cL(;XsUTdax~9Qd3MX4_Y@UR=`M=3xQ>UHigB7LVEP( zC-D<#()bDb2|Ri519t?-)3B$i>jWel(Zu=#1{ zS0?oiAOV3T!^nVbu=@|OEyZLer!^(o(UhoHh0T~M0{EzFB>)i5Q*S(Jc3hjStCD&xsNV&nZYcjVL?>#%L_n9q0I)>ZoluY*#zkRKho(e5By6#Y z05QToBfTGn=LG6oL|rl~R$q$UD>HF^a0&+ymgjjIym|GeR#7*7vWNl)tsE4X>a%I?6*Per|!RLM}(kIB0LdRklOwc3l5$H5_K z-^3lV882|otN15-#yJ*x{0ZkhUhMg`0yaFSRgv=&Ma-j!8Eq9`QT-)3AF5a)XiN&c z?-yP2G$^z>#fFtXv(0?AZRRfsCU~|{i2bp`ThNh&ksA14%%guS4%!$pTr;P zoF5Wjt2TX;JMMO@eAgb-C+%G09Qo~sd^q=+;zNZh>9IJs`5}QLxg&~XA^y`K@jbaB zO|=>$dw86Ey^C{ZRPE98NW~_+&;q`oWkxOU?S(VhvPYw>mgl*%%+&G5r|vLgKH4_7 Sv-&|AAo#%%ewr)zk^c>+^Y&B# literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 
index 0000000000000000000000000000000000000000..79fe53b6203893a919b73fc9a93777fa66f579a6 GIT binary patch literal 1513 zcmcgsL2nXK5T0EY+$N2|#<$r`I1r^1mfJaXzCeFOwl`6}@OD8b%X5Q>K-?#5&tGsS8g=N`p1p{mVY<}AL zl}*0`NI+mp7#Xk)cK1HErCExlGP)Y==xWrf!DdVq0R-w=89>HFk)(ud{*W?EG4>#p zHuSaLi9eaPx}L+a~1#=*`?K>fS5n=LG6oMqM#0R$rRksW5qQa4H9oR_cm&bTwMh ztZ2Qc#^_SO{Gu*&vA=b-PTf&|IuPAEYJvDI#?y${L zc%Jv>w`yj#W5s9$T~Su+i(-jk9LHF8T*m!4Er;4%&H?*UEygAn}Jf z=Lf{suFqcQk9u9Z&~t{3X(!)2MSc%MKAa1t_)xJ%dTh=een8+z?vNr`i2o!=geO;` zsdjVhjE-`zc5%*}s(pGMsW^leTEG|d?6~c}J#(j9&Un1l_I+=jow@$x#2aO;2iw+m S&e+cY1V1>$PjeMN^1lIVM)pPk literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR 
zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9de62574605a07b28b991ff2c736fbf6e3a7f45b GIT binary patch literal 1513 zcmcgs&2G~`5FR_Oait1{inVMhhX{ey18w6*MFjP-NsAQ-B?%%0U!ufulAltmS!@OGnx|bXiD6x!d5~R0Rq(ZGJuGQBnTBA`oS09YdIRwPJ{;-av)LsQ}&61G@H zfCOP5*5<-)qNH(i;xⅅ7#3Y>fTGn=M?H&L|rl~QD2(fsW5SVa0&+ymga%I?6*Per|wRLM}(k88I9^o+K_>-Fa+kHSOJ zzKOeJGhXDJ*YHpFlyfZf_+!p{ywnS7MQnIRs}kpBikQa{GukS>r1}eT0aU3>(3li? zKPYXms0F(f#1fE=p zrdsyM86M|e?ctmmReSV2QgH|`vXC!m*-^`Xd+tuQozZB!<@??&J9Yi>sW;4;4|dF* SoPLl22!C*dpXMrltj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(M%SvVG!3&e!nK$$1&3xa^%a(c7VhT&LJ0&JF*Z|o4 zwDUWya)3Al!hmhCyZ1#_f+bSA8t%+j8f-3!;(1YRtq35aqezkh=65N@6k`t(Nkd=j zocg12qwP6tT~oA6N&709N2dBeO$>qtOa@d*Gyv$Q*=~u#A)(}3fLj3{h|O%2SeT)! 
zVGa?CMzO#mv6yVWi_eMOL`tK^*lYN1-%D_qZ`czBT)4(zi!=f(cz)i`qp-Er^A@IKryf6yI?7Qd$DGI~a75A?*y z>AC$TOW-{3$ycsQ;4ggPAh;YJXo`_C_E&K8R(A(hJ{9d!(G*kF-p6kP7%6?7SF6uX z9tA_vzsWl!Q=aFXSMViy!Z`-I{D|`|FLXsLj|op{Rp7ix7X8qoM_Yv#6n{=ilqwXd zG&*_S6G4wOHJV9{Y~9YicC1{cW#umFC)T9huXS2>?Yy3?iu_(#;KNLWgK`o2m7>5a zK>UTy`2pdztCLsR<4)Vob)0@}+{)I^5Z}WY9?k_)Je02xADeTBA5d`wcSx3GR{wF} z2u-R$UG4hN860O`?qZ)QMZ0t!p*U1;CIMd1)5E6!=G+}`Im6*r)Azk;dgA({Q*V&A W9&B6N8Dl>M5Ik^*f3PzC-2VX7ZuN)& literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..37aea5168fab44a7bd45091bfdfe5871bee8d360 GIT binary patch literal 1530 zcmcgsPmj`25T8=$b`cG`@ilG2frB<2_77dMM$8_3u)7V4E1%K)i& z>%Wrm*8pMgB<5QJWSQJrAyE=15|I+B9IU8vkgGsyMidM1s*44H5IRvo5aarHAwk00 zCP;BjU2Gq^!%@BESR|%Il~025P0*{l{69^!6~=&s0970f0Qx0zzley>h^0OPyjlVg zA|SzapIP{62t)z5DI|4~-;%$s%A)_p_)SM~k@LHD5bzVlH@l#}kDN2Sf!v1mkyCf= zo-?*N&7XquThQNz^$#KvovwRDx0yu$H9^5gKRv-Ry=Umj(TF0clPbzoO+Vnp$xaRB32+?OuaOIN;l%3unjk|N7Hja5g+pqFO@R zn!{0?-&xS-Jr(6NqC|CB`5d_kpe588tyCWG-}8o~ehsZNna~`iw2Uv4hm>NVLw70d z(0qr-a+vUledQ@FFpGX*(PK;bXDt4NDPAgHV5QN?(Jl`50~csYnO=#*|Vyk=#5klAau%xv50RY%QC z?FjMR_3=>3k>aLYnej0xwdgi0j^K8fW#ZR=;5l3~m1kYe+Q91XrJrwLp9zb0*nWg! 
zv3kCEctK4L8t%(ud$ei|2CEI%btcKN?G6u}e$u$JW~`;PtptGgzz*JJCA{_j0OaiY Ae*gdg literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3d1f158e9e79bac193f88f94d2b548b79827778b GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJS(ac$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1h@mC2oUSy{w9@^A|VnfO^GT~Plb&+S=t|TTP*^}6v&b!gYiR3iMYNCiKMQrw$A*) zu-5b(v8KxE7fJml8ApclKST6QA^<`LQb{rZ3kW4cKD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t uf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEVGN)|KlR_NHuSUr literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJRuAc$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC 
zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A2STKw42<95IbXmGq9LDE<4uke-BpVv@1u5v%nlfGtK{}wpq0xsL^P|%? z)It!iTE~h-=XyZ8#$1jerle+L(%BMRu2v{nOg4x8R)@Vqhqr{rWBh*%avcFxh@}I8 z${0K#2(%w(vj#>BS?oW!CwW+_r$Tc!^>TgvR;;Ehjb^W>rC%DTW!j(jqK~RLg;%D@XTAQ>+`N|aoE_9OjR;7<+T*=-d1vynh=sAc-ahFVQ5f)upxq6#H=1VDIdoGJb%!`WYZzwmjfD_T zOz_%x%S@-?hGJfr?)zht%aCjGZheNAJ2PoqML)5Z|Z&)_}IN>8nA0bGHQ|Gj%c#d!l8VIWqgpA56-?pJ(8pCJnZX$71T-V^r zYi5lnendLSYnNsYD|hui&$Uj7i{|$}4DAi)6Kk7(gYO=Zxp!r(Uvsf~aNX^=`j}TG z2Us3I6)W01Uk{tk2&{ACXQ#f8d>9rI^kjb@{l(@=VT8|^x;8XvcdP%4xKo3H4r>n* z_sbMYl$h`kKI-Vxaz}l~SeYVPkePm*?->xXJy)5t((TF8bsK9^Z^fM+kT*r6myekGFC+lnWE?|Zl-k99L zKrVzlUEE04M(U19TC#{?Ze7GMYO3s;S(vtW>BKclyCxcSGw69wh>?>xIr?WhwvG8# zpDr+lqvzF@>Q}Xw2wz&)|9YA599s}s`n>NAY_rlrwDaCwlR2uc@AG=(QOC-+T4vnl z28i0=;EUVe|Fo`=^TwlZokUEPRS`*V$MP27-j!~(d-iqM3lm<+{d3$3VjTEKub(Uu z_c|w3Dsp2IU!A{6*E5UVb?7OXcG_$&>(_6=*Gwz%tgUl<=7`W!0fBA}T*U|Fko!WU9Bg3IOcP!l(X|{UZ$nTP*@(tkeXf*yAOZY$@dBhKGZhX zA;#UBUZiJfYim0p!uHfYbzf0afKKkc7xN+_Mt%Eko=-T^v)&{w?~p-kb9LxV=jt)G zZqM^V^Rf?*J~6`I+BGOmA*SALuF)AaL3ic;?8MvF)v3w*haJd&DYJ24{bWq z$F{YFjs@9)+Hj2;OLUVD9tzkq%{1x4gca!X(>uJfJM5OGD$Wi@@x9rF+zpB64F~mA zEy+Tovqx9nj;ot{rbkbbu43OiJ0s{>{n%pejnD-P*0i^Oa%O-p!fnCFEc9>MCehHlnoVed&ole&a(MmrVnn 
z-EIDD7eWi{FOubC7sE6)bA;!zw5p55Q1$b31{3}3<`*QqT6cK{-tRYrYu7|yJbKq% z&UAgQ_`-#P=7Le7-U7PbkjSXai$z6fB49z zMv*)vX1~rw3sb+qVUK=s8Ft(C=)Qq4_RD7-i`o}8FjxOR;l%FyXP#^mA+7<&Q*uvV zh%_9(nPr)oU;e|d<_RX>o5q{2NOBs-$UWCr$852`bg+9#n()V2KbM|p^83){ymS6k zYJ@tajyYqGe&bJ7-CYmpMwi6A0esDo4Y8AjaeeteEc}k|jC<|O94@u73cr;(i|}*e z4Ov1zE#7M+e%Dl7@2+!TV*4ns%e1a58qXi$M_p+0sAQ^(T`h1=eb z+)W!Ut9-l2>UcpU#W61Hc(_`3(VqRLtNE>OeiIF__kF0d+u}rwt_tWH7Z+WnH2LAa zkz|*n`;?Pjdy&flZl(KsUv6(0pGBzgQ|p#b2=jRn;VZ5=6reU=JAa(n1sBF7+N%!{ zk}z&9&Cp}?mJcnv&+3JY@r@1i<*|K_+%?EDpz}C1^^n6}vxSBwn;b@>wj$~%Q zaygfl`@UeWSj(C2MrQroz#f0#yN}DoTaQz%~pbq^96>;A9x1UES&V~b#G)xLdprl-(27v_3M1lHW z62h1Y7!%_NDRq(nc33h7!mdQsBreBgTDJuO9{~UWzlNMZ0ah_spl|^2IV#gGq43Wx z`Up%j>Gy(-Kk50izDJ;Y#h&H~!b0SQV zg2@o2e!N-&K#5UOyLg3|m_%SHi4!1(l*${n52frijw3NuD){P>_#9M1+3WwWM@tKE zwCtT^#bSOsS~5UFpa2||_EgzgNMk?n;uVhL{WiqvYYO_Bg1)ApuPNx?O+jD0Y%*ny z5^C+|2bxci#LziuSIw7L1$z0KE+#6x zKQIEo1AK7;|MZWzwVfX)`wLREFSw@u3mz?W1n8T|m#5C;ho^}{Vkohezf2IQ)hGD` zMKcpv00rWJ+X8Z~8l{{h1mT2`Bsjr<6Rk>fcy#r2c6D~)`hXv@j~_^H7z9Otzrp?q Du#EMC literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + 
"files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + "filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d67b5bcf87dc622ba80e30d210bc1b9b4429fbbd GIT binary patch literal 14933 zcmeHOYgiLkw+@Ji5meA1DB=bYY@39OqT-cs6OfyLPz9`$$&idpX5!3*K(&@CUfOCE zue2&v@Jef~TCG}ZRn&T`7p#g}QB*#w^@dv89<|@M_9P&BJkN8!?>W!;mHZ%?Jv)1^ zz1I7_YwgJ z@p5UtC-t8B=<&s=EL1E3;f91H(Rh zUiYNzNoFarKGHSEA=x3v!Rx~H`w2#fw)o?VFtZTUGy? 
zN!4WPt}&-gmlmCm7mi68c;xD!4{QDKq4b>d0f*eTmHEaFn(t+#Fs zzj^6w;i&!JqncS?9ADS>Noi`2sg>;A;ya_?if;6sn)ozg$^78z<%5SWEU*1+@~xds zUmWXF%dBildvE6KTHldxOiliL^O$ZwZrtcn=kD%a78CPlxYzpu35wghoh#~|99}S> z*RI{e^6IN@4)L+fcbGnX#JHVB+WG1SPQ7c|&g`J*5twE$eYXEW=8bRGMIT(8w`1Dz zZN`E@R}$}J%70b&cfDFVk19&MA*VevP#bi&mMDxdZ8)D z=yreTy7LuP5AyPxes*)~?leKOBl(}_ruz7H?=(+1ryiHbX6J+*F0buQog@M$abq8U z_jp%L-c#CbTwcYh>Zv||pL#s~M8JTacVq9pu{k2%DW_+jPJdK%v;>B(*y6%|weIqp zsYPQlYLu7Pe!j|e@coEg3g5rq_IP;rQ2^i0yC~}Vh=A`NCtdrm+m#cxnlJK0g1-Fl z+~gHY%PR~uiHk?96L;i1_1(Jmn_l5>mQFZOnB05TtV7XSf9t0mQy))IvgiU{~EcM)QT65yO^Q|y_LH*k$+a6Mt z2Q#YqDZL~8wf@kkvSU1Zu5Z?3 z=8NR%YcfB+a zYt#?URZXY*Pbu$rVB_{_yO;Mb9aKn_udUcrJ#K2*Wd6vTJN7Qo_uVo7p2^kyjlk2a z@z%YILQLX?Lf_cy4er0!S3O;xAZ9K7{>LW6mGT2F?7It!m_6TIiN5(HqV2R!-ICTX z>h(tBPq%-W>UOB6msf{O-QEYW?~m3r#dKM?G5uNBcXy3E8>=3cernByE;;Ulx;rlm z9)HF=^~C7do?>F#xwFDz$38N@d(Q8tp!RpD@`II@+%w(w{5mgU&7y0x{9Z(tsS2+# zyMB9k%e1dP!-r1x`?lX6&Aw|##{a?p>^n1Pj@OJ)4`Onh_KfPeW9!yy0KPj%0{xx2~e_ISK}^XJo_=e=>7UcXJOK|y=Q_PgWzcewuP z@u+unCCsPDYt z)~KF_U&bFOry6=5ubk9j+@Rp&qgHC#$U;gCjp)>SzEsG)yRN4@MCR_VTE=$JbVwbx zwcUkdY3JIwHHweCR?6Bt`o_$jU3Ast^I^wE!>qw|Wj@~>pR_v8=fUO`yUZctt-Yfg z%bpHJF8tNV{bw%6|GGOxXZrnA!_1jgN2mX><$T5XlXo2wt7x(R&|P=(5>FQmsvcVS z>B8>Cmp%vm)5`t(ib_D}g}OZg^M$&K|5<>`s;+WBSPW!}lPa~wjVqGl7WoX3Qf z50@`;IeRK;K@|1Rh=&7Ig~b=Al|LJi*1z8%s>Uf}p7`a0gsxkDUbLqlda5#g@?`G8 zGOvYonv#-FS9QGnC)pIbxP#Y0yV~mqy(TpZ0up)!AFgKTNhvT!M~n{+38Sv}ZxM?}QuM{6otg&HMP|mXPhw zG`0IGKl9yuefZHG`&Qp?)4AKHe(QfatGF^acv`wg&Y8PApLvYvVeD`(FOIL7@ZO^n z3nN17+mf==*GIcJ8tNARnB99vRI%UIDE$pTw?L-u57f=PpocA@ys0G@^1SNPINf_(N{;m%ssGX|F@$Y?~FSdlKxi-f3vM;kkTU{ zzWr6du3y#pxE;|qg?2dG$*ZixmD6kX2J7#R@|0CNIs_P(kNC68CrbH4&)$c&$&xST zG9Pngzh9{we(nI77A%@}&p2fY4CE#T3D-I7WxkDUqNVxUFGH1IG&}o8xp0 z!RlaYhCzvDK}4x$QJ{4c8p-4KbizdIgduX6;Kk(894peSnG(Q4o)9TLO`;UZE`mX^ zMJ9q3u#IUjvLKJqWW}bWrKV>pQ37Wnl-NXdi+w0KExp^5b(|@aG(WNr1d5?ZDLh<0(n|hg2_uEDBLx#h z(gY)dwI&EK#VCS7Z6if}%funRh zfig%=6rsh)!9tAS1;l{`yr6y&5%3bh2pqBiJeXn?Pw8P?2uQOl1ha|aY48gPIO2=| 
zGlAj@X%Z%)S&}hJ))E-IA_dD4Rr%N!(m?Qfc!RMdUvNev1Qqb5M?3}VW|4_wXtG!d ztA_5_JIKO=6LAl`0I69p(AE~)l!98GiLn8(#IV7A$rjX#9MYO;;Dgx&FvX5Gg1M|{ zHX;ZdzRo9%G%OJU4eM(K3v8+gO>8sT^*m=04G<`a2gR+tWkV~*NIuO2v9f6aOb`H@ zt#Jh;9Vjh>0B^y4pd!seq&Sgywcj9Xz-SQ}AkKJZNe8TK02&HNOBLf0@SMrC&H_HB zz<(I%CLX-rIu4l7V>!M+i8KtMC5(h*L}~#;7&s%LMjD-t#!v=b0M}!m*xVpZZoW3`qlA}N#ANQf@eihMvV2@XmGplb#SM3~}109i>~Iq93gh?g}GgB-%oii9A7 zthIy+gWN1Fxfw@N1n}4#sJ4~utyEd*j|76&p+wq<3SsG3+m*mz3X3afI{>X&>eR?T z4iowX5V16vfJ_3i5KIBw;5oD2AWfv>K)02M1RqEo0X~^oo-QO9fMxTt5mkh#NUZ;)_A(UIj0(uzhHUjT*E(fK$7 zyo5$jFJKx3(PkUK$4CPmKo}v9ON@OP)Hg`nYpWgah-wd!Uw z^&ksFn$yw{=$A5+(JTPrrR7N0;{pQ$pqKKv3A%HTQ^2H`B4o4K0?5RKf*gW9g(#zm z;ff)@V7!Nhhd=-Y4)Q;U^bjl0iHVT$LFcRq#|XfJ5wu{nnPF5S4ak4t7m0SDOIogw zQW*)pfPxfA=y5v7NCr}XplG2u%EzI_Igx;*DS&<#!U`dj`A}d;T8=Cr`G{f*X`Yj0 zP>te2z9c^5fLjFuvn?4C36xe~egjHp7{lcwb5%4IX_yRi~3}tX}W5HR6fC^Nq29dQIl#(QhRl%rO zEN}%t+7gig=#Hra17r6>orUW)h!f&i28t@EB_OFo6V?TgdLt;k8q`TGj|SI@P=28d zBg`o$ZI_WeWio?F5D{GJv$)EF{_-puax$Po4-96cas}2k*dU6g!NI{6i$!hXfjd&2 zkNXGX3#h`N4iqRM7-j}+n0b}1Wdos->PHpkg`kp3S}TQNVigU&Z5s$xzmSlKegi`X z4Nx0IBO@PYCJQJYb`g-;Sd>IVZUYB!CaKmPjDiwGpap?=S|}*S=$Z&cZ;)4ATZtVb zq1b{vnMCJP@?@e2C7JaQJ)V+}<;)OKHV9WQz)k=bpk2#BeL^7Hm!vjW83pqK{^KB} za~5QvplpVS0m=l@n$sqNC&5Rw0EkGj6*4FjNP|ltL4-OS0s}g27Gam8M44F#unh%> zH5=fOShEQ914md>r3m@?g{@d_v3{8-qCm<-SPvx0ra@Zt1`&cU2eH&yc?0Pm6BhFm zKyJZ!ff~(7*-9h^ixEQ?KnL4G)$)GfgMoM!lnsl6HAEA<7UNSQ5>(aDz@m79H$sU9 zqet;jw4f=N2T)O@Q9NTbi=ct9w~3Lzt`R4F0-Os;T!pg?PA#xz%FJ^Nr3BY8dvxYkaMG+1dDuR&$dH0FksLNDg4W)T%^~ra=~= zz?YIK#1)UrV19}(f`Szq2dUX$5*lo=SHBRCe}AX^A0b=*`ritUYkd^1w%-a|a7_qg z6{MdD>fZDG=$*j*y#x2R@v-mit%7CW+e?VH@9ph-d;8uV^1M~`?R$H0gne&s-`hhC zWZ&E4=SB9ty?t-r^3(^O9@_Wz_Psqkv9j;&TQ>amz5R%5gxM!WG(Dw*7r%Nj~tDz|+aw$6NXTbc!(0{s6*$xWIn6 z;H95%?1u~NhYRe73+#srS{+@pA1<&TE|AVuzH*ihKY)yGIqzpbTwp(3U_V@7KU{zh z%-IhYv^;CDA1=Trt1|v~$MFBp9zb}meBaY4(8cq~gsw7~w~v>rCOthhd$9F+gjb1U zFn$fxYv45+|AE&ijmDyh(re;08oef3x}}MV(rA+L&1j7#0l&j_^E+JOk5MnKQ(|Dy 
zLC_XECKmqUFTqRtD1Cx)q5ni}Sa`^|?C=oMBDKQ((Rf^GaF`s%8>@-3Sm%{)O5LD~ zLHfYix>{Q`niTwjfbrfAV+!65ODWXGXNg+Le!V6}8jrM@f_uUY@RE5UI5#!POo=Me nWKt1u%Fu$~jFI$D2cZMhq3RHCB|MM8k7Rywm&wxLA4>lPWZ^x$ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..267e78385aa2df9afa5ad324b6090abc65a6912d GIT binary patch literal 3068 zcmeHJU2GIp6u!GGYZrVOL`@AE!v;$(w4Rmamqs6uq~w)C{zP?XK!bQ&dyBd zr)%p2Ubx#!+mwyv0QF_?+Op+i{+J=%~>8 zMYnY96?fnF)mfi?iRbJb{AJViM(yS2e|zuF+(*!2RE4(PI4!hy_Pm;mM_%hGs@}N! zT+gohy}f4!viskNbvSSEu)nM*aAEfQKb9CzleH($duI6Eo2O2_uy4+h z+s*|WD)t;|Us&|Rm1M;`JN6vkb-Lu1aOcgp=+`T}^Xfi2TpjC~7r*uJ#{+M-1W&#vp;ydxAStid{*#OOZlQJKfgGmzqNAk zldhaoEl!8**-E20aZqSnfu}2{dgvpvZrV%KwwY3QVSHtP7N?I)siTO6-l{6k_1CXb4y)hN%$=H<=n1 zbzD+bP*RQ!F-3KpCR&PW!}|jV65<0uE7jm9P+}TllPTeZX6+W{HA;b9QJTjh3URHe zL99B2I`Z#yK~Yu9V~)y@1vqLZG^Mqf;Nl7#4u`R$Zwx35Deu1M^|QO;^<5i^#8S> z|MTehwlh9gD8u*Nj9CZ;g8odlu5QJe%Hfmb-|Q^1WC72Dth5@CN06_2;}M3Z&(*wsh4}XswSx?4dmZUchfmL`uW!)|7_1 zw7m-Yl@_l}4#t6REnAfywreYG7(fJjpo~C=S6Q~k8W7-Xh)B&1k($=TW2=pX?LW^( zZ9Z*Ab4(47jbd%8_xTj?;P%JFsDFc^j4ttIwch50n#t|Pjl@Z;5h{{%m9Hn zNRWg%h%mb^f+SF&DotmwhftZyhNh&Zc}UR;)ehtIz*79ySESPK{RMg zuYgQ+EqUVihQ}lSIB@&^*rf@Ts86DLFIs`6{Dn);;B`PnKtWIeAV;PGnmp{Cy8RB% z#1_S$grE}L(2f7viQh&n1@Bt`&?UFNY zm$ojtt(D>tZiIZui}NNCUsxY9MV+9TgHCXD?T)g&!64fS0)JwT-C%g;Uz&yGTp_ol O8wNhkqX5vrFZl)Zqj*Ab4(K7jbd%8_svn^E-#6&YV1r(OZGmYLrk|07$*$KF#%afM*G2K!!NX zK@teeLxkA_5hQ^ERcR`XJ%q|sHZ*0L*A(fyQiaqnQA~ys@go2c9U@55-P?!tAR08L zmq8}FmOSx$!}iE;2X5aVyELH^^+{CkMJuqBzi8=cybh=cXb2JjYNX4c$-~~M+wbs9 zY)Skn2rAJH-T1Ga_-({e@V-R=-Ov)uarT>Q(TduukmD$M4AI~QvBTJe*&f?q%wv@> z#cZ3gU7oBkw#6C!DWg~Ap+j!)h3xVEO+E+F;KdQDm9R1z8a|_V3CTY17WQshMQ7Y9 zZeMhptA%6S2=!1G=S?ELus&3ZIzclBo#5))9c6lhL8cP~{=^)+!SKw#H1jLje0Etk N4E#8c0ze&q$uINqc(?!n literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/metadata.json index 4a1b54395..a0b26f931 100644 --- a/transforms/universal/fdedup/ray/test-data/expected/metadata.json +++ b/transforms/universal/fdedup/ray/test-data/expected/metadata.json @@ -2,86 +2,48 @@ "pipeline": "pipeline_id", "job details": { "job category": "preprocessing", - "job name": "fdedup", - "job type": "ray", + "job 
name": "fdlist", + "job type": "pure python", "job id": "job_id", - "start_time": "2024-06-24 19:39:44", - "end_time": "2024-06-24 19:39:57", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", "status": "success" }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, + "code": null, "job_input_params": { - "doc_column": "contents", - "id_column": "int_id_column", - "cluster_column": "cluster", - "bucket_cpu": 0.5, - "mhash_cpu": 0.5, - "doc_cpu": 0.5, - "num_doc_actors": 1, - "num_minhash_actors": 1, - "num_bucket_actors": 1, - "num_preprocessors": 2, - "num_permutations": 64, - "threshold": 0.8, - "shingles_size": 5, - "delimiters": " ", - "snapshot_delay": 1, - "use_bucket_snapshot": false, - "use_doc_snapshot": false, - "random_delay_limit": 5, - "worker_options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, "checkpointing": false, "max_files": -1, "random_samples": -1, "files_to_use": [".parquet"], - "number of workers": 1, - "worker options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, - "actor creation delay": 0 + "num_processors": 0 }, "execution_stats": { - "cpus": 16, + "cpus": 4.5, "gpus": 0, - "memory": 14.396823502145708, - "object_store": 2.0, - "execution time, min": 0.22008283535639445 + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 }, "job_output_stats": { - "number of buckets": 15, - "number of docs": 3, - "number of removed docs": 2, - "number of min hashes": 5, - "overall hash memory GB": 7.152557373046875e-6, - "de duplication %": 40.0, - "source_files": 2, - "source_size": 73126, - "generated buckets": 15, - "generated minhashes": 5, - "source_doc_count": 10, - "generated doc_ids": 3, - "bucket processing time": 0.04204988479614258, "result_files": 1, - "result_size": 36941, - "processing_time": 
2.286285161972046, - "source_documents": 5, - "result_documents": 3, - "result_doc_count": 3 + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 }, "source": { - "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/test-data/input", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", "type": "path" }, "target": { - "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/output", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", "type": "path" } } diff --git a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet b/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet deleted file mode 100644 index 92b4e58c722d74a2f1528e89c521ffac58af6360..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 36941 zcmeHw3v?sZm0*=yc1v!%+v;ig$iz*`z1>B3)3#NmzZXcCq_VAUS(5d!Yz63)N+nyi zepr%!fh(NH*Cp*$jut z;S76Ul}ai}B{z-ThGbi(+g82b{k(hcyYJrj%0VVX88ily;fT-BVZe4{m=VLe@yl>b z`m|Mk8!*EjgYgKo>4EaV};BDLXW8^D%zd_Sy#X5TW zj9t-Oj>`(k5yCV?8QOX~dSArh9~mUI=K`Vc2#|mo+A#Rh1HUkAYe~Afy+%@JjU)v@ zuWutC-nLEGEurx20@m+Tu()5+*B0SMVp+oDQR+At(@6mb(pwB z!e%ukOi(6J$rNCjNID%hi_K2UItjkY&Rm<>C4bfdHxk8YGQAgoS$f85n? 
z=&DHXj%1!MXN6qpL^>(X^?$VY-r7N)?j*n8(f=MnUsW|8!wnDNK0}+R?^k=`KxEN8 zGCZ8fl!d`UIwSOT1_u|ptWQOse%$AHvvQFs6aEG$FYGqIzqBvcn3zl zw5KKjT3GVRW2SsZn*uBEEg{wIP-I$azdP8f+NE6920JFj)`!&P{^DhT!IGB$n6WbnFveNBo2>YMV!Vx2UB3!ju2q3g+<*1J??SFKSmE@?JHf& zb3%@O5YvKa>Nt#V|J$QOH}_ze|2R8C3=)37J51nRD`4L&F(T~syIEp_nGS~{gf~pY z*-+pJ0n*w2@|E!D9rFi5tcP&T5Rq|~2+xEg>_j)?p(vK{`llG5*F%g3LK93x--pqC zeMFE6F%xWr4e9$ab6>xP@!`fjA$Bs#h7FftmOi2fIz`!07pAeY-52fm2O>nw8yWZd zaeWsI7CJ!a{^d%_#u8CKz=Lz3SWXsdkFo9u>p4P%1McJD zSdefte*Au)3z!k`ct^cJ^k^tBLHGmyfqDE>?OGx>&iaY4FA&468gE!nU_;%fHK5#A zVciF>z6xs(X(9nV+kI8{sllHZ;`_~B!X1nPNh{XZ0gKbb5(mqX{v(*hzaPUs-o0P@ z%54wp4`NpTK}_%Ki7{a!N5?P|kRJAXVt!&G8jcL?&-mDI7?{ua0q1apfobvqFMNQw zKNKZ2v>mtOJ$ifBScqkq@%EAa@OU)h3B>$;{~1plxW9{{DR$ytJurPCmSEt&KR~$M zt}wxLYy5!&j}mMoOnBH)uYcg1-1{^h&*d5R5e@6nhcQ|oHi2kBYa-~2h6!4?9luNq z{L($Rhn;q_tOwuIHSUfa>DKO@?uiCHOr&p`BliR+BL3(E^S?AvcHmJb(Z%?Dfs-Q# zJw!MX2r*;q5kf!26h^@2TGFrJx1~eC&m|wGfAkm>Sx;K9xb9;;OlXV^9a`4L#+mkK z2mZ0k#kh~(Lhqr72^TSX68oMPXcft>%O|bq5%NpA6Ir-M!$M)v+)@iRl7>yAAiGUvvpuL2P z9rOA@R`vaa#~Tj%m>K=W8jF{?yZaLa;|qhnMExH9#h5Lp#k(kzm8P^y3z*jLb{#4n z^!mLKFQ`?Vi41h@0YZX4c3@n$53@b`nSPK=0vPMM??<}5n7warmp|ZP5A=1v4|9AT z?n?d}DtyomyaXwv@<_ZflQWkaDrhzPjd(GYW-4r6;VWwa^U8-HT=s0DlWr}&?~{MS8z zf;%*rJ^ALdlp&6>_`a?%DA*G}?Fq4g&=_~y#DeZKfBKR($hx0-3=8SM{nY2M2!4ac z7u6re%>Jjb(Nh@aF`Uv^*pHmTwC;c(U$8-u?eK1`CQS7GMGsgdn1e6>^$S?!SQT zfdE)l|IE%eb;b~g5gQ8YufV7Sk2}Ac{t}40?Vm7>nbuu$;3W;?zve~PC9rAHUt)y* zZZFKf9|OuJ9^8FKd&D!&gvZ@5m^&2Ee-g9mywQ}-d*`vPBZMpZ)Zu|^fn{Mf(*CPG zp@5HYc?Yf;a7Dj7ZZ~;hP2o$c1%Uv)WL!-6q(I-V*?m}xvDB*?J9QI{>2Eh!j5JG( zGVXBqi&)@^+jsv$*A6LY8k`@lEp(QTN6c>xQ&Y|w;e(p?}6Ucb<#@w-prH(UgD_oVQX zkFd9nq;Rdn;&p{_d}&}U>J|P3BMyBMb6Dr`u;D-L3By9-9(UahAgo3I5Ozfr*EsA? 
zY{w7(jpHI-`}wyp$5RKoFJ`yj_jq^zfjQ6^cOW_7E z*ckq~BVaK-URZ&^-p#OI*MvOy|L7Wx`h0p5W}}z3{ms?heS5$^$d$AD!_mh^{|xK< z((W5|&$bpWONPJFuAkT8!G+q0=5dd>DV@?>f2P z051DFJKNunKd7b5rbBmXy#C(8uATSmm>=%R38i#$sCT~>j8S;!(H&&8bLVe&?TdDT z38kKTh_z_Fks0nUj3Dx{K(NsQ@$uEPSK0>g$VtG+$?=`LsM}nh7QO9c1McXW<9U|DB59-?w**=c# z=VrNdZiom8nVs8q8;Nu-m0!&2zM~BbLibcxpdc)^@7Z4@pp?kw7X|t-)&uC2OQTm| z*w9E8KZuA!zE&g@RPo_|+9nZCS?U}S(YZoSn6yrzr9oP!hZ$xR(K|{JY59FD8K*%kuR!(QjML+!>zDWnrz@GA&rW- zDwfswTGvu!aP#0pgeYmq_ZB5h!Fl7)@pJUuCAa>3dhcjQP(=Pda_$u#Jjf>OFhnO_ z>Fi6hi5&X6O$K=O()K{wYz{T<#cqw%_F`H-57Uw>6~*n?H=@0#%;f#Y$TtYI9lIcj zpx75AaT6s`bz9wK=#j&79eYshNl*)@wFx;R{D*2~fa(HaPP=T#hvE;gHrLnK2t${J>Vy;go zY82@pz;vP(ExEg`_ue-0D;)X2hY(S3Z+KOb3B!JweAZRkR`N;REbY;s#|ub{cjo zVy+!fxZNnmE-{S!{%pUYvl6=2B2$WX86pyq-Z**pEcyFXzpVpP1#?fE;Q;_7MT|d& zzjnuMFti*CgGlkvY`Qd8PE^ByTzW8@PYRjhpco@YKL$DensU0BMl0zsq90;~%ehRR zOM)#Qt^{`F-@qcNZm6&d(X<62cR9njW?0(icTM>GUNdamW}=fbZu6+e8W^WlCg?W& zJ%pd}J%vq!fZkN738+D4JcQYk!igneu?X`eO3B}D@BP|#@{R@a%p4Nc_Y_6%G5)IR zP|7ebYmoWOFw8-bUlhPm8BAu01Xr9R26^I=!fI&_eENzyDAnK+6&W8s$03a8+Mz*G zUREd8vF`pcyY>>iqLN=7P(3Bcq_paN)xPEm5&%SY8jc1+dTvAv$Q8m1Gwe{Z9M&5KS zdH;tIyVf7~Y+zMcF=DtvX2sqDTFD^*YaTQ?25ARjvW}Qc@b6Obr;2#>wQOu`UDZJT z<0m2CFK0|E^rJ|=C@%kj_ukY-p1vL@zxrPg16q%|9za#X8*}CEqkvcv64;3ZpR&W$ z^K`;YLnhjk608&rx$cCGvYD-%)nYcAD9Byg1kO&|tQ1O# zq$oRYNkIZVMW>ue7$TK|Bs?upDVx&-xo_TXNu_wR!(uYiyw$-|f`z9X4jYB=u$u&d zvzYB%GR0Y)iKJknIl#lhrR-+TVWzE?gvny#k~SyG-8-Bnn21!0GC|5-NH{nr%vIfK z3}D^SjnZa3x>3r2usXl(%eN?|zO61t{;QjC@}ZC7XwKgYh^tD_g`B6jz2D57=ZkkL zBw|~6F(X<5DdlD~?@rvjH-6^TYZ_w}E^$yXp+ z*~;ra2S9gxQUPd3C4_;3*;QcQy9*~DxvMp>rakA{n#z*Uwzg)3|Dfv1EzYfwQuVgf|OCb zyWa>)7&S;aLjBMN@4t+;p?dM3*h0lV8l|-i^{Q`cL^TR1wNV=F+4G-KP}z~>VOJxu z_g8JbPqvW{{W(sa`m{*qI{~K3k{5OvzR)G}lUK9*Sh|uQnDAkF<`XDC@PYG?AJ9Ueo0bv; z%-@OmfuL$Yd)@p1N>`L2NIfS&*3noAT62GOS!2vvMbj_DXfMQQ&ug~eXOAe>fgMQ! 
zHsR?43TugL?TdZA5BHMS{}oPtbj_$p(_cR5e5gOlnl(yCCMej$&s8)97$bD*PEbh#k;E3ZiI#4e>p-`M|A(fJj zfL8se*V}r(*+%~5 zqe|L2Y@tqn8!(giYFaqbiB6bj!o=Rgm!lqKyFoi#RPBDeu5K-kpP=sVdRPU?b z#mSexQx#R%v8jx_9#GZ!yvT5|EUMimlx-|SL^TCxWs)iqs;gESYhN3Px&HeKVmcOu zS$Q452k(6VCr|$qPQLg(k&4<7%F$G@>@{2@W4Yttc7OJk*n#^^FM>_$j@5q zk($9m_jy!^M|K8PiFO8kX<)!-f1zXm%KR4vX=Rs8*4JLe$(vsl8PGhGDie%`etBNH zj-cD7@`W@N?6h*7vHGn6zgPaZ0>2K$QH;Cs-g|NK_rJu+Z~Rh3uR4NaEL8;8U6J2r zl1*A1>l)WDj&)THG~N0e1x;PZXFS2fsf-M)is(|UlxF+!ufe7IwMbdTuRceZ=R&)4 zkhZ?<1fxo0nRb<`2HJ1@%X`N@9+A^sf>myMf8YyqbQWr{1|Z^4ytWD(IvBmGh#Om1P+zE$}kfX$^aKpkN> z|6W1Zc0N;vBg%_XFh#r${`wm@`S2Tvq~@_&Ku{y8@6AJ2%-B)n=Y%Y02*M@t18wzJ z(4`{96>iMX1#OfUeGgzhNIFR6)%%~yLg_A_vZQ*Vw60J-nx`zRoo-W>Di4jx<(+_5 z^$?eYjr!3YxdWo3@yLd=jrN8 zvu8wZ_{soz{*UOB=kFN(J3kpuUN}0v$p`sWt{;$p2&az8ck!4n`L3L?CI66*canbu zClSdF94;g`cwUg)gcE<{R-dOMFC5%}$l!%*$8Y1ML0jj0y=!M8$am+s0lAe20LYEx z^vMmR>B)C3M^0{jLYv&w95ne()5_#K&lHn?BuPqg6SIrtJ5Lyro1Eh#H#;XszFP?~ z@?FeEkz1G_BDW-E(|hI&-rGTHdw+W$z6XP?p|Jz*&m_5`@w*2;yidAIDdoS4`$mWl zes~|eS~3aW_AR1UO71&C>_ay#p&$ph?%@gE?y^{dN0azHUx2Sx@vDHoQrBGt57Kz> z&QrLkDO857A0?F*GD-0cXKiE&trO_N4HWJx=hEohNi*ZNLn&hBiwxUe#g+T8L@a z9RchrVCWXP45d#Hd_)HJ|M!K*k_ zZG)m>J?kJJx~O|`L^B30n>C=bK}Ax4K`$p*7Z16?=3-F0>fa2!zV3X$MG5;>aOJv9 zhO1(qC7ZpuaNwvxGhvm&&4pCCuGhRRI` zE#BHyqqbVn0CJ-`D&C-N1}NREMdh0euXa;Q1!#Tc=0dK!>e39msd|&4S8tN2fUc|C zT)6d@JDOoP*KaO-`D9NsWMxeQOogJpBMq+H=4ghgsMutXbvH1Y0XJ1QfZw1NO9!Ev z!Pd$*7oKIjz?WQBlzVa)V5j4z#NP6we5v@=b=PIBc#0 zwW0Prz?MCILB;xmY)@EtFv<1y=ef({iDSza7(YAQ75 zi_E4|4Y>2G(0;T8{FCuw(o-?3h=1Gzw7M-T62BVpZ_zI20v-b&DJ8C%a?&$ns_>mk zOuMW~T48=n{`tu?;%Oo_WtxeF=8|J~IDKxUAGP_d!It2v@NOYN`ze@tSaWr#7&2a6`9X+v8iHw)LEPNT&R#7 zTanZMKSUab%N6 zodBPj<6@Rp_9Khdg=sppl!%QM5^2%SZWaGP_E$0$y+HQASr=eENr23W>kL}Y)bt@6 zFDB^Gnr&^?hiqcZIajmIZ%aoP66RzX^#k1$^?%wN&+s{aAwjp~ z#37$(Yw>Hj{%-Y}o18vYY$a#ax-F9{SktD#Ps=T|nlsRA3fVeq%lSG#wguo?O~`ku z_!%v1Uv*8E{NYwIb-r*kqqY8+#G59+UfdT@;4iESlJCA%YsSJ1jr``8dRSM#b@p~@ zZN_}8z>kBCSZ95RPy6TjY-SmJhNUfDYcug|9DG9XEwfX|Kdtz)&HJ5r3|fLe%T32K 
zXs@-!K2=5-cNO@v;W$0F%uOGAH}EUWWX1iA=%Xw44l2G21R&7fLi7)%y;4g%b6(ah zcze@ya*5Bj_USo#%9^B|tMPSw`Z)33r#(Y_kV(N?V&5NAoY+|$BdJqNL&R`@jR8z9cm5YuVUjr#ahi?~E`%()Ev z_jPbm)T{YQ=WD&M_#_b1TI0sp)mDC7Ee_yM@$}r>#MbT^>G<@qH7@2Xx55wBw^g3r z`t?;953SBXY_x?vgLSP|ucnU;vZN9VeJ6Gn`AVZ}@#)qvFchl-8z#lC)a32FtW9q5u70a*h3FSSY#rt?S%kQ2tGQHdr#7b>if^~DAEHk-1#ulYJ~1}ca{t$|?fIJL zDTsBtkAaUn1#;(HMKMr_X&1TJG2kieCB*!M%L?CD6J}oIuR>oT-s)YM2wRqY^Gq7W z!xizKn%<)^=DlTTx8}{6P~HXpUUVfM4`~!g#y|#Tb{u6+T=48+c!u#Wk1>pU8MU2d zW*ErSz%w(8o*hh-VSKZYyKy;Okc$bUXBT?*l^h1Qne}ffV!u5h8z3TUSL-LQ6^7iehvN zpi|7H!5j>Aw*J7#YX7P}ih~~yERbvB?PxI$z;BRe%5IIo)m1_TW@Es3<@12=P_)agaRq*vJ z%lI-Y2qsb#;3E1VeZ=_{eI`U7{2lY*pvX@*<65&z@==k8rujeu=#u0Ei{i7(Oc~K<@tmbS5+zq54nNJPy)DwvzLEcVv3(^*F78{Qz>DxRq5cZ~i$o~w ziFjUNtS$x*suDq>>X7--LSBOagHg*LFk`5{qgdgOEUySjr9GKa>Aw)4a#q)RRG$mN z{Ltcf@Wgu5*Bhitu3A0`dB-2@FerE5ns{X)Z`h-7?24zdDt|M z`oq(@_E^e81Fx9lsJ(L;ew-DaBCn3XF%*Jl1szwybBk39`IqcNHQzq!M6wO_laf44 zRQtQdPEGt3<-6{z1E`&)f^WsYL4!>qR|HH94}KqfVDYLVG3iB=O8%ij{>65qb!RQg z(#nA3XBmwLDkGX|aYS(t^o?OI$BaaQTfC+$lm_`iVNg6|L{ttD5tI#%5SifJZ17&f QcjX`U!AT4YG9l{!0VnkT$p8QV diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50 GIT binary patch literal 3984 zcmc&%XH-r&>{ns3qCWSl>YEfiY;u0EYF)g*Cg|rUM zp*e<5V8I)dNqK|X03aBJnN^4|Q9#Jsks(dW9D$=Flk%gZl4It?N2SNpSR#}(B1tQf z$IE$tK9>Ya!~>Ct=d8nMv7|e&v8EDQ z|F^8Ayg`YSNBpWIVJi9KEhE&j&{Tp6BJV7g_mJ;Ou@J?i%*#xQP0pAT6O|DqN?;BN zF>~gnBxWWhYl_=|B`=Zu8X-gszsT=vGQob%EB8+bGY?0g!fVI+c`f1aNk1{`(j&s<%y90=+$>yWbCau% zbfW6H-%*LS;lz1Jhe;*#;N!>M)X?RN!RoV<9Nn7;J^z^*wy6DJ`@s}Kf3XmP`_>|} zq851M*Mu^L@=?qgXX>_%E!rI61pGohpe!P>xl1)Q?_31sdr}Jd)gdU~ih|MC^r?dv z-lD_9E)Z6Sw-8HidBNvtGjN@s7Yf%a5%kelsDB>!r=qG7bt(=Y75gm=L!V@IVq^IT 
z?i|@{LEjY_p>SAA9!iPC*)P1%)z{s+73SW=g~{GDzqmFkdcsP2{phL~vrNvJhJDO9YeH4G$ZHD(qT&Yy8Px02~W}VP2 zN-9xW0cKWhOYy@6lb~@w`!8k5Ih8N^q&#i5s$Ig-(*zXGG0| zG3e##BvST8Cj?H(h3Surh>Ks>qPD0P5El=q-cp1b+QabO^>c@~4c&yxAJ}8TPnRL~ z)fM#e-a-=ZUn1*vxlC>e97$;v84}TbF&M25;>Gn?!jnf00_#o;X99O|^P-}SD<_+S z!~Jh5J%_ERoNJE_3^o(o)lb2D*K)ATJ$B+qPVXy73qAsjY2<<>^Kp9S3)w{j}ReYljUdow!7~`kIkPz95O$ z6Di$}@+thasjE?xlFPT8`W;&KZ=ZPw3_1Ba-LRA5LvK z*^By$Q^_PfVQL1-r*CGM0g?}+O-d&WqKP3 z*JB6^zL8S~=d1D5w!>lwZ4x;CDCW$oR-wAl{b+sTTHd=k>UxF9w2>rB@R z=~X-{x?b?x-$;^ey_94JiJ>x%sWGz=Z~leQddj#kvhX^wc!f$&7}rQ(t4C?fM<-_ zGRCJ!ZTph(cX1En5A^jab=5kaTNqarjLs^1HITv`lscdfRPFEqkIdkA{z^QuF3;;J1@hu?=_2{{i@x{Lf`9_AxCGO%Ax&kSVf#j>}o9Z zyE%6Dsipl`qPk@p-Lk>RZM&+&e*WcMQ&vj+Z;xAW{YYffcK1ryPddEfG-`7t_1Mj-#P$FaZ;=r$zt3^gGtEAKJ zxv!(GXEzs5``UAp(G)c)3%c)}Z{}a>UJ~@ccl#LY$kVH2!4FkC9Fw}6OM)K-lulXp zhSZd)w%erU$g*)2O}JPA1#c}h(ggnScL4R5mX57}7$ zvzZ}{2N03>`!Ir^l#tA#qu6vZF(EyJ8=Dx*LJS5^)C(l-BPC@f@c!%wHu^1=>w-|gBjo2{+=o#2peDofwlk#q91{FZc8`NR((I0Oa2frdctU-DQ zM*kH&(tZb)5@VC&G6v9JFOob&@&XHay*UGr{^t7T2KZ^y4P-ZAo{f~k0-vmbgFT*SUyCzApZnQ)w=Sj|w5(CH#r9wqe(A3aL1A`?k zBW+l6R45e+#Wd#2lAl76tx(1(JQa#Kg?G)5#Y`8ZrSYW!7hwzg+j$H%5av|$Q8CMzcAW9z%Dv8Bs0cy;==H-pm&qC%-nGqH0m=YcqkP;E5DV>QoZJ<8i>F?*mG>f6%<-@j8nHu92lEDfTJ}OpU zqm(@uf0~b`-%PU^8f?8n;m#ht64@T4MtDwK5N*GR>B%@&iD_~++kYm-fk$EPlp0GW zbEY~*9;9gUDc#>x(6Gu}gYtQ&+OxdRy&;+SzYE;^X unXwtx^HNi-qten+vQ4vM)6x@Cl1=TL#@dawm8lE?-hJRC2@uHqFaB>f=8;VR literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4 GIT binary patch literal 4763 zcmc&&d036<`@f&(ec!XEsFQP&igZN#YK!TV_K<89(P>Z1NsDX^MRiKH2q6?j$Rssi zDpYnuri3Cf_GQL0Wl-VwzNcx1S^oH4*YEm$pX;3GeU|&aKlkT;-gEVQ@{)R62uY!r zaJr|ECxih2hC3F2a%Ycj(%o2op`t81$mz2car&$PfNzNDUVsQfVT~mZk+zT&+W<>VEaC>rVuI&| 
z%i_a@SW}?cj5OPjWRZmPZ?VM2bh04CKzC~N1Av|f2wgqtB$(vQNd#O3EH0Y_z6u8f zDjd6hv=!4-dmoJJS80nleLEr!`rSs8rtTGn5J?LOS;HEs2HjQ%Yzc5Lt+x< z1j`Vtc21QDYTpj1>*uEcyL_+oofx?+1U{gG*SZdig&A<<9$>$Mgr>v8fID0d)HUp(alRU?^RtD>EJv8sG9TtF(gkj=I^3LG z3VZq`fsR=@x>Bl*^iLcBjVxEtKXC}olJ?(+~-T;{{A{_0R)Hw4K1iC|vQ0cmdfF!^u+h!0r6ZfpqogDN2T^I>=dg39a>AjxLIp{+8w zcvlt9U!&mKPIKg5QUJr^e}L|#&Y=I)44MOT;O?gwSd=gewx;cY&kN7N7MK07%Y6de z--c25@+V-V7z{&8E#dW>R4BeV9EQaVK%KjPLt4R)P=XiHuY~^!?dO-1zVlqH$7Yy`oCA{`@qL$Gc$R{4B(IVBmXpP$lu(aQ^fUY6F^ZHiaV+)bZydk$SI$RDs?*` zR`ob4O`8Zh6Fs2j>R`fPV}`@9R|YAoRACcfs?2pj|XgGb!~`;$7fSvV6+@> z-!mKUtgt}U!>fpri=WY)%TI_2r-rcg(-z``11}Qa`rA^*2iFr(=Dx%Qh=;LxL9qOp zD>Y!nVlX~d!|Hz%p^o1O09g^3e>0xYS}cHq|Adk~!UpO;M<-FTBT;IF2TL_PB7M-l zg>6_zGL*eD!VvW|hzY6HGV*}jA1A$YLD#z5`@c#(&rNR(;C_dGQQ0);EZhBCRl&B; z_fUVsJY=D0y!}8X9a)ST6xJ$gPewx07;n~s z`_9N`XD*tzYCEiL)rI1<#-y&-5RTb~bv*v*YCPZX4Z7z(iZkra6BN!f=3OqgWzSrn zqQHjk0&(b8-_=J5mMJHTmEs)gX0MZXqss zWQloiuRzF$2K4@ZB#DcbirXh#A=i89Q~WL3L|{)aM(N(1u%{#8#gjVT=>g~Al-CZn zLKdjAf821ediV>aZnXgwur2YRLF)*1`YUkVu>wY>9b|bX%;Cg#e9QHEK8Am0=<U-s*sUjswc;JpG3i#i>r4-k~35BO0`%=n&5PWlv44ya;x4RgkifzuSCR0;L zPR9*`)2hW~-#<#+EN#G=N93Hr`-?#PR~7Ipyi8nbHDkpSmnnO91M<*Sk}&Vsq}5t5 zo|_q)j?NNQ?2%JmqU0C1Ab*oHF{(#SYmwr(7%+N`Ex*H}sQpJ57T zo0716>O3^bEs-a=eFq-uEF2y?tpX=xl(IuN>rzgkE?BluMjl=z6kOFG&z>0KMqcL@ zv1329q8rCoVC^ln8n(@AU_n!vp1Xk+66KvEPEIc(e6yaS>4Qnmgpa<|`kLpcXKO4O zt*!&%j{8vLlp{pg86dViFoi2i^0ALai{Sf#ahz}bw2*HU7n)Q{P_lY9VSgB5-p>+h z(1miGEPggXVe**USRR6d$~{r_h>vJ>eI_Ss`hL9dbR_F8rG*1|`oxvA1h|oZ05wJg zg3VP=^48;#_^$04bWNO!FQtn4Sz2}!Z?}T2m*d~B{$VKMYgD0Bl?b?2aG%)B(p5>* z&_cNh8;NCm@1RTZTUbVw-e4c_fK?!<;O~8u2gW6%__cQ0V8xq>p1t`MM|bfdX@ZL8 zvQ0wN`tAk(JU5*?sa-(4IUkD#-`kFKr#awOe+junV-jnRh7$_&{+6Pz7PSK5dV#U9 zmiOnSh-6AHBAGv0blc6+)1e>4rymgtrJ!>*bBn3vk=*5gIU@VrD^du~07IYJ-6<>C zYC~PO+83mz@kN&5Ewu$JGgS2)6e4Nis!Vl5@BKb?g=wn?ng?FkD&3R5X3*%kr!94R zGS=zZr*gy&dleh>U2_b5>-T1E9PU-E#=qFCuaBHrI_E^H(Bp=&UuD#t`dp8jrgOcPeCVQ# zx0cDyWQoV68|RgWT;4WjDa&MAMMU!f-|BRe?cYbYRW0sjneI5Y;NhjbaT%uhRf#{} 
zI=+-`wzFn&=kwO;46|J)lY6>8bhFKO*QStsEjxvIL4BHF(AZ@hi^4OR8b*FK3X47G z)@YAg(!;Ubdtt*ck8O6DmPMCxjb~Ra<60G8&a;lZRg-D8uOZ)VX;%-|djF3F6IW}E z&$2FQD*6lh;A=FxO>!T4d=`zqPl>K}fJWcrKE4lq2k&e2k|jMpHhWzU&N}iH`avan zb5{I8CHh0&U(lCy@c!h^pr5o&X4~$pDGPkj+KV34{h`}MbAD?|=ojdAdm>fSIwP7( zW_%ZV=hoZEmLnloi~Vo^7TtC{@zvuyx8KR{)upKppWF6+!NZHWW?}7ZAL5@j7EReV z_fA*h&#jg6C+&B-lU_?YEwqKWR|mL$eR<|aN5n9iAS9lQiil5OheU-i7i;$JXI==FMnMBcvTJ%PX0q3aH<7BA+iNiY!1i}mjgkxK$!RYt^XDC^Q7C~ z1p@7+Y$8sd4I>c#zDgT?Ef8vK{!JjL{vj8lLSn)adi4)25k4Wwerw4|D_SD_J^sI- z{u6I;whf3_@j>As(J~=P%OnH*h1xRE_B^C+PYWin!J@B=FJ$!^f308`yFZUbh=nXq zJI0$7DUXOzWSBpO89a<{&WI6|5W~$tm znltmrK+DM%fwN{N20M?5^!4!$XZEAL==>JXr#odlM`aK?3A**hQx2KEul(cvX4%99 zyM#$)jK30|rb9R1zPa~|8O+?B>3XRvQ}4|ul3sI2kj=7@`}$0l`}ruT(|A*R`MFR3 zIv=`Nbo(M7rj!n`!7eism<_3$Cu6TPO6tR(>ZYtWU2MAbm0l{H$P8Vgm>R_TIggni zXt{{4i8W(H7deTkKb>Olq0nm%N>4g-njO9GD|JZfJtt+dj2CSqRaUasNlCJIT}jG-rOb2Pn{AkvGK#Vtq-NiYeDjWv-RKi7BbssI20 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ad59ee31cdea4b0c0bd77befe4a698181186db33 GIT binary patch literal 3695 zcmc&%d011&7C$rh-YgLI+!zoQMFJ|wq99sC5(EUJBA^f`#jpsJEwYHE8W4-J3gU)! 
z0|mvUR4LdBwjhYM*lIgxYM-s5zPU-U^|k%uec$_D&qrqF%$)6aW|DIV zmWQ~qH0#4oSF!>Y%mHkcR#63GKPaA$mVNuf$Mstt<&7R5b!y9$`%7=R4ZGP}^kW{IcLt z6zyL*8#0^1a7)rbx_r-bc-%A*ed1JOC>S*pc8%OcFRwJo;2pj{h=Wjq7eG-4X{v-6W{pZ*%<1T;6nN!fS{4(mTyNBF{Xz;d-c=)-k zo)S8EGkZ6kfu_zUXcQ>%lw&SvPoNQ++jRs!mSv*uf(G?KFcL%}|5X z;a8X~UH3ukgqZD5g0N^JrgGSJ2(bMLK6ATnaQn_AMhr&axwHd*+8YQ)DnCda?u75Z z(FP^G9n@E$-cWpWA%57-ptG}1fR~pY-8$kSw1^ku_ll;&@aZ$*$mwB}c=c0cx8egZ z@p}s2#54irEwUMq`Ii6PazAYY>I>YJ2M#bg15l&qa!@C3kW^@7kO;z z25^}+m9lzE1fc^Z$Zlf~+?v^qvdx7kzQc>T$qm|R6^$zS6| z->t;=7Ut8fvx1S}mKWBT97aRZGH~xjXI>WdIpY&#NAEvPQ!WqJSzWA~DlE~gL_Jg+ zf9#y^(6W1%VCy=6YS@V9z)Qc+Uq0$MyqA)R4^+N}+BTRPd7Wux%;t@Up1xe1F*gRy z3d}K({q1YGK76sgCbR`-uX>NaaKi}3Z(#sdFILk{@35lNwp01j69Vb4gbjSn&ll18 z!v)xSW2e~TTm>xZJ75!JHvvgE9j87CYoJu6w^7(ITHyUm#Z(=+iw0^mbehF*Nb=o{ zQsx|_5|073@!Rq6Mc!5%;o2`cG$d0nEz%09QiafGvIi}*D5rdy5H|RSj2ZfAGhSwS z$Gq6-rtnO20*-4|qV_S*P+3=rpfs!zFYZp^U1h9rtbr}{MNu}K-?|r_Nsa~g(@Of% z&9V5Z$1!x)vJm$cS{jyGc`*jH#eAFUXydNy3z4C?4HX(C!`ZsOQyX|AjEcloXnpot zYH9t~s5fgPZ(M5#_$>IAS0`#QtiQ1d9QRBz?DVpR2?q1go$r6ZX-^Cx*V{<4bRCN> zKE8)vys=U^>xzi_{)7e%yS5pP2vy*V(K0$uJd3we?1vIVw9M1VjG0=&7cMw> zMttSP37+uT1!3xS3BHD93fd-4l1=WY z8(T6_&%Z<{kAVkegsBzL!hFC2sZVXea!O$C5Ybt?d8P|wWU{YQ7MsvO}{D6mx27q79MQSG4W zsxMhNf)~s$~HPL=qk4K-BrG6LQ-F&s(V+(=1Fjk^Dfkiio1PQ z3$?WGZhtLES8pYrE~{9r^>X{lZA0JAysgjzJRoArh)pA{%TL7K$?e1*SXnfhX-)mB zh^ZY!<{dXJ(4u!4?d$sdRg@{Or)KYnoI?1z;x z)ABj@k19j#3f%I6W~}#)`8PeAJ&D=U$73VUG1})p;(z^!Wi2DES*$PkGhVHb!nEXcuJ|Wp z(W%K<+5Cjm1g?6LQdujK+_sUNw~}>9Wmj~a#cyBQAJ#)3*7IfSB1pi%Am}Hh|33FH z3EkFD+&g}yX^{&0xpGcODj1aEO8@V7iO4UF48~NLvF!TjKdbyEFZ+Phg!II0eRFdc zNp2!ppRMfqC{p`>Z~jl5{t;Hoa(9sOvf`2w($p+XdLTRen^wxsTDjlChx7#Cj-{_A zoaO0-Uu{h|xnHk>#Vk+h#c9)0-L-J-7A77BugA`c*tg;A>m2_lSdbW7T3agr$9>*k z)Mx!)9`9fFk|YOY5@VS_veHHjU}-HC$wNZsgip~wENLz6%#piXA(zYNk`G5Sv) zB2n%ymnX`76LotEA~1_v`O4*i+)VhvnQ*#g7H5GoB%hNv&e_?MEc|BA;CSC$ZuH{@ zVp=X&a)Uy}vAT1)3&*F#y3KZt4WFMA?>{+36%mreou>tp_}+Sx3_5$hIwK?qLVA0; zC3p6UepY0-dwP68qFl}C>)aDKgj9ok$h>(RAH)rM=(^DL@uZMFMYcNJJwp{SJ0mhe 
z2c7tvtH&1<_G&yNS%iKQ50^@XCO%+(Hg_NoRC4k4@tlJWjlZl)VNpdcie-fhpQplb{r%56w=*YQBuE^EjlP*|Ji^#}znba%k zKdefHT${xSnOVsh={Bw& W<6XzQSSqanUjD%m8X%bbtNCwBcqY97 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fb2a0b13d44e39718519f4b481204d15e4dbf2ba GIT binary patch literal 3684 zcmc&%c~}$I7C(0~NhX0nSSBWrR@N95WfNPa3lks-ii*_A;(}q3ogjd4Z)2? zsen}$#SN@Jwbr;C)A^E+HjNDgrF3er64Q#h!5Muw)7vq=YTnZw}Tgh%L;ox@>`QjP{6 z_5kvZH&Dj05Hz2JOFKP4G{^-U9E0FuniN*Yj)e5(!7#2f4W`Z+2%_~4@cH;A*kV%v zFMIDr=bMzs{YWD?F4u$mk@JwfZvH z#SFQzgm^YxOl0d)C zy8%OsUIMo!Ke&5uA#D465O`$SqrU2YBd5gcC{K+bJIftv_Rl2EbO(Qb^Ekd?PlmL1 zdz0hX^k`H!`%B_wm7DN`R5dbpD0qzjdt{?=8d30*4qg8Bn%$2JPm7jZoFUqUevqtl zJ}DeyvbC;!+>Pwi70B1IR3MJJLuFp0MDJZWL^ZzUiVmOkA*O7y66FjZENHQ9BEGoh zNxUE8XqRoZo7gb@78O;Z;b+Ke!P9Gwa5jH7?0LL_m@>CjxWtw(XnF86S+M6F)KN4- zBE2yL3Pwc=X7y-M^atxv+~SQ;e03mfEA}D>Mz~VmrK`nO#}5%{vG-8-m{%x|uWq0e zv6uMV-jTw|B@3jYLvIl+*N38qM>9#qCtpIu>xB^c(Wm^g-yTF44L?Ej(RI9ILnNs6 zQZ#YNHo?JGR7=!c_anp)&O_4A9q3_CI!SDutGM>Yd9ozJoo`j9aGPhtUt|q7Jd)SG{fh8G6!2$gM#J|lM&ZSh&*0E2?ZlDNi@e=$xY&BG zDkJ?ScJXQl>3JRI1Jcp^ilv)nLHy&|M1gZa5xNrkHs82Wub6bJjF+4OsC&f>n_&B5 zf>@hL4EvG*ZJypb&|fF|?qIDr#-TzSrd|xqis^De+cyrx*&-49828K;Dgn!2mGSTDZHZaSmDPD>cRTWQ)FVu0OFcXjan+T zw!;%3cY?)}G_x*Wy7j-9dTL-P9lNa*_>0 zPO?8ZOjUdr1)g}1E+7=ihdvv2x0@nG^MIh_LDlmY@F;tA^s(v%i-b})eR)vL!a^&# zUrOh(nng=(T{J~X^0tP*5WF584% ze_Zi}oymH%u5sOP6TEhzy>k7|4a4&9bXvSgOjD~kPr0YRui`u{(mnAcGQmANGuhiK zGD+X*!{8()^0O((tejQyJ%?aHD*6-zon}Scy|OFs=3g)Z&!)yeB>Z8s|ok8$`K{X*0KDv3Nz`a$6?!Y$+=yP z=8>+O{C)$E#g>#*`4;kjo0L)0oNT=%EM5OJA?IuMFIAPeDrv$}-u2JdMNwvI7F+6* za>1ly=QDf-kQYv3c&fcUuV zC*ZOl|D=}3J(*uD`&fe`In^)2mi-rZvC#8nzr*9N%D(N>CLkjzD>={7(cr@0hg1jM zRb6hl@V{vPFTDO4WW~w=kPC7XQj#(aG>ID@PvV*W$x|7=!yyPa1Q0;TpU*ojun0fj 
zm#}hw+z3t3g7DW^ZBowN4%=f9*hBv_s^K)mbt72)(|t{-J#yzui{1McsJtjIbg+0%V!12A4h6m1GFlPh@fyrWzTe4@*>F35q z1!N`ak{JW5&$-8NFvaxqk#E1v@?+S+0-X!p5>Gnb)8rYV0*o=yES#0+gbMooPYmPojgyV3KlKa~!Y*|gY!0$xwWHGF z{pR4Dj9`Uuk_*`U;}9*Eg7-9>CXO7V!{;%KroeJfT(GPbmSMO|T9i1DmVFL-Or|y; z`{8i1`q(~JWv|8}338WgL(Z)Hq`YAX+1bMkIXT7xm)S`b$sao9kHMMp`-1XS%(bMT@N9VqmqEhvbIcLuFJIP(Y<@bH}p8HD@ zm_$FCqFreZ74ED}h@9LZqE+8+iZ^?B za0w6uc@t!f)T<1N`NJGY-T800@S+y@S;4R=Sppj#Il!#d3@XmIA#x?b zuz#BZj;v9Fo7NW|k9vVlHw=afsTg^;cmVZ44g!4~gq2WmuE7JW2e^WhFbvM-I>3_D zK$yBP0!Es1VM0+quwL#2m*eZ9+O81#%6FiX^-|>Cx&?#_qrtuPBp7xL0o~6cXtA3B z%ce+S(eJwM!ZSYXD)$CiIUnj)s^RE0dpLZa2j?plD7JPzco>hv^I1{gepe0`(ktNF3mp`h zl3-tTgc>v zz@T%7s3q|*=Jy4tF?%~zUi$+6*)Rm{^Ih3f;1&lp{cEVnE>+->=1C<~KSk!CrPQ6c zgJ_`q6#03KH~3w;1zPGj(YkyOGC5i!OO{QHM?Aitb8sN=Xs?O144K3G7hKm)*Sc+Vq3&e z^(>aK8A6Z+bTcU3e}kj`H+$T@_W@4`wh;7j8~nB<25eIzA+N75@yo;}Sa7V3yf`Kl z%Jxqoo;35&`H_dAV#heNtKSo76iz4VOGiN85pmFRx)13&ZztGCK8Nd*8h{K@ka{`{ z1O;W(-pR}O&z2hDcKclLzTm?9x_UfO+31Iw2Q-njM_-}|C+?7;ZT+p>OQ#cCosW{A zqy_SPwthk8DN@KIV1(dR8BlyLn&((D1AO+h@OxcG=*h1n%rYxM@!@dNWrhtbE*ekl z%^yn$eG#uYPKmxUmXXeBKafTC*N`^nCt}*qpArwI6jQrKCLqC`5W--;7ddP71gO7Z zU!Vh4^ckKHU1Ih6#iZEd7E|oy#&Vr>72=M3aR#AK9z1%8CnZ*tU>YETqfT+YcQ+a`f2o z6DLodK6Ccm`3n~>T{i!4j2_v#SdA7%S`OKHMFL($zI zacgakj-{kbBbAq9G+AZlWwH-Ee$7j)MDv|RYiL;wE!)7+-J6H^&L_JYe>U5&M+{(Q z$pj0&5Z(AwEa=h@3%YbzN_u-08~?$n1k0)BzVBls`&aywm#ND#S$Osc5}iS^BYk9L zeK04z?fpG5-<6=GJOIReV@7tSR!viwjy;JtSlv^qs&fj%#0&w1xOrWFjUlqbyQ)2EcSmq~igswC3biw{zS^r$%KPL+!fkY}%{f~cg{6+l#f4Q-L*-a8% z>Mat;1fpjW{QOCzVjCtgaa2;Uy>9l#)T98NMw!K^ zS$i%#P6ta$*F16TSau%EHkQ;m=oUOv@mjb^ofM!?Nsia2CUdFdc%v=(#eUQs4=xs# zH}J5f3^!<$<4o)Z6Qg4FIiHbByCPP>y zT;xKw{urX=Q1DtfXNi&H?C^dH6JBUJCk`xYg>@LNk`^N_Ny|EydWtrx0LS6-WbLtk zY|0*uMH9rX26g_lf=rWdhQZ*g&d=8ux=zo`H|FSdu6}`o{RS%}Dk;FL542DK3HVRW Fe*i)ig~9*; literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a46cb40fef1eae7a5d0207565c796ee63a4a4fe GIT binary patch literal 4466 zcmc&&c~lffx38-1o_%0IW-C<+Lu zfXF5&TO?qV9RW8mF1Vm^2b1^|OMU3!0g&15RLbs@Auckw^Pp}^Ozy}+TM$xJ?S>=&VL5Y zKa-(DF(Ke>!%>WBFrO&P?gf=i@8A^RhUT`J!cu20T-OL;X!PK-K;|_|d--VAiHs4BrMY;#2YO0y1dt(&gZBdonoKQAOkw`k_O$X0UN; z17=Y?iY+_;1ela4XQG^PFF0qBAV#G5K*Q+*f}5vbF+iN#z7}SJ#if{sy|Ir zyAI*-ujFM5ECQwrzQ8osaX|j575E7MB5|r_ArV*=#VaVU&~T0PhQ(`cqedhe zpl`I6zJ^!>sy-Y*e*bEwZ&4*^wXfge303Rh#nhQx-V;NRFvo+o`o077KC~bDZ_WX` z`*pyv-A1^M`(%bm`W_a$wGplKeF5*e%wl|g=LrmF8L`e+ThW)MCGnVzo~ZWGbok~> zG%mVy8@PXw0zAI`f#`YA0DA*}1KwvcuqHz;Z0_?$eX79|RG9@R|B*Rjy}kfK-(G}o z?nmON=wG6N`4{jscRhlgsf`7Vgdn)ZgAw-B5IlR*%xYEd1ZUlI>8XK1lS<}I1r`r~ zCMH;yDH$2nA<5}1-euS54zWT0cIEn!56R(NX zE8$qGn=idPsRVEqbl@R&pP>Oq16X^|LDhJ163!UDhB5jzne_WL*!7BwNKh5Sm{?K- zv|n<8Z^3!2yWfN+$IcTrF8cWKOE_jaoT1sD{{?fKYzu718t8_Let{dF-2{0V4%kGU z-vMpab9&O04p0#pi%#yD2OAFXxYoUOgxXg#Kzm;TidpOr7dpqY#JBE%hm+P!m3h{p zxUCiR6$f+(`xTBTa7`dyxtY(or1u5gF4P&n$}FPG-uA<5-z6gL%q9)1A9jJ&eJ6EY z^evz;y8}D3qzLob{uC~mh%@Go`VeWShu}z-43C~L3544g!^lNdSXdjtG9SzU7rx0u z-exyA-+mU$Sm3J(eWIA4Pqh?on6MMGsf392lbFytSB*A^2GvuIA2WNaLs4+G8*H35 z3U{<@V{Bhig4VP~((V$PD2SzpT}X)o*Ye6>Z$uEVyyS-8d~AsBTD8F|qGZ&aEMjlh zv?f?bQt7&Tm#efqTmjh{4KSG-0j}iV#}3eRxG5T%aDUuBZ2i$Yuv?x=Q?LC?`v`yly$RsJC1Lc(t-DUun>aM%_6jd>)f={DWGZM+w7idJ7op z90ENpRk+Hb7u3(@c^>~4wh=#3<=?;jICZo_oVv)Q9AoGjkRA5>We9f@;FSs_wL}%1 zxe*|S(B?>D62?$B@NPPiw297};*@2RpPa%Lnup(L%HOnARacrSloV{#cFba~;bvW+;I5<*tMKD-RgQKf5s& zjjLFer;H@;{3%ZvX_Mn0k(dhbLTvTq+q-ec5kv2II zPT-ChAU)@UGt;!2U5Osi2yX5~rX8#-&d?FK5ucoEJ_BJxXucQn4r?&F+TymYN3fgxFEmqz>S5?@#N7r_{wsZB-u6+icWy`v%i+VCmgS&H` zYmQyWnJpi>-BnY3>9EZv)Z4I&?ZlI}jIzKEne z9qayv?vgu|B4mu)P;~dNv!v0-bXQY6Wcp8ZSM^Ww8x^`LEN+a}=OV#oB)(=MemsLD zynpro4^jUPx@e~*5Yptq;i1uie4J$e`skyV2z_n0;}dMi*8pINgrC-*Pa8A-l#f$( z|2`Q$;?vx$sol6xc}moa1|lPjpO(HR>0g)fuMGS@!2$tKq%Cs$KYqjc54~Lczue{j 
z*i8~-m0o@z+f5{Vnt zNPQ?(xh^rK^vEFdFU^9ymd1xT%!%~z_6Voequt5;)~zHPr9WGx6EX=>#{4OV)Y>Qe z<-T5)t3n*ZB!Sd^Wq1+}DL&)6$5&rbx(n5eL06_emQN%(CyfjAvW)TZ{xZhbTM3G5@$EcUTKst9)Gd3vfgB|Nf|G_L}Et`9iylk z$b22%?e=l}o! literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..56934cab8de9fd4346c856340c7d0894b2552537 GIT binary patch literal 4906 zcmc&&cT`l@*5BurDbxYxjx#_6=_pDOupwdSASwdV)W}c-q)Nv!DvE*~qI6LZr3ngx zg`$9>Vtrr(`!gCWF>2H()_3plOyZa1k8iEF-rI|N>fZae_dfToIX^tbUgi`*1yP|M zlmZ2+0A_7I6-xar*H`oX^po_q{-mV8?|Jv*%MX*5mL00-cz_?ZFQv}=`%F3SbZFunPu6C01VzCO^Og%0gERR za)TwYq4Ogoi4hbgP!&8zf?tq97Bb-coj;KY9XL=J&<&670OS>HVIEkn?3n$f2W^iGa@S&#Uwn&qkIT zMvy!UpA-(gVsMcBEjuH{(72G~n6TKS`Js{|iDD!@hzy+{5*M8u6D!O2G$ts)3x_&YAw=V#bvN7<(6$N;;Ngp0{ z8$-xwKH5_m2kXD%gMNND#Qjqrz*G%hC}FVSX(R+5_z0IjRRb%0GF1ADp!_)xmK2H6 z@&pBJp=dI6?l*(eB`%;7GZtRV{DdynkAqxIAu?{$1LBD(aN?$c-3kKw>h*wYs10g- zJLp}=gUx}~5Vg($ruQ#|xoKL!-KGZDX4JthrBoPc+KA5AX(HYB8sM*U1>N@Z5Z`PG zu|Er-O=&J{U7!i;KO6+7f-nf$s{M5{kCLcumTp;uD4`^Ys3hXEJASROw zxh5oZA2CC;Gx&=f8Q z=fa&&v5=PJ2L+kC;B(4J$akrN9qzVpuLwi$mk+|2tZzW4&H|pl$bdc93_&ke6}{T| zD^d@AfRc<5#K-D_^X3_lQ162^MKll_sz1S-dP{V4Y=JySXBzAowTB2&-vN5_^a;;h zuTYEIS?HgbL)6UglOJt*0pI9m4CYrKKn!sfYu|Pl4H=%pEaD!s(|RvJ_sW~-OxY7; zF(Mu-jSGdLmI|C}WJ}f*_CkHfJEQ|1*yIytsM<{t&FX4{c7p_Td_^C&vIU1tX>+hX zZzp(a{Sp)Ib-{OZDuDF?Kh#vw4R^A(!RHML_$brmn9amVWMr-e>+#7(WFC2pD9vQC zyrtjD6~t@EMUT8qmUKM>{#Zorc;}8OSYmiG)e0`UZ=uKHzTCq{6G%QN!o;P=;BAc? 
zC$ENPpDz7XIFc|MYn|qT@)EP~5%UJ{H03)eCh``x=;t-qlLgC( z=INe@GiZawD<454V&br}Lkoxx)CKa;f$5~AA(~fSTgRUk<&P>;Zek-1bl9blM(D!@ z5iTCTl&FcDho!!8L6_d&R(Y0jio2#ah`SFxRotr4&32b6D-?aci&TsXkvTtyCFl2q z6LXuxU3#^ZQ{z1fwVgG?=9I{D6UG^{nw0CX8@KhbWsZE6c=`QU-gl2Uep~FxXkj_% zo20Q*$P{S!oQKU>(#c+F7e1r6O-|G^9@QVlBGojQmTtq4}$eVbg#X z?Ac^OXnBs}nC5JjlRw^yEew2t?z&Il=-nDb5ppJS=NhfqzFSuExUJq;(}OYS?a>%Q zbnzy5PR#_bgO|y(KeV7;$s6!Lx)tvjt%y4N{INde5H)3PIac<-0+V}v9>Rt$pttv; z2yFKf(QVuF#1>CoQa)c35B?B}p*3Ed@Q0(}$zZ44an)0B%(H}@B?%r`HN_Cd-+xM~ zjn6@4Yzs_7V>8ZP^9)=|R>0`YT9!xBTu%H;DcAGyB>59M%azDg3ZR4&cu(zB_G>>N zgX|~6Pfc;`-Yr+4bwUr;p3{r(x7AkG-~lOmU@1$GKV$ zI(g&{fgPE1;sj5_Dhnkh3glws592MR4nMkbN~F=z>a% z3zQFzNZ`~q&%*;kIPpRwv0wm9GK^cmXaH;Gbp2yNs%;c-1ZAstbODk@nGl}`EG0k33Me(?0 z3Qh7A4+?>^_p?f~R~2WxoG*Ix;up+gOc$iuDhifvrO?2eC)ndtk74?41^jGR1EEy< z0<^sCv4MF8#A5z*)-Jvi3ipzd^u44{m3pUOLfuvV{fb2}r56&+j~y-5TjvdiFz>?^WzbX{|e0=g=)zc=(O zcP+Hvy>8nW|Hg=cuHEbNO@q3!L=Jm46poMRs|q;2XJhe%_0^_PXR&6PB$Wk>9JW-Xi%Vmp>TajaR?(S&?7LnIaRDWMNbEkc_x>L>8YVRtq z>cH-rZMA;;gL{gcYV+#m98P$2y}LGl-~5gYu7>m8f`*XOxyEx&?kzkJ{*Q9EV&}S| z=BWP0$Qvi?iVwxbcl7**9?)S}B12c5Q2!_N?2>F6y}F;0>dn4z^N9CD26wU)%y!?9 zfqBZnNNGeh4K>h0Lp2m%t^WdbemetIBn8dt++jb8QledhltNXX6yUi?s?c5RP&sOG z#QjHf7&qIPBB;9TlZA0*&TtWo7gJK9;SwhG+2{jj`J3l6J_50ApY=z8NA&Rd8YZLn zZ(qNBqa0+n66Xpm(|8j5OnH%Y(pz4zW44;HZ&R8|i1S8`Nhz=1s)V}c=(^=urmKdz zZyOVEFlk72fk)x^$g551YT+}sPf7pv)sR|*XPHB`z-lo+(rfp$LhIyr{DnT%-qpU% zi>WBzx;aNuUcaNF{To7h@~oB!Vgj3^hG=vog=nsnGv`p8NUGP>OhcYLtnna4Q3`{U zl+%|yq#~UdFoMB&`s=Io5fPqk=@wGp_{8}z-L3Z&l0IKEM0-5AZ+`P~#1_~T#7mB^ zRWbB!D_9hAYV#g=5U&+XElP^KR5`2Cx4kGi>e@cZsl2Jh zDKR$=EgHPqUYr_t=h#a22-}jhML(R%G4wlHlAbttvCJjkcKhPwC)XOnuN~dKB=y;y z&b1@HEnS-aa2nIS6R?xA^6e6;Ix531)Cu$fBYi<@?IW9&G?OFaat9CL_`=y((I&PkwZuj zmuD0FZ=Ugx`KD>1KmD($m5?!$iIDR}hoK~YUZs`(O}8+bZdEkUjt%-J{2d`t{*9hQ zhsA~`4O6gOOt2a=IBIQh*@~tle}?!Q;J->;w8095ti+IruowwN&_qoi`vc+A*Ywz{ zW=AtNu)>62*PmhyD}T+H7`@+5hQcV8hYe#*2xX_la6*hm_;OnM>a>4t#=i{s|0W9p zRgtF1<3G(t|401qzw$Ew&t4_L+lc~I1CHRW1s_19DO3=9dCl~jJp5(}cM_(ISc&b$ 
zV(~2cVPu+EoGP{t7dwl^;bMny*_u7AFr67Wh{bM9PtU*f^h`EPWIQmA#4{Id!4@;jgZ+GyL!Bo@1^9bKF#9o{bbRRx z=|<+yLFR-Gf|g-_vLUngm3?BMpH*zAOSo9V*vsJQbZ80qGWVK2o0+>a&2Z|n(1+uR zqSx$`Bz{(L0sb@M0{vyF(|EIn^K z5Zk8?pOY+D#){Sv%PKi+BujF5U6y)4jB_%LBg>Pqr~T8W%+U;Mf>1kNlCUT_ENN^= peEe8RLPA`sc1l=6Vq{#bwz>5<^KoV(4^4pI{-B)z@TC7s_AlGe7Q()!s=un9a5mDN zAsIKOznY;L&llyRS>Ml zbh12q95TENf|vf)(zVP!m?)^9AleNdNmp8(5Y)c@)@;q2R@l{HHFURFMvG$}2s2xR zqUnt_!t&9dz=P-9(BS<`%yL}%&{d`t=vs>%`by?Y=}h*Z;_>amm?A$aLtX~#oTdnl z1t!yb-xQ-yryUU#m{PQrXCZ71h^A@_`%BI4Is(I>NP2QxAdKCx3MR~53zjt5fUlQ0 zk=bj56|T#cicOByph@vhVQc6BVejjAVXD|ke7@34Fs5j>nW$zos=D15{(2~rRMa(t z@S*cSRC@_=?q)Tt*SrKNsQ?~!l)@tov8Z+7gf9NwmLRku6Nz730NPiz@Yl9<5^b5L zXz{r~7KQhsOiGkQ;_GAt3nGOn_Z-24yGO*%%?SJuzD`i6N$jz0h&}MQ^@uX}SOx?2 z!4fO0rNj`AAW`tTS-^4L4qB}rE7U!oDGI+o$mCXanV9-q0=f_+ab`!O;6)6eCio2o z538~T^+lIK&44rL`(^dS9v@p%yTv7Bzlf{E&+sB0Z#M!x&!B+G?iW17x;>*qx zXPaE7MM76+i)Gu(xgR5Zu%0R-}=6676Jr>OF zc@k8l=b$}HhrsG$GpTp|eyVG<3pmkWK-m!qFf1fbth#m`-0G2UuNz&3^j}m6CKlUJ z0TY9eCSODDn#)M)dJPo}*M^XnMOy^AS52_-yIDwCa@fM_$HicB!yemEI}a#deUdm7 zwS|aVa1TaVlR}^0;;5p7_u=a=b!4V_50L7=6{e5cOQakJM9EDTaAE3t6zhIP@=f;~ z;jnm17?&Xe4W`@S4D&^V?=FbM4^@=a=}I(1(cZ1l`HrZ*Qj3x*)v%_|Z*bw!`N9QJ z+fe?obovHmi4w)Vhzs-dpmBXUtWQe>o^@*S>K#XP!|OP_q?m)w&QX{wu=J+HrG*09 z<>R^>y)_Yo+CcRNoWwQD9 zYT&eefXQKRCGZfBf$dM9q0E1p0E3TIHhm=nn_fOZZ^jph!dfK6(^EQV-MSXqjP^rK z<5c8Si!geVMF32RoJrv-b%kOUP)5 z9{oMN1`HfDc!>8kBNzmi~n@&xbX=SG>J*c z+KDNtX_L}3GPAOQ4oxO0Yn?Kun|UrH)8|Qbw)xnG!ETEEX9}HUDpoGiDW`For~40A z=*Su2BHh55yuQDUPne}ioSm$d>Yn7x$@+Zmyn-(Z=Py{eXz`MzMa!12Sh=dW;j7Y(Wt%o{`FiWN?d3amR_xlnXK&@cs{P*_`1ZT%nuFgT`r+`AqsNY)IC<*y znX~84U#P9SSl@8z$IFdBU1_>{t@(P(jn<#rZr-|m=kC4s`wt#I`sMMHr_Y|NPFpD% zWNbmfovXMgGcAj2N7`I6BP~}i&}L}43XjWqDJ5%TWE&XOKNwXx-|ODZ%m-_>8{_u2 zF$t3(prCgG$BcLUGMCf9DPl)W?d$3~5HA8o26KCV12+b616Iwl zYHsk0sHwiTy^pC&rA)OAZ{A9v2&#%JDP9F}}|x;eprZ&nv+oFd6mnOODs+ zer|k>XI64h3ajDn^X9QTnBqG2kC7N&RbaG~_mB?pbsa||{_&B^mZh7S7?A$+`XurYOno@Iw~%b*M{2DYbBxI%iEzPPV}|Uz?MgmYrqm?&ad{;-*k5 Q0eJfZ2T1_J@gJUl1J8q)CjbBd literal 0 
HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..842ce2caacab4c58384e71071de8b40ef03e2e4b GIT binary patch literal 3138 zcmc&%c~nzZ8o&3xmlXntyqI7VsuB@dL~inR#F46h%sF$W-%0NF?Z12TeJ8PO zye~sDuFMcMo`53&4nZ3{?e-qKe>b7w(9xn{j~bI1z&J8ePt12<_zYhW0NjOEo=j+! z1^|d$5gus>DWu#76$~x+2Yd&)Oq8L?*QRA_3bPp`rKMY-v=%BOl)_h8k`rHmK>!a9 zcmm+hA==?HLiU!42=)q5CbV*7LaVG4hyCIyHt_GT5ea)U{VZ&EiPn%=q}Sz}(zF_r zMv{XUIoh;LLtc?SpG)KnlwN?!a!T2T6#s?Cm!?-@$CTNrnP!T+gc89&-=?Y$&7l7d zFYpf?h~}lbO+s6D)j3VdO@`ZxucCp9{FMf8gsDR zp}>0>zrnr^UFmT{A4r3FEemQv<@7tkq*W?jo@^_yop72y5vJv9C-y^c%sD3L33rG6 z{Yzn2=u}=?=_rZAZ4Y1$jOWj~83vO#uZ3xgHh}VO7w}QJC*80<%-*MBrP$^~6PlU& z5Z;IyEgW*~Hp~`#iqF*t2ohHkhkZ`Ap5PmX;d>u-ZQ?H7Ir&}Y^oGg##1eV zOX7uD-+F+1w~vXvt|IVR>_&m$<^ig9>`>r$>w8{bzX}*;im>h9e)&Sl+w!n^}sk6Jf1do#dFI_bTeAi$Qw6D4dnns^RpH{R} ziXSDwZABHGy7e5@((S`9q|Whzqul7-=V@xhqczUm)#F43@*Cji-xn+xbQ;v<7NC7A$HK-+2T4%-L0+G!-r!V+85!czV02`WSb60dxHYhN zsBuaIGQC?Tm{IA%3!M>;G{qWv&mzY5{NQndiMmMoqNqk-{J9%;eNuuHRn1NTU#TP>|2;HfEJGbHHEk8$X)pKeIu>!I}l`tY=^lq`>CvxfU5f1 z8??^egpz$P*?!!wKsX`Q8K&fkK!^PfIIr(=DtHe>;_sBa{%7jZJoz2RQm=1B?e#j8 zS+9mo!+wR!jxQCKCDfwg6S@5BJZF?49!#|^Hi529J7IfH2Jk0k4!Z5wR1{bL>Q+%ejwIY}Gzhj|S5^z!x@;XBfAlz+hJ*8|6l4H`FoLNKcenHU-t9uXNeNgX{oCN?g9O2X8{ zq~w&;H~uzldRn?BBU7uJk(He@GdEA4Z!pd(C^Quno8NqE_MEx%=D%ICVBtH97B6|X zbZObLh$Ml&bEAUuC?v_h4zllFE4g|b*cOEm8;i!t}7qN6b!P| z_lVwwQl!txC$(NzNay7gngqH$9Vw!?Vm4FKOODc8j>^MM%H>=ic^=G{mc?F;DGd_H z0l06OUQ+d~5;CDxhE)9*PqBgLtA3f{RaM{qSr5q5<^Dj%FU-u==`{?E`GvHfHv4AD`FpB6M;D*@u|P7)l0gnq}2wP^FM9$7e^7eQw6c$-Ws$ ziACD5z}%GN_-w-0$Kv?jnu!O_UI=Fd2Z70Ak6RL6Fa5&QB>#MEcowT6`kZ^54yF|A zK7Q&{vX3HzC3P-zOFX&wOl8s}`5RJ_CmT|exzw@0I7@y}3BBXNU}1VL9s;Gxs0~jv z5e6HnCi0w++3G(olEWK=jmZi<%T6TA@H~QnF*Pi3X2!@jF(#vk5C+*y@W&xqE(M>d 
zI87Wm&JOcaSe4mwPh7A>3(K$^k`^T{Ny|BxdWt@*2>apkB>LDsRwY+skpwbVqo!b1 tkfDMrWsiGnvO^pskK&o^_N7105SRhIj1S}K< z1;Ij5KoPqE6~P*f2Fr^*v4HyS9byu{B!7Hsz4hK++*9}7zrFW4Gu+>}N!(2-f?7lc zx>AZ1r~#NX9ZhSGjO^{q;M=S}{<%h2IQ;TO8_#zy4!sFJ=h;?d`Yh{Pt<-rbHK#tB zKDawUao`{YlqRKZL`+zhlLTmvn zzF5TdmqrFI43Wl$P?%64e1e3pkX{3wT%;+j*!mpudDPX z-%6ezMv#0n&xBlhMejKIdxi!|fl&eR5kZl03j?KbQl(IO5E{5JASygQB2u1n4<_7= z=@k=t`MBo)1Nd(Xaa4rSj$`+hP4uSYojjs(K{Q)4UjrO2)vC*A(H+ zW?gvPZU_OR1!!+s6m0xP0J=Ht5Oq%%z*rq#Dr1oGb13*7`UqD)RRAk^GL-pRv;tvkuIvE&*|+DHLM{P^eJ{D?eAmLzoKPjudwP+bOVlwgAKj93l0|k7!A}DjXz6 zK}0GSvW!S*KXeF;Ryu&&2Nu+47=m5~3-;$q;p|-%=(s|{l_C@5Qc(hzsy-N#HVj5qnZb*fYhdqneHay~ zhTataiiQS0L~#ZPq9b*{VaqItsrJO`LTZSNicj#i+8iApo2$SXIRo~N*h?%Lx)Vk% z&?Vej}CQ-StOJTI}C47^!A((bQgb3n1*1Y{Fdar*0GmCn{PU^e_?J2j= zxss>IY)CX#6cq^X8+PMd18cG}uM?_UKA@4{icLOcf-0Pq(44j=Xx58CCs%i2DGfMm zPMU{xdDz3xmaj30;lsyb&%bGa^m8ZhYaZ5+*18yb-AJMk+Q#~j84EwPuYvs2HDtlAd00`M zDQeVjz{i(;#OGh=!>vz@VC$qNU^T|q*|G(eWDENl8E)c>pMhAIoEHGA9y^h0tCxY% z(I%Gab%b93iUUaZfXTF}_|RpF;4zSn`evL&acW!?c*2UjVQhj(g9VtrbuAgt;YYeO zi6E=i3mIM=N>-fxjP?yViyQ6RiYNCt!m*gSSmO*wlpULf4_VNQC#l>;5uvxSrO(!3 zPZzHu>SwwkPM;MPt#S+viHO2Vjw~hys7vILLo-QfO*p@FUzK1+m=7vTxP=YZ8_6yT zH9!NG#JFVq3ZgP<0hajI5nbuOqxyVJ2X|fPBJKh7v(h$=cD9R5MKS;L4@lJ@519%w zSv>EjoQOLd?uzb4PNl~P)O6kio3}%O8#B(3Ri{#g-MpiVt(-1UjaE2_WqTGxu2B*@flPp!&)L|#KK?DSZn*$G71vqm@u=mk-vZ5}>ZaEyYS6nqjQDl7&&xMd1z z$mT0qb=H%rbH_n@S0c9P)D6x|=Xjpp&D-!`c!GYkM;#WozKXp#TZ^<`?1)Jdq{O(! 
zT)~~s9W3t0>)dPHJ?!ZBy{P-xYD_byMPPY#6D;jIs2%=5h>7w}<9Fk-NZ$>QkmoRh zWBt*W+}iX64dg}>5$eOCc*=S*Y<3MEd&R8e2PSfpXh}#YjQW zfYvC5!j+Qycs5H*DOE5OZIAm7U$Of(Iv1P68dL8M~zZQHXk z$9Pd&n%Hh{M&9_4u5#a#dp8wKh`xU&&#r89;gqCTk9tp*ZP__(HO3t}y?kqlU7DJq z->LGeQ1JJq&IJxt`SoEvwV^jpR}~zIif%dk zJGyU+{th|1+Jx%Apr`Lhr_n2VDA}mFmu?;Nc*NjNmVwEhn{qHWIT$Gmsi2{H8)&GS zg6`@sP!|gssC*e{R%8wOQIZkO8l()$yJP?_#4^S90=u#i%R?SKro*_^#1uiz@qj#x zYx4$+V7Q!;iS(11)aRlB&Z@UBW_<)=`vJ?30FUY6ixeiK{&&B;eyg&`CI#mT&6D_2 z+f)UyW!yWy|MYZqL$A6d)c}W08j}*V3qsxZ%*eBh{~%c6S>aLPRll4H^QxM6Ea8_ARJcz~ zz}akzWMPC~JzWveJwF< zL&9^R#V{EW<%wj97nA2kkuuKherlWyh!yXZ24E#Qpm@*k#47OsfgMHzibpi?M0)ui zHNfE|yjPX+CyD4-#WF~DkpVTJBzv)un`4oD_@K0Yy{f)fQ|{7$jx8e`v#s)Cg3jld z26s2*#RgyADOoR^njaT>rEE@_S95-R*!2U_j_j!g2@yAsEbZ%VE=Y{Jdm@EB#Ck{4 z(jPlA^}UbpSQguNxx_KYx^Q{?)9W?C*N+z_CqBR1x_-#Cq7}-}V*NnwMm$LVWMz>NrvWEfCTED;_W8^;a` z4`TQNO%7UAg#+fohH-jZ_4W3%8A|l$&-ULkfN09gl~R_!{!ipIDgud!AdIBBP>&La zkRmQ00%!u{{~N1$#C${0LVuBbMdn0|sa!;yFFFhX`s*sK^c%s$7=l-$Ow*sHN&iHD zRDLHz;X#qXaf76<6cc7(dZiQf4o;v6&|e|`0r>CI7iU<2h!qh&2$~w|Vt=BT zx|**0)oo~w2o{*=>ocHOgUVm?Ge+-^)1femnE7REeGE@1!#DtlL`J2r$_4oFQ4|JFm=Ii4g!t6)5 z(eW)?LN{`Mc5)|l5VQ>XlMk7_uk2&}ye%RF9fKuO#$FCjr$dYHm%01gxy;;!X$Di5 zhdvlj7`w<$oyMCpn4gR1*YVKBqUBHVFr~DO4s`U2V>Tqtu8h81 zDe(*b9A|mG>0;CJrSuZXbY|!n&eR~<&tcLMf3qaICgzM1UF1Zj{&a|gheEH}$}Q>0 z>FnrzUx{tv;5o^IWvpl&iM*17M)D*F*X61EMmWUNIPyFhd)hy3${fw0CWy47r7=t6 vgW|>pL`RR6#>7M=Y9|E6#D+#iYMWY)GaY9lcGU#<{STT6`nN0kce#H9l9NZ9 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..84c399e67f3a1f307e88c5325c849ac627b14610 GIT binary patch literal 3138 zcmc&%d010d7C-mC_d)_h5qKeqQBWjqAfSRv5qTgW7-Vt5&>{wifNYTjQ0s!USVXK# zaU1Kxh*n$}rPxtK!37uUhFihbIy1Fe>rAy{zp-^%oqJyU*Ws5-wgSf(Jt~`^NJFA;GKDeH{m<+5?-PSgaJ|; 
zPbsxY1Axo}laWP;!P4d)Z2>w_av3jfWZ;IcA6k(V2!9KFgq3?DnJvd0 zLAJ&P8fseMR{0uuR$0LG@m++2MvmgLR`-;Ad#WD!XEn2%%OnzRKr-$hT+gMQPU1d1 z{s5wzl(6MVG!|M8}=z#kpXmBmUd&|Z_?{O2L>0&QtwKoS|?|cm1!~Y3? zPN@SX$dAeALqJ+s&K{akC3&*S40l>fVc=CYw{hE>cyoh4I^4gJsXhA)y>;#`6WrX# z&a4Qa6G-gwTrWG;9gY?Ac&453H=VBqbJNbnmw zhEdOTgtks!iJ3Xag+zO?doHq!-=ivZXZ0A_QqvOD$~4*yiv12PxPJpSRYfqp+@FFZ z?}6RozNcVsb^+eE;tkZe)=3`Jet_#SWeBuhF2?!sDJUkY&|drbO}O2=sK04a1Gc=k z*KX!ocP@NpBsLTo*t&Ol$BUk0?8X_R*lV)wcBY>?(e*4 zq}*_dIXZbeqp$b|P42}?gMZO;E1SMWKd(2jxh}mSGwdUj9lM{&I0?+UuZF<+d7JPw z{|?7b+zO;)lhjC`BZJG`YtaIiN+zTZVf(*oxt?e0@dDL7=W?Grvi5o-POI0U!vlXo z%Z@LVR!rWBi%w)qZgFayYVXOMFSEe)O}kKgRw@Ks)Uh}248pepPogWTQruRma;Q)T zarRrw?L1Z|cRPN2CUS6XM5Xd9xKjNWX060sUgoMsYb+ly^J{LRHuE~k-~$N|GUF>r zwPS-r&DR^iyLN;_OOOVJ*-u6HzW+PUec}Mc!E(j?Djsz{zK@?xUm}b7(vkW8vf!9-$}8NIDs@jyuiow+efoO#^V0SoFmTXd zZ=WH)e*Qy;4G#z$F>=)CH-g3lj|~w*!^VY2M2?S&o}i1F7#kO#Flq9Xsfp9{NpHTD zJUwNGAvG=CI5Q(NYgTqnZeG4=c7fSaSX4Y`?!33>FIc#!WO3;`@0KljuY766vdZP} zuUNTi^#@gJ)~;K>;ddK9+_ZVi*6QDHtJ%Keqn)+8cJJ9+_lJG^>pyNd@W~$!9%?+? 
zbmVAr%dz7pPPU#pedcW2r{~UJxOl1k@|COCu7B3idE@ghZhm=7`%tOjv9-RB?^-Bj zxmkI%)*H=iPL|nXXUs9uB1$S|3!~_8R@`>h9(2`Kihbn8WWKU2c4`2H;jV3o%Kaz*Jmw|3AgawO#k-&_ z6%!A(JMmHz`ySN3==j&cf`V0PRJ#9hFaM2t?EmG7)&5pblA_#A!D^+7J#MZ5Dvi=n zNJxlH9BqBDlv`O}N&!NsAPDi~OKE~26bqplLWCe>2w@rGStt>Rp{p=Kh@vyu+h(#8 zmu6~#8WLjX4fgdNNfzM~$J2dSF&)F{KuimQjt-%Ex)u3?AKhoC4xQ+qnmDyEJz`Y0 zep*5%<>$r`d~;`!L9`bp8X+JES?!5S%Il(UPD%{OOOMPD3{+oqPr@Ojx9td-kPC&Q@_NpdmGKY?hy6mk|SY7*om zI>Ofrp~cpF62Vd}A|r^Iv?_@~TF=GM^|=v+#E%#!)hG6eD!m%3Bq%*hhJx9JMvG6H p$>d`wD9A7NC^8n9v-0yi`~!#h5Ajp!Gyu;(&_sS$$B};z{{p%vL_q)m literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79a6f24b35cf797574a2db8d32609d38de32a0f9 GIT binary patch literal 5244 zcmc&&c~n!^*5Bt2$xR@TKyFMR%!8mPgMgwq1epazL=hAv3^|k%uTkEa&_Trv#@BQ0*pPNH|2~mVP z(*&JPtAc1f8VmqN+?wBe_9GwOz$1;jeli?I8EGnaxGl>J?%laV^I5}L&hX1)>xTCv zXuRrGH#R6z$X6SzH>Vdh%XJ>pK+9=!2U<)EX%5YiyMrC|eosoh*9HJRSBz=J2q*K7 zmg@o7uEpduA#($cP$uQYC{yERCn_@&X-p!P{E8$iG1~%UOZ}5SnFEsm&=@d{km&%J zc@d%$gP2Izj6w(mJOnt8O8_aLfWtgG%%jUZ0_M>ZP@Y1@j1^IT$NDm-t^deHO1>x4P`(+w@I!g>T+ZrJ4o(+JbAw1W^K>aKUB9Fa+t8e!M zCt)OfIb8-NKMG;d=L)neLl>JX8woW>oZ!NCKd?+00?#MCMVAi`h0Ss)vOjGN#8XG0 z(k8&TViDj@`F9%E=kD^Ou za%6Sp0EpN6gVmW!kY3>msozWBwBAg}pCgA)Umpjbf_R8NWC^WN9N5)Sj=J~=G%Y%C zQ)CFWH5NeZ@CGN%3&;(yf{BNVL3Y3yiZDCaVXT2A?+(L57!B9`Xsm11C|Ey946?8M zAot0AG&f5GM+j?3$>qUj2ND{N9Rr6Yz7X;&2P)L|V5{c9p#mi|-O-1}D4XCJ z7lUdjx~?Ou(|t z1%7WEA%hsq7G>r&@vUhKZ1II1JMymJ{mzZ{g)(S9EqrfevLk z0rpz#C8CXXfpwG(5wg1noeF4%juD%P1GC$8206Ck>jLb-`T9dhA)2u>TTY_Bfi0Lz z+7s@AwpM6ZaSL56eu`WMq+>hN;-K$TDbBMSLmnt>gTvLoB1;IuM%Fu_{Q&|rx%M=i zvCTkr%iFOPr*K%mU%F?m?f^=hkvgvJdd+iS(9)ieSzXoEut{9$0SAn0^FVN)j zfd9CAI4K4Jj97dQemM{Tf(Rc-HXVZfFzXboxp)r0Hf0Rt)y%#TN!PI*KoW5*=i`TzGHXk1YeVJ&KIhj>nec7H 
zO8ESIIr+uz8Q4yZGpZVR3Lp068+>NVBYey`3$9geHdgL;1?hh zMi$1xvTlFUVEICDIC+{Qx`EJ(pK$=?9&j2v8aG;~3sYXNL60WXA(a6S#hvpcD>J5I zq+Jv?aLiFMwlR|2d~_PJziLG8Z+eFg4rsz14sOJ&CitO69e%JiGY=mS)rl|Azk^be zzQN{yzZ!cwXBklu9D=Avo>;nmB^r>Dh83TfL%gP2$rHzdNo9Gmu;gHwc*4Bt=*#R| zn5nHLcW06vdfh6+6~h)22hyUjoR@y+%B$O=A67Q~joxI)OQ znu2K_4o1J6O(A5LZ$ZfDTnIgWm2AF$3biR;!t}HGc=aFws%f8&wd=hnsl;)gC6 zzxNWv_qC#5y5`s1yyPYOp? ztQYQ*j3euO<2c4{dFZ;&R5I-ge_2@fHasB_(A`g?^~M{l$FR>*upzfF@KyQijvD62 zdvL0RA7NO?pDmlkZyM!KlioAU9@YrSP4)G{%uESCMdu{mt@fe^ovjrLhj~-?(AS(u zn+e#9WD<0~>Lv58B;t8Nk=%HnwOj4dkne5vG@}C3{S<-@E>~!R0;w|L3KpW!FWdeWYL5ezyNAuRCuSQBQE^pAJ z@6nQ{H(^JXS0=oULZ0&m2EYnXM#-RuzbEziZ_1-rV3_g`!cPkn<*$ z;|cYEzgkiU=N|c?onI8_A0LoG9ju7LBa(U0uD>5GF;wH@4kL{Joh@m6;V3q#?VuRK z>-cUxVovPQAXH`l27Ow)hFTl856iBb$GJlqVKICw{8Fw8uJ1U2+LB^m&zlh4o8Jz? z?zq>ZE3%c?#g#IhwML#KzbKDuzBx*-_Q4#aBR+*z3XqCdXQ5p3zD z1p7~wkxCtN7pBACevQy^5_d1+j}^1LjuwD`PH zvgY1-1uYYCahYV>cz1w&KInAx45*p-ftQ9wYSwJ*Q;T%m)ttyc3*P8rMiDQ5yjtxZ7G$>ehv20K*OjgPs3BTZo;v8WRt9o+D+76~ z)%80T@>v{G+nv-RzG!8#w~$&_?`~uO4KJH5M{4Vz`te*YTe?V*Oil4d7gHEp0uw&uRvc%#w?nX&_3bT#o?%#jhXkJ<6&qm4RP431i6`k3} zsg>QwjnmHb{cN0G!y7K2-yqA8XEfQLkY`@o;bm7Co9Mg}w_n$Cb^sMrw>Eh>2!_AiTcIKEZ?dv{aw(L#cOXjuVk1$_B8Z9ti zX`=m1B?Zdgf1s3sZ)1q|7{YeDhn&XxiJbq3G>4axl**F2_)H=>DO1IbPmX6P5W{z> zMUv7HlDp2fN9=4*b6J-6=WFI~NzHOZ&kAC$C6?WdqpEa)L_|DBGW@PXKoLsB6GALQ z>|z*VH~TlX_L#k~7-2594@jz%HP?!idauJ0yT7h7O1}|Tj3KVB*Yz1%_uBKHXr2D= zBriEWH9^%+W;K{(D`wj@%J$ol46*ww#6JN4T@7SvH;{5NV-w?3lr+ImUK{LB)Y(QJ zbjWZVQyp-_q#qsx&FNSEkax0re;g5w(VQSp)|!xNkBep37=!Ts*ldg#|K_ZJE%3i5 z3lgzRE(`ik^ZNf0i~g^?)&H|sNpfJUL~Kh*wvP}4$mCL8MQG@x@R9v*mb8d)WW`P4 zrBEm)GY=~h6p9>$SAxPqs%_qrr}j zBbb5DM1OWZK8J06*oJYfPz14!R|GrN?kk+w`MemHiOwd z7S3fFt-tYFCrl8G^!w8e*}V_!Gb6*@Qsev*6iU`!3(us(NW}Yj=+vp~JdkbrQ`d&x zAJ05y%}b>WcT0nTPy__A`dX!& z_xO_owDo3+&B*)GD-_=B&@Y*-L3*UG$J`i~1x!s`StX{(Ic)uz5c>~>S@Y6bGLbXc zG5Zk;ublpK(gw>~F**uuCHsxEN%pU6Q;$gT&0=u0d9wD5f5wzOntn}?nx`u>=4ZvL thQy|)4^d`hq~(}r$7f_FrKOrXyAO39>Ld%21N{C6X9(suIp%lA{{vSbUhDt> literal 0 HcmV?d00001 diff 
--git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e67164596565a48f5cd69702114b2db7228ee7eb GIT binary patch literal 4782 zcmc&&2UJwoy8ibmGiP9E!r%-JMVg2pMX(nJgrTX3QHr2cDbl2xD4-Y+u_Fjb2T_V3 zAZQd&0Rch93o4>fG(=;Iv7oW*J7_uT{nyN{fBXJ@&YJy=lgQbGNASFO zevUjY55@o(H#@%FRrYIjmd%K#?Xi>L&tew-bhaqo+Du9a#WgNb6yAD~oX4)=XLV1rqWIfjO|` z1$>T=cbMPGK<}tP9;Pg-dLjA*{SNeHJd&>mTgCM_^zTqJ;pOm0(F`z(^6E$zrhWaS*s4`2d$cRsbttCLDAXK*>{iShr7v5+k`-uwW*%)*8dv zJTYj78p5+hAJN5{$&fDOqlwMhKs+)6D%=4qHW1KXqYa!18lb3T0e!*pu+x1itWL9n z1%tuhu~r>8nTl|AVKtP>#(|nqJ-SdWL|QGCpp<3{S}hkKqR|Y(eo=;ISr5oqC4{Z- zkAii!KX@I|gkDb;6b{y*VJQR+=cVDgf+DoFjt3%tJ{UKQK(d_{EUYO8L8S>4VS121 zt^wjd)xcet0|R0nHvH`@*s(|n1c$|t{P0H<9HRiWgf@gGb0FP-gpMOez#!fRoZhma zaoa@D-Nu4L+1}87OAfj&k#MQN7}-}8gLc$8c(=|5wC)?hWuJ7o^)U?AM!P_Eaw&X@ z?Sw3GITYH@h2eb|dY5<)CZ(!^X0<6ieU=0Vu1)~$urcUG(XU9w?=FhgLl6p&jWP_2S#@ zCW6VpT?i%4VJ(^8q3AIjDUOYAlwh$oY;?Z^yZS4Iek!b z`YqA~M{MR9V^m=$gBG_nLyK-CYTwY0Z9IX)bt7Z+cC(sK&GvSco#wywfPnn-kM zK^LUgFGG#$Bha7}i&ZDj2etVQ(0ox9*V%Lg$l3?Ox6*9MUIabhMf|@3rz$>~xA&>L-@VTePv$c|Av8u7%c!K9t(x7S=9%Af< zpM@xxnd1wI4{XUX8@>jE@0wW(R}mWd6^C_$VlbXJ2Uq!;3p>^>$BsuX#gq&Y+2kNb z#m80=?|HrCu_Fse@468AlB#MYht;m=VC)S{O;?j$5Tu9R_X=>)l=VbqxF;6(N{lYO z8&Y_Z)Wu2Z^Wq#vk7Y8(b+GLdhpdDt|v3_)A}-~wci!%m-AJWvxN$picFGg5 z&D#L_$yF@JXb&o4B!T1faEA05%|uyp6BlH00`IOGV83($(#vut{M-=E?%Q<*nx=JQ zExY^h+PNBXI@_}d)5X{Ek_onW@B8oMXEyGTFI2W5+im<<0v21_n0Kw%l1$#GE@PN~C^3|6;l8IA9|oBuF2ANv2*3#8VyJ z**zP}fjh60@S8st8xrfGhJ7}2hKn~6)W`r%T~pz(hmYV_t9vo!x^POhybOfTWx&1U zJl=E7h!usOCoSxCh(i|%{M`&v;ac$=&X$N2eT}5T(#g=#ABTCJxlS#xi;>d({wCZ}i=7a$qydZGT+Lp! 
zSDmz8CC0pCy@|<9*-As7x>%eKS20`I1o^bH*sSV=h$|$eT;Gt`;9$^o=9>RhlE`F}9j*u;Q2I@;J zv1^{X#5YO{SfxtVD8M;^q^}yCvb?v_2D~EfA2%g}DLtQHe#B@gWou~B9scoO2z^7s z3o~Z5h)Zx10i*a9MH@Ea)EGV2(?uIMvE?;wvn`5~lBM~kfrF=uH*J>FuuSEPO1|Bq zsOMbn+FFvlb*!<^*=$j1%Jy;7BkvEkmTumuZjnR@Ecd7G*0N34b8FkbWzPhs65AZh zvb4-euJwV}+RCNMVJv%HRTL+WBJ@lFlvbh9n#TkQ8vt~i}xcS(0){_J>+XMb7Gxy&)&GSmKw z{<3P1mL#6TfPs5s$j!D)hpUrUIDPZxEoqQl>wP9oFe}9%r{4ekz8ULShWi?VE?2rW zr5NTO4gJ4I&mD~}lVA{oOCk?3k=qi0x>f1MNq8E6yKT!jaZTRwsQdjD%e!jwo8q1h zcjt-^7qqN<{rtiAU55)#B?2xbw5=^_-9#zsdz`H;Zr?1gWuIqTSJJVSKPC9a*}BrM zof=kYLc99?-Fx(wRC;vRmz~Qp_U*~DJ96Mc?)0dKH@c4;yjWpdX`Qc-z~9qDN2>soAly|+2di@TR7fD2?NI`V1&P{qX~LYw{dU#M zkDDffc9q2Mi=L4xGLa%38m1-!m8R$*_<$0%J8kmCM+;i&SB=OdY6sBCr|u2DzA0EW zk??RpiDZIscQu`G-BH!a3BYwYX6%bTP*oR>(Hn|27Yo;HlFAe$Jlz;wv*bj~Hp4-T(-uyXMP!_eXg5rD&%3KjKKEG8#E_0D2!nT{!v6oP*gPAKg6H8 zUDH?TBChhFq4M5Ix&u18NoW-oFOjnU z$dmkEdz6&#Oi`BArIa_AC;Mt*F0vGfM2qP!BMu@_oX9dj zWFryM0LCdH= z$&lIm!amB~#XQVU93b*$>?Pr8IJCHZo;xpH%FOMVW)!+4_0fD*(`%N|-Y(|hZmtW% z-CZTn>3EAr@!5a%Wj=JVX!%n{-L)Fy_GBf-ho)Bb5w=4eJWfv*wa9l0jPKibeYBErx+GBP|)Bi27M dDkwZm!({4YlgY*cMP%$Vdir5fE zrHC{^nu%hgs$d0IBUaX+f^MSG>{_C6-y4F7o8*uEzWp}$o4NO%dr$kFdp_oehuqVW zAsIWy!JVNQ7z|*za(T^**yrnePfWR&bgVbyQOb~p1?LXQf}d`xo03}~*;L8^qs>Un z7$Kv{@ED$+4VcgaIud%o3IK#7F*hSj^nk1r=`o~qJn%H75z>n14`(;w z!Tv!pkqJ@JDqwGs_!*L2AY{+5-hUDE58*b5nRc-|bA?hN&J+E(pK{8a0dbYHaa~w7 znNj*PysR9DPL0_mqK8j|gToGz3w8Fx$oWR3N8yjC-t|1(n3zSD`FDy;%`Xv~Tt|WB z^#>3|p2sJ597V5^$PrUqGtpCIx%=-lcMzo`UumMD2g$ zhSkPlB7r#$&Ldtyn^ljnw{HTa4eBs){r=5r^QXOEZ!!$T?S)b6}{1Y--TMHVFKf&MqD}k`JAYvFh zpcB%_V+*(Q-ef7^VQUKPK2uHYDV&S<)mftB#`T1C@jGJPg-68H)5G{9QWJ5t`8WLJ z`8HGo|9dLZ!k0J)N|>Az1grX7sKKk3f!Wa$yg}a}^x|J^vjr4^#b;9pon>m^^)?;t zRHhL_=64ZB*{$$sS~FUzaSz207NDr`J9x?W>u{!@4G4Bno%$$Dh8Lfy+`Fk+6c4qErhRDj_@sWiTb*BIu%eIsaa57 zp*=0a2NfsY#zSSp`To)c!nc>Cgxq>1Sr#)NC%tq=SAV)YsKD5TX#3QKp&aHJY}ak& zyCrL=?S9{l2ASj_OYJP4(EAA;b(a>byndW6^BRU8G-^_Fb47xl(8Ws4~T_e0- zY)8e4s&H0Ii>CJ$1r;eN03)-d{6s1ds^9O#b5}I-*J)5Z-SKHCsd^c@lr~OX^N}ee 
zP4eU|x$lI0_U%CaU+#g;T?U|3iiHLq!)WuYOrfaxI9}}c1L}60K##oh2!#pFgcoXT z_;a?b(ToxuCF&lGMz2mqk4UpBF8suP z@MfIn73OWzJI) z7brV7Bl6H?lCXHOU8gI5iXc699cm@&`KB|!M_)X-346CY5&8x{126h1f6eeSP!SP_ zt1>@D^*c4yZ99%ogFYV%t({3YW|lvi?wTN!-MRxkLlcc-z3OoM`U?J{od%TSB4-?s z7(iBj$*5f(F@^6C;!0i<6!BwUccJUYR%5;F25psB z5fW+FqZIXUxSD^T*vT_cPu12zJL0zyD-Ya3?aFN4=p&wBxA0qDzFM8=z=IqxE1e)} zu+;-=;T+WW!*d+Ge2YQzs`W08LM9%SI9z+LlYh0I&N8PiSw`ddkIrI(OgBe7#IWhiAA;^!*}c~bCfBP(>%q+k_b zT4I;CdJRDjHt}i5TeFt0IoxHJU4BZcNMaRsqalCodJRKGnnYgkWxAG$XNgZ^LF&dK z7J=t>$qUzQ(w!LBd!w;%eWrn33N2L}NXr`Gvem@5=|K86V~+xt97WNF9ix3}!n&G@ zHfEbIY)X^bAI!+H4(lxOZ9ceZ&jjdhk=o^u6Xsl@7CGF`%kX|La66bmDjkI)-s)Ut}4HI zz2*qFT$k#C){PSD#kbE@7oN>DwBMlTT63Unn~7K1ytbO6^Vt?b?YXYC2QTiKsC<08 zt+x2`e!I0oeYd)jjzX7hrt{nDO0O1s6uRwoJ5qM7+^2T&o%SQ;-yB}pyg`53SBJW4 z0Y86*)HK+qQLI2+( z878hWkk#wUE~)6(Z(HgQ$%7p;@dLl3LoiSXZm>Kl5OFOC@U(y7c+yAAPD0;MH7S6E z9?;=<((g}M2fwgAtYUkWqyCI0X?!3}ks;Bc@%;?&F%mySvRo6{Ib)V6{oehb1pOnd z(v0yS;VFZ{LZSi~k|l~p_&0phNYDL{mK{qOVLX<6TyTchulzBOSzuNpeM~+(|AEmD`7^&J?V|bZ%!a zm%DN+D-W!!q}nPu51b?U%;lra%_p)O#~Ci1Y@fuSq-mHFnZnHnmhb8q5I2I9|hRn?m9tT)|AS5P$f(rqWT|uQ_Nl?~+2r5f!l}$j_1_Wz0EEbB2xD*7d zsGuk=c$MO<1*{70A_^*kMas2Ww_00q7xm5wm$rUw|M;Hgd%io#nVEOyeShyVlPA9< zP!Z%Hrp1BcnJTf47={8^ZgM&lWX{bGo_gdJ#2!z}nfWUJ;ZCpEnCX+;UitY5M9Oh* z&TsJPSsf|{u|#ZSE!HD1j+kTO43@mXJt=Qc8vq2BnDlyxqM%Y=eiNC9X_*sn^kh2PFhRvp4%?4vTLPQ`HTgJj$4pKpTmbs4xtwoE!Z*T9(Pe&E1bybMrSMTBgbK> zcymfLyr|z!@hzt?yEnAKzN0VEC{SUSc~fD;R6l6EFq|?h zdWFoFeFeJHUctZS?gPrzo=OqB18-pg{mr~$&darF@Z*VG`10fdX5+RG@a8%P)L>Ch zjoe8BE1S2M?%Im-Q+Nwg26PD2{FY>0xayWY&u>`%b@P$NhG zDnflv$z_YE0{i#fD5Fn=5Zu2UnU|h`+kVX`eJCGAA8}=_+1aDj;m*J>F#yJL4z`$5 z%S4?DXM7u_u&#DCT4&9`*vm%Dt}}1Zo?&Mw>pg3zPdmNf8}%&Q;OB)ZzMfC_i(8qm zzw&1y4kYST?x`{KO9(|fGrI8z`6%w@cuUmZDx(zRm(aUY=HSd{Ug*-R9{q>8r}+78 z^Z5Tj4@AYrC%C>EUE!B+dy&572IOG4mZKALpO@6b<1guG;O!0`i5kz@;}13o_-fm+ zoI2ea{9TVZUOLrKKUJ_Fum9*SFJz69Nt9NCx%DD$29p5?-mb?Ve0q$Vugh@ip1hzl z4}5}J3nqy4Zd*a-B#_6DqPl8l(Im zUhEw!xPSYYPV>-Ha3pXOw;&>NMD=?XF#hHP#$fzfRKaz`#>T5DZvI2?-joeid3!jj 
z^bdKdeHwn?y~%>(qq2rD%Y-n5qNy``I=D|lfSIRsfuHJ9xNU2$K*Pi{`0&~`YX1~7 zUGr6?wBw9xRHcPC)!KhZ&*fl|-WG{F)9e||F?K3I9n)qpDI2|IGw+sCv2lQUSI!&a zF|-KdFOslLHwMpiZ{d66z4$llD|P;bE^@-n{cv9*1A>1%VG1tAQ3a}S?%AAb5Wats zj-EOU_jp;Nx-UI-ZD!=qyuJ>K*JZ@#-uQ~TTGNUp2U2*$tE<4|kqE*oTd1>LcAPY- zg>m;ar}tiu~z#HQf2@O_^!)y>LWE z1ikMwvGBrZH}2FJANmTvikteP3w3;xjZI3A8aiKI1s}KXH}f?gkE9z;Qs2(3qQX|( zK{JQbyeY54m^F>}P=8q}on$Zq;yiYugy{#U*yBKz{$LB|7nkEuhfd)?hpBn*hZ~`= zL_W0Z?m|lq3Muz}2d>mbp<>IrsGQkQXS4L-R0oQEZoFPp& z=Ock(J<1iu!=;KF)OwDoD9_Lcm85?`E!ln@olPs{j5`s=EC zM_o-|yv}TN_t!@_>7@WNr-&p=ip8kw*?s)>qkO)9kC6KHR4N+YyAhcND{ zpR>(y8j20lFy!m6MNfQM$6A~{3;UFKeqnOGUfJywWGb)S%x)jU;c$fV5Rp4Wz~w3aiO(w zT}@RhOYP=0705hx6mA$F*IpggykphIiO|b>=WB!o&F)2f4ehbhQv=eXa^k5wOsp}< zu~@y?_*3;Ar3T;^qPCP^iZu4m3EsIa1P_c1s`WK`o@)qHCDHlJQ3JH_OGXpW5BaQ& z`jO_BrFp{P%3|-fQJ$+z+;R#z<#n0URZ#P6!@M~9cZ#u8PhedLM4;U4Q%U5@k5Z*p5E@xL}|q!i>vTy{0z%CZ#tmn*N>Z}FI|`-E8KmZ2<0V`6=Sg#6NHs;vfub<3~p zMXz0ZomY@Zjivi;Oj1kEmp8~XzvSrt&{$|RQT@}MUq_!_GsuY7_51 zGH)q!jBVc=-zRU~LMbJ-GkOuJ>pz*T6RM zx&4jfZ_=ozX_P#FoQW6@5K8ZN$rnEi)(y6JwHN(+7BTA(r$b`5_zmMQg$^|BU}P zUjHg>S)mh1IcZUGF-Z|(n$T@?{0G)+l;0)5tlsaZBF18l z%9XXIrP>o>nLQ>R22aY|h~S&C_*%#RGg*-6$xLLb|Kpz&|0Op6|8kH2Yd1+!Y%kH1 z^CUlyH3X2ENQH`^py?ql0~brWm9}G>lR~LbC}xm{ZGH+xra~F3@Kh*b6&|tLH6>B- zXJ;M?g%3Ltd2l3>cACZ_ut19Gi^tj7O(K(N0p6_ak;(Sc*p5Ie6e_k;hOwe{Ut!P6 z36YKg4v`_V7e;$dP6!JPievYa0!e(IEFhg0&qE7Bf*@@GPdjDz-my;$4{=J4_KHC3fD0FP literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909 GIT binary patch literal 4341 zcmc&&X;f3!7T)I$$xQ%>l3)TT$Pkek1XMsuf-RpYYQp@b*jap*jBCfX$9ro8;Y&3?T@$CTd#X1XWGNr-##a~U*ak8G8Yg6 z2f+*v0VRMj09whpi^mVWOwzBOKJL|&T>m{MUw@Ije0LtNw8Ca?-;#A#&j>)k7YL1L 
zd!B$LU};!`J~g5fQX|#?z}3a*mO>cb3*ru>As|Fnz~YI7oB&y5;F1toY={6;rBqvx z>SZJj5mSE`N@PUm2m%b~hDUb*@|>rB$eBRELBQg$9YBfJGQu0~7?+Q-6;dM_A%%Xk zQKb?5d6icFh5(Zz=p;TM&{qnKS{5G>6dAW9P!=as3Z)03flHP}g~vxk$}#mL)$ftG z6&F9pH2#Y%{}Sl~bwR(nhu{rMen8KaUOi(uWvql1RvV$JO9EX@%fYC`5Wc&m2(Py2 z!IQR0u*`su4jqnyoW*?5+t~(D-|7LFs=$j;807vK3VtWv!Oi#OzzVj3!#*M?ea?e5 zyCi5`j3Tx|WCP8|%;4f4XV8u?h8J_*qpzw>AYVg>CN=5;@zfNkD0gs3C7`2P7dR8O zK!xuB?JIb&)z2EjvK?Vo*9usaqzRk?6}UaS3J#1)gz=^|=vtKq(rKyyezptfG+l$} zI!lQBNfjDLErRXKH6Um3B+M)ff~7~b;krKy_H`Xc54i|xuPQ*#XccH_o&ZGg3^1#G z32Cl6FuS@0L>1<+AJd0o^;$?7s)k1}9Xg!_*uzb>uw@P(M4vlDTK{)wMf_+uM(9FB z8VB-?NN7880*q3e!1FB&>T)N6IF|)S3uSQWzA{|6LBfr_X2`9)1axDsz`z|jI4C0YE|%1O9op94L6=LOB8#!n z*xsl>Sou>r#?iMYD~j5o`s`bz4IY?Hs~IYHRYG%H8lg!XgU+RPVC(B~*pjpm`_cL( zCOqnl?`u|s+JkLyKeGUaa%1obrt7e$%hwU1`Rc6x^EJpk^bD~#jm0AM{jrJmHRQ4j ze&n8(XHYlG6H$G3n4%?y#|s+3S?3L0vgqYLelmsR)1S+!$!FkAg)1oe&V=ys#@MsP z^^kS>41Q~#J!Cd7$6hv&=*FxIkYBR^)oH$jXysF=Ds2XgpWzOTUyZ}{Hl6_GnFH|c zl4`*1%%P$R~MOXW9`!f^R zI%x@5<=9L3T7PTOsB#+~Zsv<$gjldCS_bQ$xR7H~SAo%&jjYkP5qkMA96(kEW}i&Q z)mABjuSyYjP;j06;>0Xcc08O{T3N++5A#8X6YgN+#oFv8qQ%_7>mpn-Ihm-4^2ZWi zJEI!|_ePgabips#x(LV_i!`^Zx3S&UDl6_9dVogj7a?Yr z3g;TI6@D+!1Gg!Z?%h5V!ZqSvt+8e=*tVV*q3|VM`^XTzIUPZWzPbaR)6-!7lb!gL z@9I&z>@@_306J@+gqk~ium?FyM>&q!f|We7z_`C$gP>opqc;!32&`<4=$`#GVwC>YoCXzuo&#vR$ElRgQsNIUPq4p)CAf(60xPNJ=84Mc&_-H zyU;s6VPf>WS}ZQ3ioJY?COLDtGbT%r5he|V{Ch(eSe$paIkz}v?C4*+QRk^tOk?L+ zzV*$`u(IQrR(P)}CM-IS-;c{DeX}2<`Qr%6{+%zmt+5{s7Df{hD&t|lZ3Y=O=QtkR z3iv?EIXKhjjP~v>R6aR2hN`Ud$9=;&(4kz8QdDwrhiZgze-M-E7i+LN?Uj7+KF76s z#b+(6@jwle-l3e9EGm2c5iH?c80$W%h6QkS@M~#t&{*>( z6cJi{y2IapjgUPFuZEb_R5rm`2^b}G*q^!{r^e{}oZY{EBb%q~Qs_{!Ax%MO5z=+G zWMhW1mNZi+DczK%qVILYr@1sOXRKMk#X`w}^v&v1V;*-kAIR9M>9BzkNe^b`>A2+U z`?eg+`fQ?SsY{WxEW5zarzWJkr7UNs>C%==k>jDJ@wyOszi-Q;QblrE^ zWm%M6E-+eHpUjzj=xUK=SWjctS=q;osD!C`_rob1Lp>PgBm6&+j!Q54?S+ zF!k7j=1mIf_PJA!KWxiYw_U3^t)}+>SRy<$HjW(>9>mZ& zn!Ba4RR?ud-{^|J))u$Q+3g>%*^m@`7;@% zt>q%5Mr;@|`sY>J=p!=v+~n`bsPYGD8XgoG95+m{X*$}E#P^KF>Bcl2{d4%g+4Qf_ 
zigK+$$ckMS5)>g55Hu6l!~Q_%^)x(=syNWUBftt1eq3?^YuNZ>w#?Z5ei#Bw!1AzT zya}QFU>MSm(L0PBke(VHUyF$^-}pblf+{7_5PAHMe{G2W8S4LkxxV6m>L#h?8>muZ zitgxdZIOmhQR3w_$J=K3VhQ&Xri@rgq!Ni_F8yZ2T_Q=8NP{I#5=pScF<8DPr442= zBS(qEmFemENKen?!&oK)6G$>=wV|o$RC+LTwhJ?NOk~=bOhbp3NIaNE>dQ>!dWjh` z4-2rEZ64sgAU@D(T9~hoR|unz@TBuwwSsQs@f_tr=p<+vjwc^7x{v&0{k*Lr1D%5< zGR9xNJq?Ez-;uf3r=K!&H>MeeE>C?ppD=n&8YlC%it_cD9p&dEhfeP|cNm}B{EzdY zi$%*H@?lCTjSh5P5XUGat{#lN+$eEm|GBR6deg?8*{ye^0C8{rgB z?<2>__|x%eSLSGjEkURiEsI$h9~5W2EIQg)784VdsFe^D6B`;8sby|$Vs2t4^3VYI N@PbAHz?1&NkVV0zh$QTbia~{75!r;u7Fo3_1r&`tAd0wy z8>neokpnB z0fkXQ^6y;nEvOs;7XfO~q&5JQIZytOGmZlm1Z)=52@Ip8G-;?d+T^2bdE}6UN8-Qh z=uj2>^_CL;UI9{8(1U-dz?2gnAC?>&5tkGk9+DKokx>U(cyL&JOmb|TvZev3a}Nre zG2v5W@Lz=dWTn}<+?~2U+&hGFKD9CRCy_0kB``%5^+^Q4X(cbuQ@TTS<#xivzN(;Kc(fYU082F&6&DOBQlLKcr)`4$? z39u=>8)ejB;Hs51+!AOH9@`HhUb#EAuR$HKwv7PQ2V208if!Qi#sqA%)tAU&@)Sar ztIrr{t%5ePv-sXD27@pSLMDz?L|9uOvAgjJ@T%j1eQ&*x`b2~!b8CV7nAf1)_Kw&6(US!=U|q*q>^nb4plDcx25Sj;c}5$^t(pf<8V-VL z?G#jzYsmy= zSM-Qo2j-)_)i$u!qy{T#e-A%De;;!^JDNErD+QJ5wqt2Q_Jl=QHWp(o#m<35Fts2I zeA(|oXr(U$7N_eN!@q}c@D~g$?{x>()2Cs2%hW;PsYUoZZYOc-#0(;&GDfqgtU}v6 zIshI?xsFB&jhTC8rtn=S9}`VjftSVyq10FI@ba%WhCj({W3B00$T|iebMo|Cm|kfb z>O0@xgu_h>ppAA8L(Tso8GD0dt>~#GOZ`T}x^Jz~{M~F;!gw=AwMGTHc0+(xx@ZrV zvyY?g3;Id_Y%vkTD*^(GrOXr}1ysJ@j^;0KV6M?17}YP|;Hj0%V5edtNAtcpNSz{K zEV<fpfwSlv~ zyP1lR&{4&onScp*9up%b4fq7OL!zbx;Qa+f*@*SAMZZ`gM z7_mwn48w4&y{w1%(jO2D#Z$qf>Ud^X_IIFmQak!4rwcpoXrv+7l#koazKZ<>FEIiI z-l)jL0~?GXKt!{?3%E#0Xoes(aN;hC%^ zy!HdQGb+VI?pKYH)>beVZ8s!b7rCR5lo0&rYOealG1HhX5uW&WtV2xsn?Bg{bviQ0 zuh+J}vI#8dK5pbCm;iYNt=O44hcM~-2XM|voOJw`l*q0dfbVw5@z@cgK$Ps^(qGS@g-oXG&P@4z)pFKyhZ`mN#k)yLBj|=-=Jw)#p ztYOW#p^iOklf#iWcR@oxG3pBv;$LXbU>wkPg^`jp>Jyz?MRK!A3vN2|kFQglZaojD zhfh%fZlq0Z*l(90bR@uwVUXg|nK)|+KqSv;e|iQ+YMBPq@6TAp)HL?k;Z&HJ#pc;Y z_0|`zTB~6sR`5hctJjS%l@tdw6lHDDwGKVELv&!xCjH3?4|*F8tleVhlu7c%2NgME zJaSE?jR)6lHSsO-C=ef7zinJVRa9T&p$+*~3mX-D=ffKdCPZ}?OPdaF+BHdj>s*2J zkLE%Pg?DIOl+Swz20*S74uIg^j~c$%Rd%epUKj9E8kHWb}rX!ertKb 
ziO5Sup1a&CcAkvxt&&}9t=M%cUS8kMa#uws($Y^)PjAi9pK~;ObmDz2GjSUgJ86K5 zUA;M|*8Arh4cCUJsjqi0&Eup!J>|8Rig~YV*W6>xY2H&Fcq-xXz{_#H$3xq4Eu(+2xLUpV`>NQ7zgS*BE$kHLQ6`mj6`M!{Y77t!T0niA5sOZ0esTZ;>cf zB#sohiA0ei=Sbz9m{OQQpPWS^PkN{Lp`GHC$3)r#?MO81i*Z&~lc|I2Ob?oOPNi2@ zdZAp4L_YK)meQL%d|}T$=Zy7gg!X)K&RJQ+MK|5P!RPX~4|* zzyM|GRJ_@$`n=|RoDbD3ivE-j-Ab`M+vZJ(c~YnoZGA>qR0L z`sf}*_dp)#Hf3?B?NX{I6KN%?$*FYzsT5U1q3*;=ODc1!I!Z1TiBnZ`QYK4VQ92@J zCsjtuB31Xw(xtI($y6L=owPmWpE9LKqtXOCBY8-|lH`aa%P_gzG9)1(KGi5CA|X*0 cA7^A^Ki+1%HQ&bofDeD5j{5cLOZ|NRH{sIBfdBvi literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f892d384dcee4c386d866900d99cf672068e5bc4 GIT binary patch literal 4860 zcmc&&c|4VA`@imIIU9!~=ja@SlC6}bsE{1HNTRZ}SVPwAGZk8tN{VSAZA95aOl7Gk zN{faz+7~9YP1{Tp>3yDK>YaWw^T+S=c|Y%aInTXb_w~KM*ZrK2`+Ho(t_A`^;3Eid z5=aYR6o7t1MnIyG^~8rC6&vGskz)kU);Sbfn--)Ve;z&LH#i0zHP_R@#J^rV`m*)S z^ZY9d*9btMAW+m5@TfnAfT3&*+N>`zgsd-K03fA}Q6&%Ic-mBB&38yyK!}Wh!4nDD ze!dX_OM`u5f(4iYPoW1XJV%;oNR#zX!9=>$u0enSRq&_^fa<3pfrk^-rD<3Y2-pa4 zHj@C7%K`@VNK+4&%QE9p?zG8`RCJ7Q2nbnUW`r#ChmQh9>aU~J_+AvXq5Lr(rx0&-8#5=?E4n1awx62ey_P$n!0rV>u7Dc^O0KW-FM} zwHy{DssekbJlt|FheI+6prTieu9Pby^~Mt5Z?*^Z#w!q2I~5{cDnNtGBG|D^8MX}l z2sYV);8QjZ+7~n6Kvxxd%t26dSqg5;$wPC~SRnRUgMQ6hNOe#L=ZZWKl^DQ5OdIwo z)xhe{70?Sa;ijDcd%WHhGUoC@bkq(~2cDqiadJ>ejEC@4He~6N&~oe;=&rT}myZmn zP1gaL#_sfC@!x4}e>kiqehv0L3 zE9|x_f&;V7;qe{}4JJQ@iD_eCT)83q@_H>CzNH1@BSxWj2Y*G10lg?z8$ncrI@oS= zhUkj_`R+GXKS02 zB{>~Xapof$2TqvDIek>@z(w<#8=z4$8l6q)#Mac|kdf$(b-LNW3*)z#u*?oW&?F7U z$J|j(K?^)c+XK~xZu!qyvKaJ!A7pAW|4IfGzfa_@kVJefbS4-+()5X*H z4L5U0Yg&fAttZj7Ip@L5OoOOZeG4`Gc&t3t8dR(uq2cNnoS!j>G}avhS(`z4w6p?n zGkrW#U;(VSG~%Ss4#vmK82GtmE$nHnB6AOUWBY3iP`y?iKIO!(4RFESNu*I z+=e!xr;cY)>?k%0IBiCrj9!3|+KVwQ^J>!nycd~Oy%6bKS0szuKBLmnZMbgfcD&xv 
z4z296gIzIc_~^ymc%tkB6drO9Tk&!O_H0=)Q9H*4v7VY?QL-n|=ZD43(mxm<0`T)2G=k6rJE)$~q8A5Mi6qN{hoWo9bO@86AIdQyive1C(WU_fUk za8XmI2ljA_kBrr*3=I1r9OJyd0)fA`qYsZm39N9X=)U+J>7>v18lsO@Rrir3|Oo zMXac|No<#a8B#rU`5bbcG|1ot-d1{(`Q9B!A4?N>UK7de*nR`*r?p{?nH_kgxtgrT z*4>2RyxX{^d=6)+XbGpy)LuYZFH)VIBrDz9#3Q};O0hG~o#Vxm@sLpU9g2xj;HbKc zWra(f!1H2qj~2Z^r*yZMY+{ru^Y>o%XX=fHp}cV-*{hQa(y=c{^#=y_NQCy8|*@6 z)bHKs=E)RHdG{H<@%62+qO(#htVaP8=CtAuVzWrk&3$P87=mT~$&=jPFo1@#qlj>M z6*y?Rkqn(%g$JDjd~o$yIQ`TP?a$4Y{c&_OtF(48?it30PT69#T0R}Os6ZI!xhAP} zp&FapQOXDRvmB#$e1?Cu6ROwwgtjzqVr`yZgvFl?Wjr7iF+YwvekC;)Ztg2V9U*>D z_{l~3_RkZr2gc{nHPKq^;#!f^W<@iSb1;pmmbF->xn~)Y;@6?I+z_~y_ZZ*BQ01ob z717SvAMjNL_t3?d-HeH~u23Z4vGb&BqzZa-fK~QcuGY1PHHEVf^}l|DIZbSa1aq#! zsvQE<{o6Ba;QRp0xi5`hYOW$=_P+*IH%qL0u_p07e-7gi-v$M_CXv)NPu35+A zjkC|T$XlB#B{U4~I+M3Tr6_l;F-H z&$EZO=1z-xd?Cl`NX9z3DD#lr_Hhn-tc$Y> z9kUHXZl5du;mF*BmYWo9N_G^x6}c9BwUq2Eb+7bm+hbF@tK9oUbpP#^(%nawo>|LQ zvMtN5^1qO!s-amIv0wV6HaT35I3KP0e(k~lm z0>8g|@#NOk2eVQTDKp;Vr-$P8Y{BG!uAd%RrO9bH7kG3$wn-NmMf7xaJh9t8)+S@T zXJ?PY4xI%R0o|Rwj@kYOf$asJ*M6R}cShXHp6+W;=N=TN%4m39@0(ZPuyb?H6ia`JRy@|IXju(R2wGvF_4=qUANl1 z110kxQJ@AdQ8IB+gPY81FT8`2$t1XCND+A@YRx)J0^OcN}qA;=PP?nR}M>?)bw=}IOdvfHK`rA zQRZEr#5Mi#d3R0dol{#)kH5UvuySyiYj)z7N9VRE&B`!4`KI@Bu4%H&%({2|*Gs)m zXUwes^~K%#m7itI8$P`5xs*F=n|b5M_ff;|u{ZBA&6+970&GNI*nfK!XNQMG&{`uf zh6oFZiDd?c1=81a>Q0^_qp(X~p-5LVTT?TgNsEs^fBpXU;_V}F{wkeV_0m8hA`l}f z)zQUe5&nqHGxPgHL-f&KV9Nd~H>hP26SBVe&~oFiqm}_W%3r3 z3h8btCjZY~C50Me1)e5LVXq+{K%^{`7Q4F6bvGG#vxEl;J=%=KmSVAZ z9`(@XC>AG(ErZ0iVsVhzDo8SBNjc1+TUKJR16@=7FE!PZG-K!pbRhBE?)l8_u3mr(Z%w~eE4Bs}T$RD5bFeKaGUAXJO;jb0HK7(2;7 pDr%B%baZ5bT6|!1Oh{yent}0TgUR|LCuM-IA7~^1T&Ul@{sjY`8d(4U literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..1a786300b7ea789c918d25ecc282aa7737b43bf2 GIT binary patch literal 3554 zcmc&%d010d7C-mC_mYcLo_o*sJNJA! zC*gdA8%J|~+!!s#a?l6BwjgJ6)wJs~Pv+Az>f*J1r#!^yP;%e0z;n$_KNtRC5M zz*%w%7f!~Rb0SV;;|We|uZ)82l>z|iU`%Ei!qgf{)sAd9TIm5IbEQHOr%O+mk*v#3 z=CE8QZ$|PnNR_N&-=s?ELVN@c1DVXp0zl|kE=Ws!5PVkC6i5)z5-|;oDGMg1tdERv z3B$s;2%>VX-|t>4*xnljOZ~^eU(K&uBMs zc5S7W2fqz&9oHd^K8qVFj-khn=dgRmZSkD8RyeV+3!T}12f15i;w>2o@VIs-C2{g) z_OEY)Lyb?6J!rA_=W4Vs*c44@u7?Iy7HV10ju+NaP&Q{OZl5?7?s`7JijM-RtxYoU zJQ|5=cAkLmN-E%aX%=PYT8MqTMlmU?tVOp^RwK8R*JyDRuI!?!CY`TRxh&n2B8N(lVsEF0`9398#j-;Ab9XMzi zIy(yO$?L)cRQBR6DNd-TRY~!~3+VkB({cWzK=k#~D;D<_osukRiD=FxxWA{(AL-l9XcVhCP12S= zr{H+_CUJ=_ZouAm9AWsiyG-BVTsE zK~!tcF>~*a%gndR{g{@31d+8z3F;U-naS7~q?~lCl1fSjbah$mTmF5@FkYR8hjw8I z$O)3YGdxgot+q-^e(oA&+%~Zb*C_q{cbXR5SI`188QVm*1IR)>`#VB28MwcU5RFn= zQV5t;_-$LTkYf8dMKx|)SR^*L4_fE9eNnNA!acdOaeL8+X11CV1z)vzX6DuzT= zCtqydwXD)Lwz)*_>Wb#{ zHVYkBY_Xn~byK4UxKG&T6P>1dbvMy>b}P{b{rc{;(3=OWB&wXW&Jg4g4*l@mfMV8WEtE(IHba5dh?p#uT^22edc2>%DEaE~tTw*>>z1&<8`pvNS8VY{Fa&4zN?nG&S|FznS?lZsKG`dJW?$U@^7e1Qa zvUc3Jp6QqC=TlZ;U0zw&&u#Xv47==|ceksd$ZGr*pAWnVq=?bPXy`t8NSS#3t z^fu1l5n~VU7Fp=&?<8Ry0Bq| z@GKwyoR3u-2HJ4gSYv};E=-cNlysrqCT7#ADcL#V#MDHgXpsulD3xF8FaKPjsfBHljOB!$dB`S(Jiov7#IYdq~hw3kW>KizpW$%U+NPSY^NdZ02FL5uTXJcJtazh z>m9J6cQ|AJrh;qsqWq>NrYGeX`-$5~{sU4Svr}EOB_;P&^=~lxI;_f44^W7*+z|+Tzdr?sIg!>!kfs%eXAwFaOf2+1 zk3&DA-&W9XF#MljK`v9;D7F9N*6L4cZ~iaItNODtNxo{3T&7~>KlZl-P}(SDd_=_f zNN;0eDYntB!tmfVJkL)cUtxsud_J#9;sba-iT6)3oM{NdIAQ0{^TEPO=-!o3hHbW> zK~Thx|6qu#s~6df4GR)z|9oK{D@;T+&ufKA6D^R2dtNQjv*X;u+~Ojq1P&R|y>!In$pRfB zOh)Ji*Ny(nCTE%)U8F}wbW~VIOq2mSu{XhpFJ#io{*Yji@dtl|P--$00;l8%7ksc* z;5RVk_u8KjY=}1rHW|I4=lLB^I|iJBJkd`?g{Lucg2Faam-T*b sV$RU`%*>&>tgMWD+q}fA?39djTQ|>PZo||{tqs7757g5D;pE@luYn)u8UO$Q literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bc20a7699b64f53ab5cc24074d8f61c1e997b00d GIT binary patch literal 4872 zcmc&&cT`l@*5Bur=?Dsg3?QJ$h|-%zMZ*Bnn`ori06{>ibh}`HL9r20P(g|)7Kl=! zQlyCm5Cd4iC>SiL#2Cdq6;R*3Gr2Ozgg&OlOp5Jrz)9m1N)uYmgYUFOYN`5U7^|RV|ul$X>sWNr$?SF zyRp0UY-uka_=pIt zvHg6*{Z|J0MhEdRMV{g#r1%RGijk1@PtF9oRMf!7fNJEZ4nUHH-RCR_1Z)HhHj@Au z85WF_VVTQPTC~VXJKFS%xRqFAJ4!6{hmInZ%U@Ti%|CL%q`9P%pL5Zc@sC&)8x{~A zv(n!;##bho8U*{VTon--8x}6fE*(>RhlKBN;XYjTe}Vb8Dc7s>ThyKSzcD3e_dxbd zAJZmvEvzxy1|{7h=&oH2y7@Zr@D>l=?9zf~&C_Ak3~ zcr^}#^cTV4UH$=Xemnt;z&UW-Qve0O$-#z$BD5unhpiFJfrc}NaP_b)ObOG2SFRt? zjZ%HcP*pm{m4v-Tt?Y_|i=+UpQmX$s-LDniY;Ww39xD(o2j0TyHhfY0eE z(7v1jN4n3Vehz|)HZI&9FAt3klYz*w2E&RWNVeAmm(qL?oHT+wOdE1kDqz#6Qg{sW z;I=Iv>;HBx>~d8A!S}Y1{QMzW6FVNx5K|#6nGG4bBs7iElZ7i?By{B(B8L>6Zb0AHyLb#sjjF3*l4TWjJ743`ZR1 zL;pbxjcgeJ9q}ZXQeq6hy-I>(cc#JA@CoQm-oKEt|6>%RjUX~y6Kr<7z@pJKv_D#m zr=R!;Z%a+lMZGL8YlO*zS9^&8$0n2@+lO}APY0vhk0Fd` z#cDIpqnHV76c&6ROXz5a=EQrbC4UeZPmIKJBm80QuPGQ?dp>zGy8}w=htU*p!sc8u zL?`TJ(89(Vs1-(`rmdY=VigX%5|&~wW)5LWr)}{g4Lqng+6;Z-O!$-@g-!JqN@q938ZgIrrWhW}uLfJqcW}-4 z5$EZ%S)>BUz*igV;N3}kknvgop=x^A%N13S)>4PxS~MTT4Xd%CY7%uhUxAFWC8$z; z2qI-Kpps;3P_uS~nj4dFt!?EXyI=%`>uSEVsf4RZpwJPh9(imbDGw^Xn!vHIw78QxmV@o0rcdbx-ZVLk+#~s}K!yvRA>D zXLjU-t?NMdd<|p#9fXGdg#-8&f#KKlaOHJ8@RHBwjPl#bbLGyY@7Ykff>R|5jv=1t zc-%cqO*n5K~566~j5E$oP*1sTab zgYCU^NzP-BB^jzz09v{U%s4U*&VJgDE#1(-OpzrSDrRC7e|8;e7n{n+4a|V}+3t+B z{Wi!mHxsS=_7LpqQHSVgMUJ}LWR}6+-5hRHHMYk46?)(>i#7HB01D#ha@xvfGMDU0 zlndjY$15J|pm!I;2*Hhe;5IKA7C$?Hw?3>w9lmcNFbGin3>nnW>4`nq;WN%^!Y(ZT zu`$N^{W=7^Z%6O?LkO&BgP?c*bz+a3CdoaZiu;ZFV<^R)75H=p3=TAK^zLC$=XRJW z_VrUMo;wXp9=#ytP4=RErZJ|XvKwcnyac<$TVY1>DTY(bGFIf!X13e&+1y7}1srl4 z560mHe(lt4=IWOP!ifl&&zSY|D6s^RZ)Q!aAvULSmt0F?G3;OQjK%x< zGSOt?&+xaNi1pfPquUF3SB`OZu$g-;!VR{P_3D(eU9Q!T|WA!Mrnf>LsO^hAy zPDwkacs7DHskjJ!sfb{zzLo*4od*c-f;POR$AA%yw~-bOTEyua1a3IAPq`<59y={E 
z1vTSU%o(n~qD_N$A!nZrK1ux}Fv4Fl6Q^8;l8`9u% z_n_MswqmLW>J?_*+zD$t&uBPknINU?%lO5`MYz}Yr)cpcf;Inx7rCeAIU3E1B*Ns? zAjs-C3UNJ)2VMgFz(akwzCH)@H0tL4FfoetwYM_z3S~p5>&Y>2JLe?o2=)WB z8&1UCCo{0VnU_$PAPH+p5^%RGo0FV8F;gRB`MAbMs}WbB3MI(|Lsx!3zMr8kldPbO zGGo5OHy*u@TA~jybSmA!!sj6)pI5;>`ZybOPt4-ho2!BeX9;@t>NOTN%!T;*GKw4b z@lns)LG06t6t;6O4}Wzf5>0w=2&pf!#CnzsiS-K3j6#J4DA0W~IWn6Z?`z3q`&_!q z(yz`^=>2qs!TxZE9r{QS>x;=C^X8_KBb%C_Zon4h9?FtERk<8@?Kq_X9!(sxHwuiR z4{p3-z8sr%`qTJI_hOclP9r?O@*G+l&l2NuU(wf=dLBQQr^~PB{O86+&_}C8&_9Tj zAl5tRX zeg3voSq)3ElBnR@GSyyP+_3x4K0VOJI3ayjRmM zL))wIXxewv+zRZnEsM5i>Ufp~^)wdkIAGw@C>B^9OV2h5>MZtZI=1uBtjPYW*;dDQ z<;+cZ{nYDBs;m0fTY#KkDt4P}l`5y@ULjtnX}eovL568sMw-KZ{S)>3?7M_6Ip&)% zzQavz_adhp%S?w`GnSMrt4-oN-q!W5487l&>3Bzfh1>e~!=!H38Q)9W1#?q$v&#b7 z4$j`d&^uTWeDkDNb&B4hAHwccts7zJA3neKQA_sRRQ;T)*kA5m*uXT%ty%Zxc~5n! z!I6ubMu*>zFb(tS6A7-ex!5qjF`1__XCup~;8L1`j(3gNsPOVm)w%0OS;j}N?w#s* z&^*n!s3lW(Y1Kxy$+5O<(~!G0X(q?pbIdmkkFrgRf6TYup*(N9>50xFr$g(bIW)R! zs|0;c@xmgPO}1vaH7A!0E^ppWqkB~jkDSX4xs$kpyLsLp=t(l%F?8*?(=>X<2Pt~1 z1l^&0485E}x87k=(w?(s!|*b$1ignsuPFKHT;#)+9LIB|_p1|zp5Jdh_kCa8R-CKm zbpFiGO=5YS6>aCwKD@F=(=pfSLfPZiER&GlwhQG0H}b4@s5w_vJnPaaUUX{3^{UFj zJ7-q4<~mpZ@ZzWQ(SyC$tIxgctKY^|cd0r5>fzPzbXK<4To@YovC#2|OKsKfgWcsJ zeeJc?Z(q{z+>536fBOPn_u?3Qg8BQ`f55+Q9gxBYzR9yX?ooHqbfim)UOV#kPv}j3 ze??cTyAb_;sL=7aD}_GX{i08bE^ISVJUW4F{pjB%;I`)1f)YUsoxmFFDA*S^vaAWIr>K(17s37%9i1Sja~t+&WXZ-;81* zfA#+lsDGEYAl(d<7}2YO0>XUx1jWR(us<<6Emfz}@)i`c1T#$O%i{AHQspm660P^2 zW8q_bhLbsMO(;o@h$ej)g)nwhTFR7v4cfm1_*ZVLZ9l<)uLNf!RM z9wo&m#)>>4OR>UO0YIRt#1px@yL!x#K3Ph6gaIvPB1@4-w2=DJ;wTcui!1|0HX>1= z$SP2>W=SbH(<3X9$e!-0`B+cQB*SRh1MNuUx?ab?;45mdz{QT9TgB7u0=l7Gi$qRz zW9datCHo>ndLH6u>|*5Su_V^tW_E~|r+W~+ALd5Iw{8v9Nc>qzoKQhfBK0R3(tBUn zM|*pih5OqEihOB%2|Se!C0=85_iw(T=MHouO&Q<6G` zw@{j&!{RUFp^8PxpW>lQX&LEnyCjC*5ZOD?`Vyu1G5m%0l6q6crev)2B9S#cv<;Lt5&&-0e_;OxMuIRh literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..151008dc482639c3df72873480ca796f9ef31d42 GIT binary patch literal 3553 zcmc&%c~leE8o&3>OlA^-1kJ<*QK*)P>`GBVOPQd68WnJX@Ser62*@e~sE9eBfN>W$ zK-7l_O5LgyZ3Ul2#HB7M+M-xXU1(LbuUhM~ty+6$Qmj1nk9W>Huiqhe`+nbk&wQ6) zE_5hMvVQCsEz7W=Hvsmzt*PGE9rjKoHH$vr_uF{cbxzIf$k#s3By=9 zt8~G587pE%w!?uV(~VU!-6#Nn ziYBNy1=aCt=1r%(ID%JCG4qcIkPtx|`62>G%b3*Y%;eY^pL{%#((g->Rqos^pO6?1 zw`W~N1JncQ%?Xb1X^V>BJafq%sWVXa<3M=vNr!di;2@%?b*A_u_`T)Ye#hvRr=Q9; zJii959n0WQd9g?mc9%)+V8pp?4a|-yZ^1*SJ&@1-6|!`~qz(yxac;rGjy0&y?FaO>v z5L%OrB#+L5*k4-U&o>fDR6S4C;d_=`6+DQtD6u7?p2i?p6w1Wib^>>A9g(Bu04#ii0-h0jI*9BCX-?Oxi=EIJnE(qO-nI zLM@bnUIa;ezO#*f6b7i78Xs`4E|qRw^(AN+@j3cuaVxRc*WSuuc?mfzeo)<+p zOh6BlC}8o?Bg%9!o-k=6=u`PMK>FTsGGox>fxrXGE%)>THy}#+fzBUz9PCa^M|)R}g7xcVmZMwuQ`Xbmz_AP2C^aM!P8^ph zQGfL{=p2wWIBiNDGA!Cn&suLsjhz*U^jUgxPXQ}EH|Sk@Ozb%FOK~-w_Def#JD7)T zOB&_FzgQ0DT-a+r-oXIc)CHfGrjf~g27q|~?J#lj zJ|eCe5G6O=z}cKlD17K;>4CoK%zF_wP?sbI7py*n^ZP6#{PsX3xu>T3ovcOkRd;%u zT)z{y*2bdfS}klC@(eC*TFfk&T7$BVCW^kHY*3VB5OKE90NOU~fUOBp!0Viryz-qB z`et}Dyr^1$PAyPbEU_6)Nh(aV{puOLnmT7e3wb?UV3`0eZn;6M7ui`B%5C5}!x|#D z>T7r^qeSGqKNR@QyeZlut+S~5whXv@IKrZFv@P(IOow+K{D_i&wE)?^mWtf9ENp*# z7d?N!NIbDaN<27`2K!$thjvpmsC|Z-%#lwNZIzFOaiK=)i8p1f*U;jb%~u$=hEjRQ z^AjTRvrFQnP6hhLu$rnEwTOB$|0wwERv_G5UTU>-{|sVs>rOccJ1X(|NiM2>7A&n{ zWnyO6bL;(~HH_A|33Q$40;ikyk-auQ;J<>k44X&0;NPF0ZIUmolH@NwrZUoAjwj-` z^AP${;Qms)$)ic}9Do?5UqxO%!Sr?vZ>-2)NXrHWmHKU2P-vkX7Js>M%fdxg_8OCt zt1MXD$1$`f{77Zt(!L&1Cri1lMa%n*Ouv2k$ks(G?fe!nDovHCcu>%4M_p6Z;x&VV zD}%~3)l1enhu6lpH&rh!ai7^_Qu%LNR^}Ohp+ zYbbIpKahO2e(n>I+s04lbe<|3wa9H#edfI@2j|i5n-9%}M)?@`;>w(5Zbm@HxLX@# zT>}p1R*8+$!_%HbP(~LQW8LgW42h(GXA&L|QVVf8g&BrrYM}HH{Zz1yVBI7{2kodWu0k8t=)0 zZ)&_&FWPWYY)YKL{G2ei-UDx`Z`kA3`>Ogk7=2x=s%2h4DaweBk4@IIBz|=`pqCym z4z}7|ef)4s23|%IJDQg!l|otg2M)p(y36BWgZ10<`US)P z87xpRDqEHIfBdpi|25B?|I6i7|5Z0hvD!|-s2N39fBc12*(#-6Xz1iHAM?dhR*>#I 
zd2t$!<3jL{rwJUF&1vGe0FH~}{Nsc(4F*i)SN*buoZa0= z;>FlWLHyo7n;*yW16Iv(T7J;z_^rU_JotTL)UZiIqr#?V#sqjL>cT_gd46&*_HXWN zJP7*y1tr)COlEz;lIQi1&xi=~N{I=K=8qN!Ps(oi>f%kLL$-=z{BJe-iPT z#-I=LO4Wr=N{t8?LdW((%<+w%+S4DLEKD!?!>3Y{785w#z%#gUS{^Tevb*hvj1%&W zlZ~l6^&B^bUj`=eIY^5L@SYtt>;s&WcX$v^ayFlT?4r3S_)H^6V$X4On6KkB+2%S4 z&hkfF>8aWF hS+VIE38^XeLx;N!b@NbZZ2@@k2M&<{1ml11eg_+(_4xn* literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b485d3882928aa5ca25ac11f280ec415a3432930 GIT binary patch literal 4311 zcmc&&d0fp|8-LDkxl8M9`Ca#_AuYEeS}5z7?k(*|p$+ZYyJ?Scg^40jBgztGX`zU+ zm8A%cZDflv%otu`3`UmJ@SfjIhS$s=@8|P=-Z}L<=RD^;=lMS0=ltq^%TwxQ%}`7b z6XwCFF`x}#)p&aDzJ-&YT<8v{NM4#^?y$LRI6rGhcKO+pf#1nW0_PWQbvXU@*XEzE zTr@P?r}08}IRlK4>2JXZ@E3>S=-YuAuP=s(*XIQQs%8Z41&AcGC3ccdq|Z=dTi^)9 zB7R74d{|&)a8e{g2nE8sNcby~WFrah?}CXfaMr*OfI9&m0I;2j7+();vNY@|1bhS} zpGyIy&I1}h9Q>%N^Tw%TceZKu4s4812#9!nJ0c!>?<2&dK3~O~?@1Au)X6uJ68Zxv zGxaceXi8jod~#q|aB{GE6rMzd1%}FFQ{v*4G;0XqGD5P9lB_26{|n5&!Jvq;X7Tv3D0JQL_>F^AB>x@dd39G1`01(UTc zAph9}z)}aEX%LY6I12pt{sEWYRsttt43ztdq3o#u=B<;W1&L}zw0I0O?Y4r`8=S!~ zZYVsP{1#oP83qOVB4plZ4Adh_;K|*=K9hpB8e`y_8i0E|pjzx5`p5PC5hK^PIrwfhAlHDS+E=;~_oS2a2+`z`N9w zu-3T(Ho1+5JL?G4z2H6!$^HTit8L)vvqiA|nkg8^YopH1FH!%ndnnlqK|;I{IIWxn zi8WJ+y2x58xAHB#su_Wf3@uXS8M?#vLEEXI{+qyfrU~V_r4t=;Jp=8d@~Nu8Hr2tF z=gAeW=3ssG9>h^+h{J^kP?zah!bbj(n||>;v@E=dzAJr%Z2Bb-CGs%nI<%GKn~kTd ziZ4RVu{X#NJcuzTtWc$^I-1hl2!|z!=y+xuvG5QHdFcT}oA(5GZ1ch;dm>)QYlK%Q;8jLi{8-u z1ayZYdea*>LTv;=rZ5fQZ1fshZSJZ(=om@sf;x=)`Y61vas_q22@pGADDh<0A;|ge zD0#(uJY+Y`CZ0FY=+eYfFm9X)RX^}K)aj-Y)me@(z|kEVFML7j=5-^J#d|?>LO1*z zSOerZD^kwb11}|;IvBK?^CmwD{?)Pw)}5@S*KY|RO6sgpgXtl1c=;dXjI;O2@ka-7 zjj~dSTFYx(#Y{W;822|i*2<4O4M{MjI20CixX{{}8DMdsk)w4Dq3172fZ%Ol_0^YT 
z{|q(o?#V&--H)SWZ9WP+I*z_!X@zLBnS|;1J#^?Pf7-241OB`o3Xh*+R z(qhLda$dVL988=>G`KsXHA&fIznRy`bj{l+F6u{O&TmVIN3$1D^%Ff2@BTOo46 z<~4DbXwsaz7hP0ZZ3a4@Jwjb@e=wwt_TtRBml#@K-ivVK@IdA#Iwv_ zsiJzkfr$2hhJJDz$us`(K8jSaP&vEDjyrwTLIJd2FycYHIT(+B8FTH2Gyj={*}l<;6J&?>diO--)4!ZS%yp#-F2B zc^c8GYxT*Ho-hI}@!~~17z~f@H>n)gJ_ScTH*m9qLk3igHHG1KAJaO+^HC|+hS1Yn zNphDw0hbM#FgR-m$0KIuUI8uVf{&>$(Yb;nii3m>3^WMkmhIyv{^ zsw>bivXwZTf05ih-ayl2#ahZ{$_>&_r&uLWJWHj_)P-yv>%!2EGX~yk640B3_H_P< z69PTkY?PEFREbkPKu+ygOGZQj`e}KPhJ$tE6 z!1SnCFqs;Eek@aLSP{kkvczpW&%g{15^vM*hfr7E(CY(G}Ic&Vm=EL$Wk`zA-n%&Wq;sVr-GKdX?_Mba%xR_KjN ze9+#sW$DU+_KSF8+1BiQBbNd*zvitu-zGD5zggd8;9m?}Ij!pml!S&r?WN$~qSR;(&@$ksOZKT(mt#d(#X z>pI8Ef^F_aHc>ZDRDN4Nd9y61|Aea5mEILzmHsVNg*$w9hqSJnuwzYiz=6b$8!bE5 z?g~7%h_C0gv#2)obb)!m$(_Y}Bfc+lUGG%Au0E!HPt*@5tJm+BCmd`2fbMt9bb}II zdt}XL=s6p5FnVP>qcEO!{^mjN2Q2Ow1z2tSK?&xm1fvy^l^E*!Aq-W!{%TDh)VYl; z)H(&|R~GcfQCCoh^=JiDv?&0diWO=t>mAAm&5gXeEmUFZ9*ZNq+!UxXbPjfDSx;x`bua{rbi8SDHcg3rW6kM7khYmZd7TOMWc~ zam>*%pI(=)73#D?Z**$sYppPsd?VL2BQmtZ-3o{J?Mv>`p6yXQJnCv)hEBw!jbk%j zc6RARdX_rm2yN%;MtN;>FSbj0qZ>W7(z|kc{ahwydUe3T)E94I%5eD0potU0RA!e*IXC$1giG`bBy+X4oiz zH&s5B9e(U)kiwt~2i&(Sus-4K3*z?|grsw@K11~CN0(2f7C$a3p4F1!NmOi9QZhF@ zHk_4VSaW4+2y4uR4~!(c2TF3ethoI6N8&Gql_%rq!Kf%zl^L%=4QND#6Es$p?N~_$ zzn6z0@5j%TBtD^RB3_>_t0X^P#ZDh7$*tCsJ;VM^Nos!3m9gRR5y`!hxd#b5k)&j( z zf9zEfHd+b=5}vTlTo*vBFH)0wc}?~i)B9$LHdB^tvz5xEQt1@@u+3d6O_RzZq)t+4 zgw!EIxhBI76WN)ARO-r(*zOy#rJN?Q5!gV|$#aKTT8_e#36oq{+aZnZC$Jq3EtPt( zoy?ClmHSdF){Y6WnPeT}Gd(5DX>^RAuU90yALohl%ZSFEGMyZ6cY zB!3^<_%P=PX)t?U36JUE=GSL?O`FErZfw^}U731sJ~4PrmK^M3EBEuAB=`4KQpb2x zdil9c{WKq3EZjcIhb^TnAK|1uk=!>BRh4DWowY& z?=(6(#3mirWCZJki=4*RAE(%RDR@n$Jc%>M?C`#yRF>9zPs(K3Q`kqUtYq&YCCT1( zC3U|zrxc8%I7|U};(uxU8z)yG4gdfE literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..0da33db3c308003647b082db9d88177f6ae33a22 GIT binary patch literal 3765 zcmc&%c~nzZ8vpKn?bU}~p)$}v-dD@r~;I>zdxmpQ$?rsM0{*lD+e z(|h;ap34EJy2kk9?_JNV33TAdP}_0AxOc3kfGl2+3<{8YBor5-|-{ zau#HAc9fiG3DT{z6HH%8Tg7(ms95?J9VLPC`YPG{1qvomPGYBa`Xv;NJi!>BZAeVZ znx3G~(#uoGASGdXyfHP~kY)k50xO>(r`wd%BFg^1i21vauiJ1x*o1S3NHGZNN9U-llfcAHuQEaV>f;RE{E#Ho_-P8R*!;3%H=3g0J1QXmzX?Jn(A6s@=iV z=4Kg4*7bn8?I+;dl6CN6MF!Q+Egz2>>CL2+*@=EQxgWWw9HBQAibRZNDjqa?KNEi{ zmZ>`W5W*T&u(>@9%Z6hrn`?mJ0ngyH$4%+odm|WIki*CYN8njaD9CkzklM!;Kb%$% zOV1pkzMME3N}6Zlwgv`W9DfSR_D@C!`nExx?OeR4FaY`ljDyC{dsAgD3^*?S5Uc}# zg72p91!|O*GID;vW|z<(&sZyJFU^G8ClM*b`6!P)lr{9Cis-cL;k%Gru)KNa5>Z{viS<3wjU|TyAllA zF>g@4a%HeSvxMq7^(r;b`deg3`3BGVM-g5%ZwkIYGoL;%J_50KM&Wepk5Ny95rcjQ zZ2f2^ZRT2-k3JmF==Y^6s%!Vyj+-5W-ksZu`#AL%Z%WZ1bBmhdhb^FMj8k#W(_nP* zr)xG37oCz6ou46jA3cz-wL2jWGg-?jUR*~unsVfBTPl)9-)9ZiSjmFR4Q$QCeyH)R z7Eh_Nl4J}WEUL5Kga3BT5&tE?)+XKR1H69PJvMrcKa;Ae21l29;{;= ztO3u8{U^E}Hi8}a%^j31b&;Oi?Aw(U##GL)U z9-Y@eg_y%@sU~MRYQ7MIFIdO-u$ENgt+za|^vP$C_`C%@yD^)_JMz@mMt?@Hi5S3G zZLp`}%n2A3MX^bDo#FnSX6do+r{G9LmAFJ7*JqdSAQ*P@0n=kxDcUObz;<@4DRI$5 z2&q~K&V{w2@T|Al^fr?u;sS;A5!=nSf$vk^ka?%C5Y@iOVH)ONsO^FME%yi z4OCJxpzF(LbQ{}!6~^xvuxl%ZpsWy?@33IW&H8Gou1C3ax_X-Qv~LK<1ian%RZ-tPWB4R`_$yhmR^0!vfgv&=;au$8oqVl>w{wpD-mClc|#MSn-*HT_78C zl1>QdiLV7~P+dikwQFPn&9+^p*sHxH;u{}QU+rnZ%6&$*_pTja|1UYjR-dED&uEj3 z`;m3wnxwxz0JM;XiWU+jvPI<3hji4>Um!G-fk(^9q)AFk@&U6dzby+3D7L#MrfEyT zVzHus$VR`diwdn&9?4glwk|HVcJMDz@zqO~_RvJ_ifOJcT;5X~_vuD{ThU6pkr{Wd zG;b?j-PdmstM=bsQaT`{Ortxxed#-cBC12m{dX)|Hza0%^3|g|mTz#IakNA|cIS%n zVaXSE>5lDOSvdl5liF_!GQkbT1XYZQ*1k)cFRu1B!79$gTKQ^iOl;1hyJ04v%ii@e z0orCGCuUm{ieEYXx!2d?PlJ)a(2wmo#Mv#x&0;CFV8uky)l zr=8dCirgMn?o;Tzu4dBi*pKHkL)P!s)y=wEUpQpL-l>iApSLqE8$Zw=U#9jgaw*@R zc&@@bPvlxrmvX5_*HGkIdC<^WpZk+&Xw{)PH_w#&77zVveYTGYR5lGH858@xc<4|9 zsl&z$KQm^9O+!A|ab}unL}pn|6+K^Wh8vujl`mkBY2Xkc;Xo}(Se@_H0g`;KL!Pgd zR`?dTTyM@^plAP zH8RnWW{(b_mz{IW%i27AIYD}{_(AnSgQ_9-Ec+7agl+XJo%Z*)=Wr)Y2zm8wl^9ae 
z1j>+@NvEb{W{DG16NPt|JhDwv<+8rYGj>joRZe#;gygsP@poU+2rl3y^I@-GU=XcG z451J4ggV3_;T8i-|D}JOIuHwssMszYfjYdtN|b)34t|6MJ#zdb>R|nnNTepFC1rIo zBQsJyMoz5*oy-FWd3fFbA4L5<$m$iIpb}-qCnp;898Cz6BmNB!bF>fN-NTQN3h>0L zSL@D+I+b7LHiF)-C&FP)6h2C@rd5{H5x5#A5xP#vu@~|0Ao#Zg{!g%=l&S62;s0Zj z_$Lvy|H~cz({7UT_CTe~iB*>MvjtGwt7Lps)SJ;hofk{Bg?1CfllSL&K9YO|8OQTE zynhlO#Pdn~*d)uEKT#MjjK=bOsL+#nS5Ib^VW!|gaKyhke~6pgNHPeV5F*US<_K+| z&=A)=A1*ZhI$>(r=e5FocAUoq_qgcE*$F}3vvo01$-=%Ng5;Mwi!>I0V=Ycd5=1)v zS%$*iEB2YO(Vl4u!AZPcu(yOKaER!-=24R-3G*4a`cy)&QLWX(TIAMI(>#Y`~9 z##o?}c#)m>!rpo{A5ttLzsW}^rGI)t@Z>CEgAWZC^esv`UGXDBE%heFCel@Uo(~X) z!Kp$G(qn_XXT^EUBQ+T=D3K!P2=yl^b{+~@^S4-%%n3TOujBo5I?u_HtYAfScuOTa zjVvHL*DcU>hM;T`$AVL^C;o}4a5SBopmIppXUxe?%yNxSPj}U4WEgWC<|bxjrWn&4 Z+`Wdn57nx}?EzkXpppDNiy(hk{|!(~IQIYm literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1e1b4765c9bc16d822b1bac7fb9e6d103c2763c9 GIT binary patch literal 4158 zcmc&&cT|+u5}*5(WfvA{`{6F4AZ4jaQ4kT8rB^{vDMsnNtyB{g6oZ1INE47CARwIx zLR8d<2-q7o&?F}Il88o6j2h*A-|{f}V*YsNyz??=_uf1A&dhIS?mg`9;3@Jlp$N*F zvU8)@6et3yuQq6?w1{^Z=^I`!;X>d@*WeNT5o*FZrT)|lKlw%4$~03z@hH9_#iisa z7R6FE2VM5KEuTGZ1pv}o7(H?kh8IZ*2azg82+V*bFW_^6gW^L2B7>45DNLEG+>4Y? 
zAz`GD{db`RhIEcVVL%V^^aMcbfs~UQodc7zF@b=C0Ov>%K+3W~PL^#YPg^mfFg7u+ z%9P5VS@YTBCww;gw~aDg$zONrLm6F3B^Zup4`9Lv#QRcoWkVA}QsTnmlLJD7l7nQU z=tWd$KuAJtN?g38xLi#6cO<-w3!h@D{{`kJOWr?`>YeCDJs~Ake<$^*!!)gP#eT54zwraz}emw}$=b_#839xxN5421AA>k)&03#)MHUWd&U!uT& z?_0S1t`S%fmQe32fL%}JVbu;1%1Gp5(E>|2y2lt!eB}rlans9d%IRS;NI>dC2oOhnP*a;64-$pRH5_PO%bPThIzM6VhOc zQ3pEPs){rZHvw;x6KEbj3*xSs5dTCO4o&zBio#W4^B>=UeQ6j3wrIe)Wh|&1YDc%E z5pPTU?zMKPF zJ$NAa+7YrI-A2(V3b2RJg19UW6d01wzjrSfu5kd**DUDD)dOKJ3tCEp;N(p?IDUzQ zOFNB`OJg-?C7p&hs~kY{fe~B|E`Xcw;$da750qxrz-a0KlsGm(rOQ0HwF5(MGVX&x z_GHj#HHD|oGGX^M9ngwbL@%m-Ln@(nQL-)qal9rt72n#6CO#eym$`!*L{ahRX;|i6UErhgiv_ZU5j&c=aEfi7on}^HPQe#%(BlIH9E_p zg}sO1urLuFTRVuY>&9X0%B9$#w>|t~{v6}CIO3H@xnRE62X)r=!_Dks7|l(@ry6Bo zRu;3#sDeqXM*|(mBpV&Wex0a}Y-gtlI;hi)&!~Tke|F|RKJUm>Db1`@tlj9ERQfV=vPbF(8Efo^ zpMWH=EDM2*hfbv8+Rwpo{~?ybHH4o3h64oEf${vgxXR~T@cttQ-FH2Pk`*~9^oSLC zAaOB9>Mp}{=5>%E$NkBIjwML%vI^OFauhXBJc%1NZ^I9|I-*rWj&N$&h?Rt&A+24s ziI(#OZv4DRWw?4SCr7*i_2b=A`W{cvn#b3nqR0WCto9CA@gq{}GzOqGCK20{HwSfZ zmzTA=*hMNVo(}zkX;{L-0OanRA}ze}J=~d+sw4L9#F96*N`-G%BkjW-v7po-qU}qH zdtP&{lwFuJafMSSC4M!Gt{zy6sh0Hc%r9?&6@z=!U9@K)e%S#2?V>u|Z_@*`XfnZ` z_tuZxcIXlMqf|`9DNTV$+jb*sqdvn;E7Byh0$6{DFNfj(u zS`$B;l?+!an$X3lU@$xHMqIzAkKHuyLze`Z*r`l`%qA5pQo1TzO1)s&gx)*hNQT#q zGG(LSQuQr-J4;PAi>HE$lMC_Hwcn#tNhK_UE-$bSyv?fScFNS=Ed#^GSu#CVsxU)( z33~YKS1j(e45ZDIRbE|0q2ZU0vC-ulIPO1i@n^@yX!4KcNX^>@8(tk@#Tf zXwAkvHS0{az@|1kU(=~T*RQuWr%=apms6Qd-KJs#-;T)P-nz{tMuENA0^8lWWiuiN z8~l##-cmkGeCtG+ZT;4YIV*pCFnp{&uX6rcjH6=Lu&vrQTT#!yuOYw2ahrzo4!g#J zI@eOusOx==h4mg)HaRNxO+}5~4PK4@{Y}NqK6`>s?yzs(-nw*u;=}9x%_Uz4^ki}- zIkc3vhny(TTRPBEwm0I!F6VNG)*W3jLmg2!23pJaC5U@YzDM`#(fLY(t~jgh6X-cK zdgG96x?%pL#pk{|FotZI4#w3tBygS*I5PcdZ5@NwO{2Az546!}zc6TLEA}Of{McVO zD{uMSRWG03y|gbcV1C9Zuy_{R0!8bk6oiZRZws>BC^yxi$Ra=3e)B|=@XP!2Lmcze zL>qXs3qqX>br$Ykbf6&2wZtfBplEhsxO@4m756S5D2(u^oWG7mSr$butZ~d2F77Uh 
z@~-!&b||(ij$YjC(;j}MyEw+@YX<#g@$$KQB{%yu|&QSl6k3riwmqmq)P!eYZ1CPdSo9%JG$Q3(l&ooT3uYMq-|H~Wx$6k_hi@q{f$X2e^;Q(wrR|0&kS^Ek;si1Z2TBiaxOAv^q64N1t!5hOHx7|X28bWdh7Ms$!NtW@VZA6kzjyXw+9NU+a?ic5fLgPs4Wc=y)v@3HpW0t^I z7Y8M-NC``x79ti;3rb8(NK;P@OH7JNh*vi;pKdbUSm34#@csjb2mnv|KL`H;bMo-^ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e9af93b02ee386e9ac98f1c9aea4130450e7a99 GIT binary patch literal 3781 zcmc&%d03Ry8h_9EzHjE6VMd1K8)t+RR0c$lO%|1uZ`c(LL1l4)VG#%rVnAGK02Km3 z7R}W3qTrgAhT#&JY@(S)38myU^L8^tE4`0(UBc{qLu6j-kNZ6LxgD5u&U?=L{@&%B z;rs~aBiuNebLA$6aUu@P0aOdToonMS#q#;GaF27~Dhl0~HoxCs)t=7*XUVCYI0g9; zaUyFkaA1e_RP2y807%DT(kl?AV4S8MS#z}76GRHNN|K;kk`$M!)2DJ+sZbt9$|j_V z*06sDrFJ5*0f&Kf3Nio?xe#&DVMG*y^`j|}AfP2;8W=eXBJwdIADNu>l@oQLnGFsC zhLRicBzzS+w4-9FUv-ourDs>krip_wB}`GJrypu=(!b^8AWzCl%*#w(k{g$#%hk!# z$RsT(E-@=3FLQ}8%Y#_C7Hck2nrqnlzwr5ox%ZlJUzvq*M@dyk_TREUq|-rFvmoc- z9Nd`NKv%r^Gu%5k5uJ42Y{HJ70efus(D5U-!`Ov(bokC6P*dnR==G_g_s4acIJ$IE z>q5tY+r{gUNuR^-Z#<43*q_JlSvSSYE_6X>(G~Pz-7VxkJR8?$CBcKHU6jOODzksf z1vuFH5RHZ~Jo%I>dNWjxX1AY!_cb}Fqp%wnHBnHxY(DOe4218z?qbzj!PNFP8F;-L zg&KEt!k48R;c-O{W$RLaeSIb|Y3r>-H%}i%ZfPxaZLvrcX_zM6oNXn|7I2ZjzB>7cK{<<0nLBmk_%9Z*tjmyTNtZ^VEpt zGKd@~Lw1`wp>IY9$~Bjuq!wT1vWqLKiuD4?MhjpZ7GnFUhnd7PvCQ-nDp-Gb9$N3j zz_?FGFmIlHj1CMxOF12=rdIX_!+SY%ar2B|^t!&38ouyzYMJSmC^PK~{L=R&_}1bA z`pC?1#P<2(Y}5D9@XRb+cWf~|z;!Xl-kr(l8Zs312i~@vksgiqF1>F;;SIus(IwL8$5HC4|zLURZ)hV8CFqOUW_!=^E*n-?FYedqh zTWsc4R`T-2W_Ew1Ejn?|70=&hBFULBPSj}nHvaonJN!z3rCGMgJNWfQ{cKdVmdQ}n zgPqed@ls|fG(3JC&tKUlE-__9jXyr1^Bb0dwuHA^@)@+5! 
z&qu+YN+)_$xGn2aQzbR&XvQzb{tI23?#+(6*td_wril)vUxG4TH z3Yd89Wcao*OMIdFGidfci{G!gK)o~7#?)@zCfa@WWvbpjgz6eNu9$qJQn6j>$8-cG ziL5+J(Z#^IOxD&A^_>1qR7xtKYvu97{LL#d-jIo%uV4tu4UtXp2$p=^R4=#b5Rb*qX^ zRPL$0t#zwkHMP-}s`&afWfl$*`=Zt)N%$V+LenFSpiB zTEQ6YqLUvhICl8zu|@?Q@w(wkz%wRoMitpI8=QH)E4)a(81{r2Pxq z{ff;!*UlLixi@@w#F4dg$44EC?_BR$KF@jnvFz*JN6MpI;#vwB3$Kc3x5UIV)pn6b zD`GrSKHd?y!E0TtXZodgl6#J>o9~_VWoxm8cjbaf`o2$gM;@zOI3@q~wayLRRg0cq zdGF5ko?}&U(+Y_D2?e0)nMT|qOS1x%^f>WYbO;q#r^geAtnyQ{0@}j|6~GzJ;4~~~ zLrHb)ku}oci+i3kfQ~UZq#y6n8If1|2-Q*8${*2YZs?JsXIhyB&po1N&q0r5MTZM6EW^4awwCSQ4OQKyFaEe$#WzWoOXOdg`H`(sYlu=I|ho z2zmO=m1L$Z5jaJ%p3X?q=Zcdvl7;t}Jidj)l$B$Y53Mv;RGMBRVfp=C{lgbFoC^r! zq{ipfflbE1An28|v<@+*evpWS8xD~0>-a3C=rw)^2#aEMzv>8-;@MRK@`O@^)SGFl zHGf7aOn)O78OcjhatEo?08;*dGy`Kb+r|=7@htwIc>SZ4)fJwg66q6DlQVT3O&FFP z{vA=XvkrU9!jF&*@WiU8>&}S=m7nH3g5Hx8;V>r(^A)UVmGN{0GKYzWp;NLOLGWz^ ze52$4Ocs<1wY56zf6OBPA_Dh+xx;_iO;X-;P%1R6^02KXfZAFm<0B$oh?+cju~a)~ z7omCbTAt@;laJ75@O(b6P2q!hK85#BF|KKe!c1Z2&-01c|J_&v@wEc+~-{dIX%IBmRmy9yu75KN$D}s5vjs{W;ls& z`HQ48;`tjvND!nA;u)vH-c$Da*eK5>Nx><+POvw+C+UzDGbBgMoh!)Gg>Eo)W9WnN zq?0vmt}eeUKV1Co*JRJ`y7HPkWM<}H>J1KZxuCT#}h6(ycrTiiP zvqO#bCdDRgsPsG^AWVZZgc@YW22FY~!F?I2$wWbk6ggk0KM8U0P{^9rXh|X`*^&Ji zUYkF7PR3vbE26_2D>-OnOmcADn0icRP#*DP%u}!@_{3B=nn6ua*<|Z-Udl_(bxzFA ocGl(OWaZl|P0rD$Wi7FB^P1o`!Brh*4e;A9oS*^1$v?2a0DmGVAOHXW literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d112e179eb9bbb8372b524bcc40eea2e5cae3f76 GIT binary patch literal 3997 zcmc&%cUV*B8h^iYPLh+bA|z=7Q79t}QBcIK%?UC^K^zRV$dXMEP`tIa2%@NniUSk} zLnQ2NTSv^_2fT!~I8vS#hZwu`H>4z^DdPw!R9u|U7AQamQNg z+o!bh&k8UMzXP&%;&la9I-$yt<*{LLNr9n3NkKZ1^dd4eFhm)%JT^{UQ$G@Yk7NzF z>=q{dFCgDpX_H8JTI4Cbz)}~Rul3{r%VqsCh;mGW${q@OjxPn>f4I|CTd#US&?NC>EZ0bjo?1$OvkDDjg(;X^*G+C!mF 
z6ST1?$z*7*v4>N8UBNVV6byL1MCYqUL$*|mtdE%k@xTs9$%1dNC1jZQGDCkhs=GzRWY1GqA~5{h(_VT4^B>a3I^v*TqT*ysjk$2%duaU8_` zB!XkQi(to6DQtRv7@Tv%U`d53oLkI>13k5lCm+7k6FP!qXt<0stUe?Y0&K|#QN4wfh;cpNUB{Sz5h0fT0RVF2y=){=R&qE z3)<@I!FII^c>m6Z#!PFFWwN0nHwZdz>cPoNEV#7a9(j}&fO+B>c(%#~%q4vseKZGqJ+*O2N!%KUA{fIyORP%Gn=jg`S_2;FQD1 zM456imi((Ly7cVEutG~Wd|lTPZZ-N*=Wz@R&OQH}zvtC0q*eV3Ym8tUo9Fw0%(xVR zXLts1%I3b0j-9c`CO2mB6GmIJ8}ur%Yd0*g10p%=^cofH`tUyKyH&x85f_5?2kk_w zODH>Z#(UVpRn43mu2#s%F#~lw&u5*>7^lO(YXixXeAvtST#(=XohWc^9%S_z!=Wr& zqG6AVp53-BJgwHFSX96Oy5%u}H2>xHKx{kqD-g`NmzIt3W#S$Fu{KztJ*oIt^ z+_a61w%}JP&tYzRQ@|#@oGq$VlJSo};d=K^(i*NR23DFj=;8$4QQpn@#TQsh6qDhH z1|_F!>t#4Pp#wXx`Z?6{vfYfRgrGCBdP*)jzk*~QnH+SSM!Hh(maf+4`}!H9zUOw;g<&Cep6ICc3$&?a1G#aY3R}?{&A!Pp#DaNdcxQSNbnh!eU6H}y zc;1t^`n?TybNmT(Ns@}4PL*hFG@Q!fP+P9$6^=)-rhjdjum zhG=KfcKqXm-=NcpIqZ>*J|JImn_ZyYpmp%O-C$cfL91!16dZU9(EWi&SnTgwkUU*S z^zjZM>izWr_UglR+*voY@qv@^X!xx>WIR`a^)8kX9|>l$iv-Rn+~*V4vq`LBK?)9c z$%(7v=%cxU8?R2Xxi7A8WA2Esn@QQM{3+{M&sMj>S9e{}{=8hh^2WuuS68_Jd|P>r zPXz4Z7v9<`A)iZDyc*W%qar;=wm|>MemK)oOX%(&V7_(gNZ~=!R+z&1>uZu=u2xJi zA854fCQJnMgulFi&~z3&Q8A|}9D=(HFj6efPg#SL`c{5T`D@ZRd{ehvc|mHrmUwJL zPg6nKdOZ_GhL|c`yTQQ9N9EUCn7-*f``}Z#RMEQ4MiUe6^)wf)-(oCJB_)c38QaX< zvaS4E4sO_P>0RiyTT#4m=SaW0h~AdsO*wW;S~4U~hcb6NL|jz)w;tM@HzB_7)NZGe ztbJ2fKDyW2TC(N92PqiWaE5AYfm4RQb-)SLwj$T9rtW)Ylx7#t%pDtf^+f6R60dy4 z21Dnv9i?+sKBWO|Wjo7#Yl1uWIG69LT=;py{i|)|In{wpsazwMirm_eQ`y!F+bee0 zhhHdk&vU8V(-_@T7kRC{GVe3xt(FZ&u2p-#NW6Ekbm7UWeMgfY_I2dBR_{N)>gl8V z*G^U+`07)@c~ZBU{N^;$z-G~@nu6B#d^3-|ZncGNo5T)L*H6_Jo!nyLv{C9_cd%o- z)!ed09d*TLa_mD+?{%*~)R{LivHyBUeaZO)@-*IXj|Np&k=u5g#itueFO@LpoW{Lh zsD_|hHV&dIZ7!G5=;)J&K9|#cHRw(ie6yvOk2IQI`4heTO7$D)sf*u6mmDtfstrlw zS)bl`YtUtxGN};LkR-RE6oVTZ8OIQ&utXvzGBJr078AzM51MPFa75RQMLou{ z{Nb_&H5+^LnZ1)x$c0m1Qz!DMHn0#8h*&gv`k8r`wA|B7?;uaKwOYjFpbbNw-ae&` zhLEQw>%SvUdatQeOjulal7RDkRr#U1zV&b7K7qT_RL)jf;_t#|zF(KP?D&tLv)fdB%RE$O#ydVoh zI=%@LUk&`9#e#^GNF|>CV|M%xQPuy;^OgNmFG!Xo&b`$A^^K3U}@084p&dflErYKKlQ1~-jwVtwP_R+y(XO9W?U9ddVWm2@ipHBp% 
zkM*YW`&Se_sN*@QgV0IPqKT(oGP)uDi2=TjaiOl^R1o8@hNsJ+#eZ<`Gk-p__h1H1 z>FU%q`9#xailiW4N2R~tY-NC-x^x zHcB4EpX09XH{EPn23t>2Gnl1o4AX=70GCNo!DCm_JsHOs(M?Wf`cJ2*xfJ?Lq4uOR zr>mp&{*)qFb5H7I886y~Qg>40q%Kl(t}fj_)@3=3qpp+jr{mME%++WtL2MEql(1}h tSkkDF`1nyl2?@$%lNDhJiIK`UlQH8*j~Q(*@st9*{)1x#fH(ca{0|mk!UF&R literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f3f7d2a7d0cc4ecb1608dee4e76b94cde19d2b83 GIT binary patch literal 4018 zcmc&%d011&7N41WZ*r4>M9obTKtL%0abXkOP)XPk6~PLq6vHAQTS{28?obpJ72E(p zR*_9m5LyM5T?My-&jq#0vseLN>-wI1efK8C>a+dhec$_D&zH>1nK|eD&N(yRmtQ=j zo)#2N*-;Me6iI;*02r)Z78|!}x1Rp2qp7})1!Ygd(-hlXZ{B>jHe|0&-nH;)HWZ)) zlxRG~rT7$!V(Hod6LQE#L=IU20JxtcW`qdq1Cnm2OW|T`z~YNVoB(-LkbkIL8A>4` zSJ(oDolp`gA^$9tcs!jWpb(%3K0N_I>mHPyJDmfQvk8s?4g{JUHV(8jNn1W`!N_p+ z#JCA5?cXyNkwYgU68p_YNLTUcE`7)|QB{F3EX4^(dhz#Vm}mw?2gXGPN5%RF$z$c3 zVe}#_$UiVTA}%sYRoELSd;}%uG07vO`(I%GVYy90s#EAreNU(=&gVVLBb;{31u<0% zP;F>6o?7-RcwRLLo|=%!Bgf1J2S*>omulyM&;5*WkD_O=-t`=~J|!J5_wVMJnq9=U zxsC-ESMPyH{2V&D=NNoxd>&aw53*NuT?FkLZo$ri2hj2}1uW~M0r*h zsA_rz#{hRUt<4;kxoX1sEhoT9NepaV-;FlZV_^G=#i-lM5&UfP6p0QwV|mRQz~-no ztUb^U?xgGi?^0v1(PrzA?UboR*iIeR;OQD@5!Q(3CbL*xYiDsY6*}Aqz1u`i%R?ZT z0ExU;Zb)Mi!s4g~;B5E;bXfLr`v)cy0-y<|tZoD^%3Xn`uOo=in}8mESr0aMHey%2 zW`dOFW$0-G0WZ%v18i-L@H+jcpjHr%Dw7?6p2KW#;?gKgu>CbON;(R(9AAUK`&R+X z)*Oqb>;M^;f*)VHoAoMP34Ur%1p7``6WK+JQEsgTY%s3JtV-TuOU~cNW;Tvy8z#r2 zYO`zXwSG246Z;n;!rT`-3zT45Rv=h6;6jX8zY>f;c7iqh8iY^(ML#xqF)*Js1Jhoq z0enYhao`E&{KAS#!R&A!SQ38==}E@0{l#B$-&_=9QmfT?d9)u&c$J1o)@o_&d)Etxn`A)?K{|`;{eX$OWZ6M3NM#Fo} zd}479j}tR-EUQ+l65YIQgz|)Tgo1YjrMI>5z0+hwgs2c0jbFizC*nc%yS-@fs%G{^ zErO+EodOf8SHg=alQjAFO+ms`Pu5p=ouE(d9_ar~HrU>y50pwFSKnhaX_mf&%WG{w z%NIR{y>64q&wscNL%HL*=WA@(3)436BYDTL+I!>Ri&K%f_|h%lF(Vl)7|6iR{a6pX z}cc7E$ 
zU6|>q7Czs~fwY`|1MBQDV=1xogq@oae&`a8nLpjF-BU1wvstkbwqx~d(|NzZH4kop z{kxs8QTo3ER@6iGhB2o>Wq1ravST`|-^JyI3oRK_Uj#qs{XYPFGDZ z3aoL54P)QJtu33$EelFfd}}!C4xx<#xQ5t;v<53jo%a-0>SfnW8&3ZSb-< z5p^btd0Vt?32uH0+hC{Pu$I1MkSC~ziJD>Ha=~3}FH2uDS)dK~#O}gYANT=wDl=H) z>O6tn(jQp`8nwIw_p-qFvdO$ATU}tqT?hvrKS7bNcpzb>rf~Ic3idpIfZlz%ku&GE z2KM-j0*>mD>X*R+Dh`_3>oJ5#Ho&JUZ{RUrW0tz7G80#@-`4~yv?WDse=s%6Dzv-QxAow*?8%C|XS3`}w(p<5;z@r`YsrqhS?du; z+o3eAz&>Tf*hOun=|#?IV_f$+lH;)*R^AW#U!7q@$+zT!wjIXO8R9 zgBP-=C~aZ%&aTTjUiuNbaorGlwCR;{8Xc~E zLrwXE77HpnlZV2Of@)>VNncw3RdC~Vaix`a6~XeLuk&?OA7l&3;XpY^`j&gRn+Xm z=k|xh!j7_i&;7_?1PF)+K!R-t9y0HfmVWibJm-&ipNO$miHIDsVOZ3syR^}7To9?a z;NaLlBT-uKDOE&pR7k9v9}Ys{Ae3A;k*qhNY1F6T{{-}psuibN0})FZ7#bWYr*N7^ z8lex^sFAMwp^?EXKd%6}1M{l7e4$zS!7gsH|tu7nh(8VLXp>xwj_o}Tl(r>P&7 zC?7Xt#9AtoN~QDZFC(+1(gdk2MCv4!hDhy0RC_YoU=Fjgmr7lknbwD9T2n2ROavy7 zbl%tF%*>|H3&*)GjBcO6jE>Aehn7m+nL*~usH$_RIirUMSkAQw@Lm`f*gEt#{A{FRHmt)qgRL!@%XUjLyPZ_?)k+RjPAw^ z>e5xItMdt`_hhkhZ|i7ZpSjVCd{m{=c=Ofuxh?oOAG%qze2@>*N|_?ad0{MbAa!+T z>{UhyL-_MuRsE)$P0LX0rBVlG=^VlIK(WYa>hb`~6?9J~F-CNg6PW(fDXK4p-jk_3 z>CEZs=yPAGEJ1xws$>~2+D5ACq}oYUqFEz=f88vc-tXauN0lK?s72p#_5j#fpN80*V`F zuvS0?>#B+y(dVdEtJYezxZ&x$H;C3}d(M03yz}~<+2Q3U1ysi7;p<8Z=bKfX zyK(Fd4YYvnZbR$ShBSxfn7e|6>e30Rt|$PYE5VpiAK|!A<1?15=yy)6`)9QasVLLY1d44yAf8RZ8&7>=utbr&xuKjM+-MJuB+ z6fr5PsnK$kTrYuHBt%b*QYL38QnU!yV1issoJ)w8$ z1fJ<*u;_6Dgzx<)TzaX~4o*KfeRZ^6b#>WyxL!oh(1Dsgn4Sf{0@NCRG zbasawEHf9Pe)U#BJhBH$=@0I?1YFo*1-!l%U}EYH=MxQKNw_N{E%1cVt%)!xs~7M} zOyEjj4XoGCgr4?$(3u)@WZAG4Oc(foWy2Xrt#yW!-vm&vKM9t{n#015eK4vt1|qii zf#%5^Sl7B6-O)jyIjsxVj7*@Zu{RLqBf&w_2Km005V)ffL|Yx90+T?wnFi*(-vRev zIJ9`v*qyJ3LD3jf5bgAa{9k@Ti5W()i?9MkJ`a}JkZ^SGUa*wx^WRDxCdDQKVN1(px&;Zo!>xb-dtvQ!~Zn!g_2Pd^4Ly*I-; zzY%a}6^7b#?!y3e59m|l1W%vMg^gGGf>nw!dS3Aob&tM>R1yTKDVE^1BoM}R6e3Oh zZer1vckpV5GdgHns!R3phmE~A5)s|kfz@PdLbm=nY8_lm-1k3%`r9|-i+%fnW6M2I z5T~$)lKn_!%tO(K+{mo+&2TjDI{KmV5pwF5img>f!^S@u^??9v z==Tn2i?1FU+f)w?;xu$5_X3u80EcV#4(PFK8z$WDjjwCugJ#1~xTP+E_lwf--u5|| 
z+u$K&!ZI_?FUR&E$Am+~+I$X&luX9@j@Uy+9SodLgWP>xc23F+78=18>XM zaMJ0n&Vz@8NK?>*!Ltv+>#e?^7d8r#d)i`8rW}C69}eM{$Bh7WV=UHIN1}_PkHfM( z!Kk)Z8)!_YV>S6Bq31|{s6X2Sx1P5b3`VuX?WsEecXPm%v^!84YT~G1Q> zxzO)YcXIXmiP&0=BdY6r0N--*J(_g-K0e}5Z?0wjbZlFJ1KG4ad9B;6qW?MC`k?37DxZBJ2FU(Ta36 z-fi+#Jj>t~QY75Kru|lcJ&MgCYDdcub>9t3HTVv7Qz$WzS3~8uu|x;mOn$p}G%4Sm zY*@9e#?(J46m6P*9qTFX!(E#oK^@H^Tsm+zu~j)4%Y5aHF1Ft^dNTJoui$(HZzp=J zx7_R~*Du?Ezv}&u$VgI#98HTkIw6lJ#Z8JgyQPlWI<7aWKjnZ;T%*fNv+Kvv7}Q|@ zxM_{e8EI;is=EtYG36l@vebi27FL0^O%``LIURPtUx7`W)yOR{AUT>BZ-~s@Gf=bI zSa`uMa8+(b?-D8=enA`!12Eema28%U?n0!R7ej{(7kK zLMV2@Aj-smSA|vHbHa39oPn4(&FJ-=Bm%3RCAvA{46#&ZN$RdN$0IwUF;oyl#XaZ` zkM1|>95Fr)hh%HGYI$VO&BOY_z`Ku0lYzyklIw(-nJvM&1y8_dO)m7$-^K}0O`=lU zvU##!hUk9ZCr6*0#|M3!z)x;#;l2m~GQwjh{HjrM&o8|Ub%Rb~4aMj2T_Y?EtQW5& zoW@?mtNQxj%^mv;Tp^(cX`K05JEl*s(zT&0 z_@BO&3APBFIa3gIPF1e?Q|#g;zHBwMZU)UUW zWaHx1gHrFDDD&J@R6Z>0_XlCS=8x(1RSuBG&T3h2&-sSdK^pZ~OYbEXqn0@rE-Um~ zVYlV*a^H*Mz;d@a80~jS5>y>f?or}*xqooYq=vb)e~V3cZSsw#68|f9Q)DyWyd`Z) zcgeqBAR1O+Q?@7O^r|7VIJT=a3758p)fL#T-lzD_v#;*Vu9t0-fon70WHUEq19fX_ zm{B%AceUpx)FEqn$beWfxD!N?7DVy59v!fn?w~fNl^b;sKQ{+v13y3Tik)2tNUA?o zChB-vFuHu7;!$gD`YZh@ZZp~*T;7{$Fm=S7_SY}kUKvEV=ixkobC#jpBVSkKs(Ni0 zIkM2CU$7?2D9UTG*^ueaUmHdH6nk6xu5g}V9OG9qAZ(xNjd5&1*}#Mr%?y*c!17@; zUOs8#q~IDheUBXBtzahYv=>G>!e3`H z8F%WU9P_71Ov1*3TB97ov>oM$vL(m&seA>or}xC3ZL!M+P>|V@(X+-Yv~ef54`sTu zFR5MhPn2zSJ?+ISL4^DMX*|7bt2Czk!LJ5PF3jkW4H{9l`R^%AxySEhFhPjsdR~W0 zA!}f296d{=462@8=K8q(V~v-UEr-NQ2bj}XC!6&7oPF^W2`MbIj7cYw6Vg@OnB*9i znK3l1f-l(BS5Ran-qA}um&;PkKYl{~lxxXouK-#{OSB|Z9Un+Uv|=Qe0y*MQZc!g7 zp(Tb4jvSHcfm#2);CwA#mWh6dl_gC!F;3L;ej!B7Abu#DMJ^!Zq({ZaDC9K3@N8@B4}9I)JYc(tJHxoa1rvVWc$(8`{5kz) z?LJ)zjnSL{H`bdFYOjZ-^ca(&>zb^)Gx06h_}aw(4i*IZB6Cr||N5VazlqHMU!LN> z?NJhJw-xA%DZ$fzrT`*yAzvC4G$v$d=aVI@AnaLjk$Om_(y`2&6@RHTQ|b{X^^!{C zq@HowGY`gKG`sSYN`2Xx*>}y%PP5QjcvbxXw z)5AktQlh=%q;l3@o1THgNLbfCXuefx)Wf#YBQ@GPSj8%&z5%Sg)+n4`II#56gR?LSju=cO=b9$HVPat0lv50iRicHWb=Sk{ZNk!m~H>7)hO 
zd9H;Xrtr#O@@R3g{!DzvmA#ryOAuP5%G0K0#HeheQd4c^X=%z#i|H|G=?Tgd3rANw VM>_{mfH}a&7c>w6GUgYKe*tj26RQ9K literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 deleted file mode 100644 index c92d73bfb11dd463e1c278b8eee0d36cfefa5610..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 
263 zcmZo*o%)9X0&1sdba7@(>=kV`bvef1&Dh29FUL4&+AEz55IfyAV%q7pHHtts=WI`J z%P;F+?F1>})EAxM+v^^@9;_oYv!e0!op|2ZDH`5P-mIxzoKG7a*UsA^w1)vM#8Gr~ z#<6)hD@))aoH_IV-HAJN=mCbuia5^XM{0ZQ6-xqXWA$C00kwNmHX5JDVPeDN}2o}7L#eWiK;B|Tnp diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 deleted file mode 100644 index c3966bec265a8da7ff350586855e53ad69f800ed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 31 gcmZo*ohrfr0ku;!yqUdOyqUaNy&1j!13{@C0Ar^J*8l(j diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 deleted file mode 100644 index e419c951696721f426be5649d731716ef4543547..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2840 zcmeH|?N5_e7{;HzO=!iJszYRGDLN`>M?$8|>_x$lj7L!7jIZf684iQt?-1xvMY+amz4s^hfY8<)=xSp$N#`a-<0sYWlult;H zo%=bx&;1X-{@G;OLDZ|V!D$|ryiWS9Lr8w|Ysok%j&AfdD+G0(9SwVqPOIvAC zLl`r-W+({Cz0$@YKh+{9Xc-=gB6=b^J)E$@k+Otvm%k^BusJEXX;Z3ZKGE!Lzafyw zu)=xxwPP$2&pMYI)URKiNAydN_wfneANRmd@6m_j zP+}i_j zrx52*Q9BRww(7f9kvNZW5c^GZwIkksmJ6-B`xzOg4nI1Ab69%l_Hv>JRnf@PRHZ_k zHgixn0lrsSS)vRSiJ7@Sg+5YiqB8?^#WFKqr86=z-h3AL;7dxC()rRYHME(LMwbRA zNtr6vKZUgu$3{{0r(PlZQRmIbi<&szjR(?SUW2$t6b`MDbwLXs-UQp;iN^w?>NfP) zMl)Fbb}Q&w5ev2)I0xG9tOJdc+2986ark~2xd+;kZ~BNd;}Km`UUd}1>3=} zKh9&{ky{-Y-@#Xcd80nCZV~E-YD^MiK6^$5>cTxBwqEWg?qMH#+>0HJ z@EboVLoY}@y9arFu{RRWF;gYRI!nbF>~pO-4)fJB+FO##ls$o~`Zg*H3(pYGd? 
z^}MC`kmg!hFM3y~_guAa!gzxx73=<6A`RUw*mr#*3-fNYHbEPzzs3I$DiK554E%?u z6Oc!>I}_)4qWmc04LM5?&sg~eSa_oxe%94dm{)!=559KFh5Q~5urc0o;%oG%f0qb* fu0$wmu0;N?L^!icWbQ?ihFduIqM7xgDHHz Date: Fri, 18 Oct 2024 15:29:50 -0400 Subject: [PATCH 42/91] Updated ray tests Signed-off-by: Constantin M Adam --- .../test_cluster_analysis_transform_ray.py | 52 ++++++++++++++++ .../test/test_data_cleaning_transform_ray.py | 61 +++++++++++++++++++ .../universal/fdedup/ray/test/test_fdedup.py | 18 ------ .../fdedup/ray/test/test_fdedup_ray.py | 60 ------------------ .../test_get_duplicate_list_transform_ray.py | 45 ++++++++++++++ .../test/test_signature_calc_transform_ray.py | 46 ++++++++++++++ 6 files changed, 204 insertions(+), 78 deletions(-) create mode 100644 transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py delete mode 100644 transforms/universal/fdedup/ray/test/test_fdedup.py delete mode 100644 transforms/universal/fdedup/ray/test/test_fdedup_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py diff --git a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a3771fbd8 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import ( + jaccard_similarity_threshold_cli_param, + num_bands_cli_param, + num_segments_cli_param, + sort_output_cli_param, +) +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + jaccard_similarity_threshold_cli_param: 0.7, + sort_output_cli_param: True, + } + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py new file mode 100644 index 000000000..a62105b2c --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py @@ -0,0 +1,61 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "get_list_transform", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + "run_locally": True, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_fdedup.py b/transforms/universal/fdedup/ray/test/test_fdedup.py deleted file mode 100644 index fa46fb071..000000000 --- a/transforms/universal/fdedup/ray/test/test_fdedup.py +++ /dev/null @@ -1,18 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -# There is no local test for fdedup -# This is just a place holder t satisfy overall framework - - -def test_fdedup(): - pass diff --git a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py b/transforms/universal/fdedup/ray/test/test_fdedup_ray.py deleted file mode 100644 index 78ee7cc04..000000000 --- a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py +++ /dev/null @@ -1,60 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import os - -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -class TestRayFdedupTransform(AbstractTransformLauncherTest): - """ - Extends the super-class to define the test data for the tests defined there. 
- The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. - """ - - def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) - config = { - "run_locally": True, - # When running in ray, our Runtime's get_transform_config() method will load the domains using - # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration. - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 1, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, - } - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - fixtures = [(launcher, config, basedir + "/input", basedir + "/expected")] - return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py new file mode 100644 index 000000000..34f3ee403 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform import ( + num_bands_cli_param, + num_permutations_cli_param, + num_segments_cli_param, +) +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_permutations_cli_param: 112, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + } + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures From 954dffddc11070366fdf56efe2229a412f8501f4 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:31:46 -0400 Subject: [PATCH 43/91] Spark test data and tests Signed-off-by: Constantin M Adam --- .../docs_to_remove_consolidated.parquet | Bin 663 -> 663 bytes .../python/test-data/expected/metadata.json | 16 ++--- .../docs_to_remove/band_0_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_0_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_10_segment_0.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_10_segment_1.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_1.parquet | Bin 0 -> 1532 bytes .../docs_to_remove/band_13_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_13_segment_1.parquet | Bin 0 -> 1526 bytes .../docs_to_remove/band_1_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_1_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_3_segment_0.parquet | Bin 0 -> 1510 bytes .../docs_to_remove/band_3_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_0.parquet | Bin 0 -> 905 bytes 
.../docs_to_remove/band_4_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_7_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_7_segment_1.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_8_segment_0.parquet | Bin 0 -> 1530 bytes .../docs_to_remove/band_8_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/metadata.json | 58 +++++++++++++++++ .../data_cleaning/annotated/df1.parquet | Bin 0 -> 6923 bytes .../data_cleaning/annotated/metadata.json | 56 +++++++++++++++++ .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes .../data_cleaning/cleaned/metadata.json | 59 ++++++++++++++++++ .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../expected/get_list_transform/metadata.json | 48 ++++++++++++++ .../spark/test-data/expected/metadata.json | 49 +++++++++++++++ .../bands/band=0/segment=0/df1.parquet | Bin 0 -> 3984 bytes .../bands/band=0/segment=1/df1.parquet | Bin 0 -> 4763 bytes .../bands/band=1/segment=0/df1.parquet | Bin 0 -> 3695 bytes .../bands/band=1/segment=1/df1.parquet | Bin 0 -> 3684 bytes .../bands/band=10/segment=0/df1.parquet | Bin 0 -> 3305 bytes .../bands/band=10/segment=1/df1.parquet | Bin 0 -> 4466 bytes .../bands/band=11/segment=0/df1.parquet | Bin 0 -> 4906 bytes .../bands/band=11/segment=1/df1.parquet | Bin 0 -> 3317 bytes .../bands/band=12/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=12/segment=1/df1.parquet | Bin 0 -> 5020 bytes 
.../bands/band=13/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=13/segment=1/df1.parquet | Bin 0 -> 5244 bytes .../bands/band=2/segment=0/df1.parquet | Bin 0 -> 4782 bytes .../bands/band=2/segment=1/df1.parquet | Bin 0 -> 3988 bytes .../bands/band=3/segment=0/df1.parquet | Bin 0 -> 4323 bytes .../bands/band=3/segment=1/df1.parquet | Bin 0 -> 4341 bytes .../bands/band=4/segment=0/df1.parquet | Bin 0 -> 4035 bytes .../bands/band=4/segment=1/df1.parquet | Bin 0 -> 4860 bytes .../bands/band=5/segment=0/df1.parquet | Bin 0 -> 3554 bytes .../bands/band=5/segment=1/df1.parquet | Bin 0 -> 4872 bytes .../bands/band=6/segment=0/df1.parquet | Bin 0 -> 3553 bytes .../bands/band=6/segment=1/df1.parquet | Bin 0 -> 4311 bytes .../bands/band=7/segment=0/df1.parquet | Bin 0 -> 3765 bytes .../bands/band=7/segment=1/df1.parquet | Bin 0 -> 4158 bytes .../bands/band=8/segment=0/df1.parquet | Bin 0 -> 3781 bytes .../bands/band=8/segment=1/df1.parquet | Bin 0 -> 3997 bytes .../bands/band=9/segment=0/df1.parquet | Bin 0 -> 4018 bytes .../bands/band=9/segment=1/df1.parquet | Bin 0 -> 4326 bytes .../expected/signature_calc/metadata.json | 48 ++++++++++++++ .../test_cluster_analysis_transform_spark.py | 46 ++++++++++++++ .../test_data_cleaning_transform_spark.py | 58 +++++++++++++++++ ...test_get_duplicate_list_transform_spark.py | 45 +++++++++++++ .../test_signature_calc_transform_spark.py | 42 +++++++++++++ 73 files changed, 517 insertions(+), 8 deletions(-) create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet create 
mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet create mode 100644 
transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 
transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet create mode 100644 
transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json create mode 100644 
transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py diff --git a/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet index 557f866a7c3a83d68e8842afec48e1c9af5e5cf1..edbd80b43e1a3e1ede5676006a991cffc1396238 100644 GIT binary patch delta 26 hcmbQvI-PZbEI%8A00T3FAOkA{Hv=yN-$tEmCIB{E1C#&& delta 26 hcmbQvI-PZbEI%KE00S=r8v{23D}x{d^G2O)CIB|<1C#&& diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json index bf26b5228..ba1f5b0a6 100644 --- a/transforms/universal/fdedup/python/test-data/expected/metadata.json +++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "fdlist", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 11:20:38", - "end_time": "2024-10-18 11:20:38", + "start_time": "2024-10-18 13:22:42", + "end_time": "2024-10-18 13:22:42", "status": "success" }, "code": null, @@ -21,16 +21,16 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 136.2, + "cpus": 32.5, "gpus": 0, - "memory": 23.89, + "memory": 13.31, "object_store": 0, - "execution time, min": 0.0 + "execution time, min": 0.001 }, "job_output_stats": { "result_files": 1, "result_size": 663, - "processing_time": 0.021, + "processing_time": 0.047, "input_files": 28, "input_bytes": 38040, "input_rows": 44, @@ -39,11 +39,11 @@ "consolidated_rows": 8 }, "source": { - "name": 
"/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", "type": "path" }, "target": { - "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", "type": "path" } } diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79fe53b6203893a919b73fc9a93777fa66f579a6 GIT binary patch literal 1513 zcmcgsL2nXK5T0EY+$N2|#<$r`I1r^1mfJaXzCeFOwl`6}@OD8b%X5Q>K-?#5&tGsS8g=N`p1p{mVY<}AL zl}*0`NI+mp7#Xk)cK1HErCExlGP)Y==xWrf!DdVq0R-w=89>HFk)(ud{*W?EG4>#p zHuSaLi9eaPx}L+a~1#=*`?K>fS5n=LG6oMqM#0R$rRksW5qQa4H9oR_cm&bTwMh ztZ2Qc#^_SO{Gu*&vA=b-PTf&|IuPAEYJvDI#?y${L zc%Jv>w`yj#W5s9$T~Su+i(-jk9LHF8T*m!4Er;4%&H?*UEygAn}Jf z=Lf{suFqcQk9u9Z&~t{3X(!)2MSc%MKAa1t_)xJ%dTh=een8+z?vNr`i2o!=geO;` zsdjVhjE-`zc5%*}s(pGMsW^leTEG|d?6~c}J#(j9&Un1l_I+=jow@$x#2aO;2iw+m S&e+cY1V1>$PjeMN^1lIVM)pPk literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9df2f3bd54e13d5078be076585302c2d0f4e93af GIT binary patch literal 1497 zcmcgs&5qJg6h18sb|Nuk;%(Z51u@#pVt%M(jF{ORn3)D0N5IHvHv~$7F$2ZIK;{K} z0vA4laj)@Nj0+bodtj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx 
zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(A|*aNsX9Z9<<${SOFtdE(DeZY6^=Bh4kp3 z@E3S6-c9-oj6cGI2T#Ty;Kjt5x2#m42QN%u=FRusoB2NGg%YpWOkoLjr^sXmn*dv% zc7LZd4xm724A>^SdtaPVnVK}TXlJ(4VQUGB_eHh!0)UK(B1sBZ-=!o|%zaQ3rm@~W z3x=b5%Xis^uIN{i{#CM$E$x4rm_!Yj4CoRV05DIny&}Pppu}5%c&G=|X&!&y!K9Ws%zO&4iPF=5B-UWib&PVCM{MVlq86>j`Kpc8ToVkIRUE3s(@ z;&_BI1rU7Dco^$sb8>9Kjw%N$)+<<#qXDs%Y%n%K0{xpj=0*PIRk^UctwEd9doL(o z1^x4~{(2edO&6cjauz);w-2h+x7v26MkE~S9dY8mc>X^p>;-4T14Yte+TjAOpT4j= zu<)rU-@}Te%gU#v4FD~sMrg74E>di-#A~ tc(3NZy0iy7R_BWU5^g~{&Bn|ZS{zu%je&GVwc!Ysk|b4+5e2C(^V z_h(#v3s4O~7;bOnswf(OdF)mp0@r=W zbJ?x>3920NSH$jW6gOwF`w*G1yNSF8m7!DV+Z|_QPf>gm)<1@j(7(wOUhHpPl{35B8n#_}?*-+n zpnqP{UoRoOtHtNEoJCK|?SU!{thU{$F$sryTb#Ntp8wAYd%@Z8K#{bVcCdi!r!VXd zEPN`;_pl=Avhpdi4xq)<6<#bpKH2w%q+W`Ke?DYsN6OQq3C^R^pwNK+$UN@UAs>ZN6*5_KbWQ9dz7jZUdq zH%sT`WYNuU=Usf5iEvOZ0>6-T@wy=XKC!XEGK4Xfu=zD}>Q9WNgyr9N= swf?Jfd$?`&dfT;r-xT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB 
zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5601f5cb07d71179df35855265cde6b0634c130f GIT binary patch literal 1532 zcmcgsy>HV{5I@_A>!N_ts?V~e3>m=c(2t~15velpl9nhCN)jN24CKT4Dr!DR(nKBE z85kH@85!9a8TcC*PzNT&j2HlM_iVR?qytM&$vfZ2yZim#Jv+w>QRZV2wv}ZngGGSl zck92x#v6c&0zL?UBQjKurHaIWEir3_X)K88YRJ&M1VdBYR2`OQcrhp{U&;f>XlRnG zyaMaD6k$_-kV|YIe_GXrh_2CQ(0tJ&(6<_7@C($J*U06 zSrDAhc=MAK)%vNO@?(GVD<3qvtx;Rh{3z+)B^>PkrP8UC?F$`m&Am-^pxT}LmF*QqeY?AL-OCJMU+Zqs5BZ$ z-WJXtt<=bsBI!~({?v}fBeiJ!sI(Uy*E_{#EnPe;#R?+7ofG&l6XBp7d3HV{@Cpzo z&^g~Gyy?RDacs9)Psf{fr#Py`N(YGVhKq-DffP3-^Ta33xy`q!ID*?DP2$$S?HFRE zB}HA+rM}(WjXc`GJ`?h`Xg@-+sh%qiUNFM_>fqU7W3+1b`>WN#pf?GR8-w9~uN#hD VUyH6q%tQ#l`N0l;sdM<<{{tHM{bm3F literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..02bedff1c8ca8f026eb6a3d7664517b0f74cbb2d GIT binary patch literal 1526 zcmcgsy>HV{5I-lW>q3Oms?V~e3>k{mp&xOhB2smENlO$6B?%%eS@PjHNm28GlP2nj z#2>&v!Op_MU%=|B zyB@0mR4=8fgnL$*F-dOOOVE^{_eJEUqqr!@-6{Cw0>L#fJ}g|byNA36m0_>q*qz?U z78Jip>Tk(-7cxEtk>qsQTbj)zIcs{vpG0yE9mEG8y`Rc8KlU%b z%9-754cilw{JLnoT|}DG#cydji>9921x+4UZM#!rG7k5CVB+4m{*9A%gR|j*D(g{w ze-77hes{r`_f*vHAyqaM^~=I4fF9MtyjXm8yyp%{y9{iTjCqQ4Uci^+3FjDS@kgAu zc)BHGDNJ}uUun*>WYPC6dbE^&LGkCLM5%O^N~4qFZ4vB|rbeEW$d)t7SC*NK*UjWb z`PdvaI;CbkQ#vmviXy+A7x*v};h7)1#llY#9vt+|M-6aMPB*y literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bf131f43cbf10180944b4906799b7d6288c54724 GIT binary patch literal 1523 zcmcgs%Wl(95FI;->!J$Ms@JlmEFx5_F8YXxib&PXO5R0x6Yjz-J{2+lO3pO3uGk4~WXU@5EHHxnkQ{^B*l|z220-JN9xan5DQ2-FpkpwZc z0s1c?#=_bjNKs8)Zk{=VVWrWt*oq=4Uj*fcpuf=N|LLM_Asb*Kpo*daSj27>A`tZ< zFJ!msC#Z77UlF_6C~nSU_aQQ2cN2LHD#KpIu{*tyJw@?NSpOJ8lCxzOG@DB@%5LYF zI2obL0R$g39>yBmm>yfOqsqaGdIjrAG$6K|4aO!&pnsDmyx8BoD(7~$HEg@|-V4fC zLI1p@zg|Lmv&9#*oJUW~?SU!{thU{$F$sryN1VDZp8wAYd%^keK#{bVc5n^XPrtT1 zaP3o3zK0b_mz7VEbpS1<)aaC& zb+dF)P8Qw#cHYH@nFt5vGVlvo7q1KA4|L8C39nfkJx?At8)m9$bxOl}vV4yC?)!K+ zcah>p=>qXFIk)&B6-RJKWJ&z`4?V}#q%zdiEcdPMapLqY_8C*OMc*S7i|YB};RQ9` suQ{(S?BTZ6?{C)}r#Fs|Y-e!R>&A_HJH}2zJBR^z4;m>l00$Gw|Fn&mB5!d%2k!`pB#MZqFOLbF6;GjE^zmb+hY|S^vySTFNV`XiDVqtjM2b7xVh8%y@~vdDSo6 zerMF3qxb%n@s;n!&E`u+E~2NO?1PpVI$gKd5D7ZRd-B{B3FB*DI7}{v2db>6^@A1M zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0mccu3lJd5>j+1w^dS%kl(~5*<87>|n7I zqt%AK}bR=(}@YNKY(z94)L zr+5$pq_n9}VLle3gNNLlz#Xw=nf8Ai27zW(6)1_&QGiUE}Ar~Vt0tMqdK literal 0 
HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2838dd9727770220dd6b3f3ae9f0d4bdaefd8ec2 GIT binary patch literal 1510 zcmcgsKX21e5I;MKaZ!cRs?V~e3=t|;hyIC)ib&PvC2g%hC`k}0WXT`LNs5{{IBBA8 zd=wTI1_mS~J_8doV(7p}Kx|0dd$yYdhk+$0`|jPndw0K|?uhZFiRp*LY0G^=}LjE1yMZDtF7b#M06xUkU;+-BuG@-2Pv+p zEA4Z4II6WAi>xY=@>x*63;KyJ|4$Qb8wmgr0aY9g0QxC%P(ZXdgw%Hc^)-k<%p-+( z>eoVDz%v^}pCz~EC#Z77UompwC~k6cw@U#zF|rBz+t`%d9pp8vjhvcm_nfiKDSi`_ zUxNN7s=tpQ$#B_!G+Rh=(bNLa;i&~X^Tfi@fY|)hzRFWxWS&>;!tQrQ-I)n~yrutS z7bRSr(F;dm5k0N252`e@x^}NYBpm8Japt~w{$F3%3oeESilim9!zEn5<=p{&*;7%z 
zL={Pwl@GBE04<@e(sKFf*?~7C^;+CxGND;YX$fB@k154Khn`T{p}7u^WijCiTjeOt zGmCy;(PLY==Pdq=DPAg4=OtVJ5RG6)rnOByPNi~hkRrSmmw|bR!)2v)p z(`BCDEpmLAiEvP^0>6~!csYn0=#(BYyk>d)GJV=^nVGiLtBjiI>ILF^=;NW3BgI46 z660f1YSANB9Kjtk%fzq$$aA=6D#yB-)q&MNO})5}eI_j0VdoKw#p?Ou;RQ80Xt=L0 o?a_`k80<7$*O?^8wmUp``bp!#uCbfa4if<01IPGNE8;)=AFJv2-2eap literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7cb2cbac4ca976304da8c64e8db39c678260db2c GIT binary patch literal 1513 zcmcgsPj3=Y5T9L^b(=H>8{cL(;XsUTdax~9Qd3MX4_Y@UR=`M=3xQ>UHigB7LVEP( zC-D<#()bDb2|Ri519t?-)3B$i>jWel(Zu=#1{ zS0?oiAOV3T!^nVbu=@|OEyZLer!^(o(UhoHh0T~M0{EzFB>)i5Q*S(Jc3hjStCD&xsNV&nZYcjVL?>#%L_n9q0I)>ZoluY*#zkRKho(e5By6#Y z05QToBfTGn=LG6oL|rl~R$q$UD>HF^a0&+ymgjjIym|GeR#7*7vWNl)tsE4X>a%I?6*Per|!RLM}(kIB0LdRklOwc3l5$H5_K z-^3lV882|otN15-#yJ*x{0ZkhUhMg`0yaFSRgv=&Ma-j!8Eq9`QT-)3AF5a)XiN&c z?-yP2G$^z>#fFtXv(0?AZRRfsCU~|{i2bp`ThNh&ksA14%%guS4%!$pTr;P zoF5Wjt2TX;JMMO@eAgb-C+%G09Qo~sd^q=+;zNZh>9IJs`5}QLxg&~XA^y`K@jbaB zO|=>$dw86Ey^C{ZRPE98NW~_+&;q`oWkxOU?S(VhvPYw>mgl*%%+&G5r|vLgKH4_7 Sv-&|AAo#%%ewr)zk^c>+^Y&B# literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79fe53b6203893a919b73fc9a93777fa66f579a6 GIT binary patch literal 1513 zcmcgsL2nXK5T0EY+$N2|#<$r`I1r^1mfJaXzCeFOwl`6}@OD8b%X5Q>K-?#5&tGsS8g=N`p1p{mVY<}AL zl}*0`NI+mp7#Xk)cK1HErCExlGP)Y==xWrf!DdVq0R-w=89>HFk)(ud{*W?EG4>#p zHuSaLi9eaPx}L+a~1#=*`?K>fS5n=LG6oMqM#0R$rRksW5qQa4H9oR_cm&bTwMh ztZ2Qc#^_SO{Gu*&vA=b-PTf&|IuPAEYJvDI#?y${L zc%Jv>w`yj#W5s9$T~Su+i(-jk9LHF8T*m!4Er;4%&H?*UEygAn}Jf z=Lf{suFqcQk9u9Z&~t{3X(!)2MSc%MKAa1t_)xJ%dTh=een8+z?vNr`i2o!=geO;` zsdjVhjE-`zc5%*}s(pGMsW^leTEG|d?6~c}J#(j9&Un1l_I+=jow@$x#2aO;2iw+m S&e+cY1V1>$PjeMN^1lIVM)pPk literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 
100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9de62574605a07b28b991ff2c736fbf6e3a7f45b GIT binary patch literal 1513 zcmcgs&2G~`5FR_Oait1{inVMhhX{ey18w6*MFjP-NsAQ-B?%%0U!ufulAltmS!@OGnx|bXiD6x!d5~R0Rq(ZGJuGQBnTBA`oS09YdIRwPJ{;-av)LsQ}&61G@H zfCOP5*5<-)qNH(i;xⅅ7#3Y>fTGn=M?H&L|rl~QD2(fsW5SVa0&+ymga%I?6*Per|wRLM}(k88I9^o+K_>-Fa+kHSOJ zzKOeJGhXDJ*YHpFlyfZf_+!p{ywnS7MQnIRs}kpBikQa{GukS>r1}eT0aU3>(3li? 
zKPYXms0F(f#1fE=p zrdsyM86M|e?ctmmReSV2QgH|`vXC!m*-^`Xd+tuQozZB!<@??&J9Yi>sW;4;4|dF* SoPLl22!C*dpXMrltj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(M%SvVG!3&e!nK$$1&3xa^%a(c7VhT&LJ0&JF*Z|o4 zwDUWya)3Al!hmhCyZ1#_f+bSA8t%+j8f-3!;(1YRtq35aqezkh=65N@6k`t(Nkd=j zocg12qwP6tT~oA6N&709N2dBeO$>qtOa@d*Gyv$Q*=~u#A)(}3fLj3{h|O%2SeT)! zVGa?CMzO#mv6yVWi_eMOL`tK^*lYN1-%D_qZ`czBT)4(zi!=f(cz)i`qp-Er^A@IKryf6yI?7Qd$DGI~a75A?*y z>AC$TOW-{3$ycsQ;4ggPAh;YJXo`_C_E&K8R(A(hJ{9d!(G*kF-p6kP7%6?7SF6uX z9tA_vzsWl!Q=aFXSMViy!Z`-I{D|`|FLXsLj|op{Rp7ix7X8qoM_Yv#6n{=ilqwXd zG&*_S6G4wOHJV9{Y~9YicC1{cW#umFC)T9huXS2>?Yy3?iu_(#;KNLWgK`o2m7>5a zK>UTy`2pdztCLsR<4)Vob)0@}+{)I^5Z}WY9?k_)Je02xADeTBA5d`wcSx3GR{wF} z2u-R$UG4hN860O`?qZ)QMZ0t!p*U1;CIMd1)5E6!=G+}`Im6*r)Azk;dgA({Q*V&A W9&B6N8Dl>M5Ik^*f3PzC-2VX7ZuN)& literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..37aea5168fab44a7bd45091bfdfe5871bee8d360 GIT binary patch literal 1530 zcmcgsPmj`25T8=$b`cG`@ilG2frB<2_77dMM$8_3u)7V4E1%K)i& z>%Wrm*8pMgB<5QJWSQJrAyE=15|I+B9IU8vkgGsyMidM1s*44H5IRvo5aarHAwk00 zCP;BjU2Gq^!%@BESR|%Il~025P0*{l{69^!6~=&s0970f0Qx0zzley>h^0OPyjlVg zA|SzapIP{62t)z5DI|4~-;%$s%A)_p_)SM~k@LHD5bzVlH@l#}kDN2Sf!v1mkyCf= zo-?*N&7XquThQNz^$#KvovwRDx0yu$H9^5gKRv-Ry=Umj(TF0clPbzoO+Vnp$xaRB32+?OuaOIN;l%3unjk|N7Hja5g+pqFO@R zn!{0?-&xS-Jr(6NqC|CB`5d_kpe588tyCWG-}8o~ehsZNna~`iw2Uv4hm>NVLw70d z(0qr-a+vUledQ@FFpGX*(PK;bXDt4NDPAgHV5QN?(Jl`50~csYnO=#*|Vyk=#5klAau%xv50RY%QC z?FjMR_3=>3k>aLYnej0xwdgi0j^K8fW#ZR=;5l3~m1kYe+Q91XrJrwLp9zb0*nWg! 
zv3kCEctK4L8t%(ud$ei|2CEI%btcKN?G6u}e$u$JW~`;PtptGgzz*JJCA{_j0OaiY Ae*gdg literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3d1f158e9e79bac193f88f94d2b548b79827778b GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJS(ac$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1h@mC2oUSy{w9@^A|VnfO^GT~Plb&+S=t|TTP*^}6v&b!gYiR3iMYNCiKMQrw$A*) zu-5b(v8KxE7fJml8ApclKST6QA^<`LQb{rZ3kW4cKD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t uf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEVGN)|KlR_NHuSUr literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJRuAc$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC 
zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A2STKw42<95IbXmGq9LDE<4uke-BpVv@1u5v%nlfGtK{}wpq0xsL^P|%? z)It!iTE~h-=XyZ8#$1jerle+L(%BMRu2v{nOg4x8R)@Vqhqr{rWBh*%avcFxh@}I8 z${0K#2(%w(vj#>BS?oW!CwW+_r$Tc!^>TgvR;;Ehjb^W>rC%DTW!j(jqK~RLg;%D@XTAQ>+`N|aoE_9OjR;7<+T*=-d1vynh=sAc-ahFVQ5f)upxq6#H=1VDIdoGJb%!`WYZzwmjfD_T zOz_%x%S@-?hGJfr?)zht%aCjGZheNAJ2PoqML)5Z|Z&)_}IN>8nA0bGHQ|Gj%c#d!l8VIWqgpA56-?pJ(8pCJnZX$71T-V^r zYi5lnendLSYnNsYD|hui&$Uj7i{|$}4DAi)6Kk7(gYO=Zxp!r(Uvsf~aNX^=`j}TG z2Us3I6)W01Uk{tk2&{ACXQ#f8d>9rI^kjb@{l(@=VT8|^x;8XvcdP%4xKo3H4r>n* z_sbMYl$h`kKI-Vxaz}l~SeYVPkePm*?->xXJy)5t((TF8bsK9^Z^fM+kT*r6myekGFC+lnWE?|Zl-k99L zKrVzlUEE04M(U19TC#{?Ze7GMYO3s;S(vtW>BKclyCxcSGw69wh>?>xIr?WhwvG8# zpDr+lqvzF@>Q}Xw2wz&)|9YA599s}s`n>NAY_rlrwDaCwlR2uc@AG=(QOC-+T4vnl z28i0=;EUVe|Fo`=^TwlZokUEPRS`*V$MP27-j!~(d-iqM3lm<+{d3$3VjTEKub(Uu z_c|w3Dsp2IU!A{6*E5UVb?7OXcG_$&>(_6=*Gwz%tgUl<=7`W!0fBA}T*U|Fko!WU9Bg3IOcP!l(X|{UZ$nTP*@(tkeXf*yAOZY$@dBhKGZhX zA;#UBUZiJfYim0p!uHfYbzf0afKKkc7xN+_Mt%Eko=-T^v)&{w?~p-kb9LxV=jt)G zZqM^V^Rf?*J~6`I+BGOmA*SALuF)AaL3ic;?8MvF)v3w*haJd&DYJ24{bWq z$F{YFjs@9)+Hj2;OLUVD9tzkq%{1x4gca!X(>uJfJM5OGD$Wi@@x9rF+zpB64F~mA zEy+Tovqx9nj;ot{rbkbbu43OiJ0s{>{n%pejnD-P*0i^Oa%O-p!fnCFEc9>MCehHlnoVed&ole&a(MmrVnn 
z-EIDD7eWi{FOubC7sE6)bA;!zw5p55Q1$b31{3}3<`*QqT6cK{-tRYrYu7|yJbKq% z&UAgQ_`-#P=7Le7-U7PbkjSXai$z6fB49z zMv*)vX1~rw3sb+qVUK=s8Ft(C=)Qq4_RD7-i`o}8FjxOR;l%FyXP#^mA+7<&Q*uvV zh%_9(nPr)oU;e|d<_RX>o5q{2NOBs-$UWCr$852`bg+9#n()V2KbM|p^83){ymS6k zYJ@tajyYqGe&bJ7-CYmpMwi6A0esDo4Y8AjaeeteEc}k|jC<|O94@u73cr;(i|}*e z4Ov1zE#7M+e%Dl7@2+!TV*4ns%e1a58qXi$M_p+0sAQ^(T`h1=eb z+)W!Ut9-l2>UcpU#W61Hc(_`3(VqRLtNE>OeiIF__kF0d+u}rwt_tWH7Z+WnH2LAa zkz|*n`;?Pjdy&flZl(KsUv6(0pGBzgQ|p#b2=jRn;VZ5=6reU=JAa(n1sBF7+N%!{ zk}z&9&Cp}?mJcnv&+3JY@r@1i<*|K_+%?EDpz}C1^^n6}vxSBwn;b@>wj$~%Q zaygfl`@UeWSj(C2MrQroz#f0#yN}DoTaQz%~pbq^96>;A9x1UES&V~b#G)xLdprl-(27v_3M1lHW z62h1Y7!%_NDRq(nc33h7!mdQsBreBgTDJuO9{~UWzlNMZ0ah_spl|^2IV#gGq43Wx z`Up%j>Gy(-Kk50izDJ;Y#h&H~!b0SQV zg2@o2e!N-&K#5UOyLg3|m_%SHi4!1(l*${n52frijw3NuD){P>_#9M1+3WwWM@tKE zwCtT^#bSOsS~5UFpa2||_EgzgNMk?n;uVhL{WiqvYYO_Bg1)ApuPNx?O+jD0Y%*ny z5^C+|2bxci#LziuSIw7L1$z0KE+#6x zKQIEo1AK7;|MZWzwVfX)`wLREFSw@u3mz?W1n8T|m#5C;ho^}{Vkohezf2IQ)hGD` zMKcpv00rWJ+X8Z~8l{{h1mT2`Bsjr<6Rk>fcy#r2c6D~)`hXv@j~_^H7z9Otzrp?q Du#EMC literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": 
-1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + "filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d67b5bcf87dc622ba80e30d210bc1b9b4429fbbd GIT binary patch literal 14933 zcmeHOYgiLkw+@Ji5meA1DB=bYY@39OqT-cs6OfyLPz9`$$&idpX5!3*K(&@CUfOCE zue2&v@Jef~TCG}ZRn&T`7p#g}QB*#w^@dv89<|@M_9P&BJkN8!?>W!;mHZ%?Jv)1^ zz1I7_YwgJ z@p5UtC-t8B=<&s=EL1E3;f91H(Rh zUiYNzNoFarKGHSEA=x3v!Rx~H`w2#fw)o?VFtZTUGy? 
zN!4WPt}&-gmlmCm7mi68c;xD!4{QDKq4b>d0f*eTmHEaFn(t+#Fs zzj^6w;i&!JqncS?9ADS>Noi`2sg>;A;ya_?if;6sn)ozg$^78z<%5SWEU*1+@~xds zUmWXF%dBildvE6KTHldxOiliL^O$ZwZrtcn=kD%a78CPlxYzpu35wghoh#~|99}S> z*RI{e^6IN@4)L+fcbGnX#JHVB+WG1SPQ7c|&g`J*5twE$eYXEW=8bRGMIT(8w`1Dz zZN`E@R}$}J%70b&cfDFVk19&MA*VevP#bi&mMDxdZ8)D z=yreTy7LuP5AyPxes*)~?leKOBl(}_ruz7H?=(+1ryiHbX6J+*F0buQog@M$abq8U z_jp%L-c#CbTwcYh>Zv||pL#s~M8JTacVq9pu{k2%DW_+jPJdK%v;>B(*y6%|weIqp zsYPQlYLu7Pe!j|e@coEg3g5rq_IP;rQ2^i0yC~}Vh=A`NCtdrm+m#cxnlJK0g1-Fl z+~gHY%PR~uiHk?96L;i1_1(Jmn_l5>mQFZOnB05TtV7XSf9t0mQy))IvgiU{~EcM)QT65yO^Q|y_LH*k$+a6Mt z2Q#YqDZL~8wf@kkvSU1Zu5Z?3 z=8NR%YcfB+a zYt#?URZXY*Pbu$rVB_{_yO;Mb9aKn_udUcrJ#K2*Wd6vTJN7Qo_uVo7p2^kyjlk2a z@z%YILQLX?Lf_cy4er0!S3O;xAZ9K7{>LW6mGT2F?7It!m_6TIiN5(HqV2R!-ICTX z>h(tBPq%-W>UOB6msf{O-QEYW?~m3r#dKM?G5uNBcXy3E8>=3cernByE;;Ulx;rlm z9)HF=^~C7do?>F#xwFDz$38N@d(Q8tp!RpD@`II@+%w(w{5mgU&7y0x{9Z(tsS2+# zyMB9k%e1dP!-r1x`?lX6&Aw|##{a?p>^n1Pj@OJ)4`Onh_KfPeW9!yy0KPj%0{xx2~e_ISK}^XJo_=e=>7UcXJOK|y=Q_PgWzcewuP z@u+unCCsPDYt z)~KF_U&bFOry6=5ubk9j+@Rp&qgHC#$U;gCjp)>SzEsG)yRN4@MCR_VTE=$JbVwbx zwcUkdY3JIwHHweCR?6Bt`o_$jU3Ast^I^wE!>qw|Wj@~>pR_v8=fUO`yUZctt-Yfg z%bpHJF8tNV{bw%6|GGOxXZrnA!_1jgN2mX><$T5XlXo2wt7x(R&|P=(5>FQmsvcVS z>B8>Cmp%vm)5`t(ib_D}g}OZg^M$&K|5<>`s;+WBSPW!}lPa~wjVqGl7WoX3Qf z50@`;IeRK;K@|1Rh=&7Ig~b=Al|LJi*1z8%s>Uf}p7`a0gsxkDUbLqlda5#g@?`G8 zGOvYonv#-FS9QGnC)pIbxP#Y0yV~mqy(TpZ0up)!AFgKTNhvT!M~n{+38Sv}ZxM?}QuM{6otg&HMP|mXPhw zG`0IGKl9yuefZHG`&Qp?)4AKHe(QfatGF^acv`wg&Y8PApLvYvVeD`(FOIL7@ZO^n z3nN17+mf==*GIcJ8tNARnB99vRI%UIDE$pTw?L-u57f=PpocA@ys0G@^1SNPINf_(N{;m%ssGX|F@$Y?~FSdlKxi-f3vM;kkTU{ zzWr6du3y#pxE;|qg?2dG$*ZixmD6kX2J7#R@|0CNIs_P(kNC68CrbH4&)$c&$&xST zG9Pngzh9{we(nI77A%@}&p2fY4CE#T3D-I7WxkDUqNVxUFGH1IG&}o8xp0 z!RlaYhCzvDK}4x$QJ{4c8p-4KbizdIgduX6;Kk(894peSnG(Q4o)9TLO`;UZE`mX^ zMJ9q3u#IUjvLKJqWW}bWrKV>pQ37Wnl-NXdi+w0KExp^5b(|@aG(WNr1d5?ZDLh<0(n|hg2_uEDBLx#h z(gY)dwI&EK#VCS7Z6if}%funRh zfig%=6rsh)!9tAS1;l{`yr6y&5%3bh2pqBiJeXn?Pw8P?2uQOl1ha|aY48gPIO2=| 
zGlAj@X%Z%)S&}hJ))E-IA_dD4Rr%N!(m?Qfc!RMdUvNev1Qqb5M?3}VW|4_wXtG!d ztA_5_JIKO=6LAl`0I69p(AE~)l!98GiLn8(#IV7A$rjX#9MYO;;Dgx&FvX5Gg1M|{ zHX;ZdzRo9%G%OJU4eM(K3v8+gO>8sT^*m=04G<`a2gR+tWkV~*NIuO2v9f6aOb`H@ zt#Jh;9Vjh>0B^y4pd!seq&Sgywcj9Xz-SQ}AkKJZNe8TK02&HNOBLf0@SMrC&H_HB zz<(I%CLX-rIu4l7V>!M+i8KtMC5(h*L}~#;7&s%LMjD-t#!v=b0M}!m*xVpZZoW3`qlA}N#ANQf@eihMvV2@XmGplb#SM3~}109i>~Iq93gh?g}GgB-%oii9A7 zthIy+gWN1Fxfw@N1n}4#sJ4~utyEd*j|76&p+wq<3SsG3+m*mz3X3afI{>X&>eR?T z4iowX5V16vfJ_3i5KIBw;5oD2AWfv>K)02M1RqEo0X~^oo-QO9fMxTt5mkh#NUZ;)_A(UIj0(uzhHUjT*E(fK$7 zyo5$jFJKx3(PkUK$4CPmKo}v9ON@OP)Hg`nYpWgah-wd!Uw z^&ksFn$yw{=$A5+(JTPrrR7N0;{pQ$pqKKv3A%HTQ^2H`B4o4K0?5RKf*gW9g(#zm z;ff)@V7!Nhhd=-Y4)Q;U^bjl0iHVT$LFcRq#|XfJ5wu{nnPF5S4ak4t7m0SDOIogw zQW*)pfPxfA=y5v7NCr}XplG2u%EzI_Igx;*DS&<#!U`dj`A}d;T8=Cr`G{f*X`Yj0 zP>te2z9c^5fLjFuvn?4C36xe~egjHp7{lcwb5%4IX_yRi~3}tX}W5HR6fC^Nq29dQIl#(QhRl%rO zEN}%t+7gig=#Hra17r6>orUW)h!f&i28t@EB_OFo6V?TgdLt;k8q`TGj|SI@P=28d zBg`o$ZI_WeWio?F5D{GJv$)EF{_-puax$Po4-96cas}2k*dU6g!NI{6i$!hXfjd&2 zkNXGX3#h`N4iqRM7-j}+n0b}1Wdos->PHpkg`kp3S}TQNVigU&Z5s$xzmSlKegi`X z4Nx0IBO@PYCJQJYb`g-;Sd>IVZUYB!CaKmPjDiwGpap?=S|}*S=$Z&cZ;)4ATZtVb zq1b{vnMCJP@?@e2C7JaQJ)V+}<;)OKHV9WQz)k=bpk2#BeL^7Hm!vjW83pqK{^KB} za~5QvplpVS0m=l@n$sqNC&5Rw0EkGj6*4FjNP|ltL4-OS0s}g27Gam8M44F#unh%> zH5=fOShEQ914md>r3m@?g{@d_v3{8-qCm<-SPvx0ra@Zt1`&cU2eH&yc?0Pm6BhFm zKyJZ!ff~(7*-9h^ixEQ?KnL4G)$)GfgMoM!lnsl6HAEA<7UNSQ5>(aDz@m79H$sU9 zqet;jw4f=N2T)O@Q9NTbi=ct9w~3Lzt`R4F0-Os;T!pg?PA#xz%FJ^Nr3BY8dvxYkaMG+1dDuR&$dH0FksLNDg4W)T%^~ra=~= zz?YIK#1)UrV19}(f`Szq2dUX$5*lo=SHBRCe}AX^A0b=*`ritUYkd^1w%-a|a7_qg z6{MdD>fZDG=$*j*y#x2R@v-mit%7CW+e?VH@9ph-d;8uV^1M~`?R$H0gne&s-`hhC zWZ&E4=SB9ty?t-r^3(^O9@_Wz_Psqkv9j;&TQ>amz5R%5gxM!WG(Dw*7r%Nj~tDz|+aw$6NXTbc!(0{s6*$xWIn6 z;H95%?1u~NhYRe73+#srS{+@pA1<&TE|AVuzH*ihKY)yGIqzpbTwp(3U_V@7KU{zh z%-IhYv^;CDA1=Trt1|v~$MFBp9zb}meBaY4(8cq~gsw7~w~v>rCOthhd$9F+gjb1U zFn$fxYv45+|AE&ijmDyh(re;08oef3x}}MV(rA+L&1j7#0l&j_^E+JOk5MnKQ(|Dy 
zLC_XECKmqUFTqRtD1Cx)q5ni}Sa`^|?C=oMBDKQ((Rf^GaF`s%8>@-3Sm%{)O5LD~ zLHfYix>{Q`niTwjfbrfAV+!65ODWXGXNg+Le!V6}8jrM@f_uUY@RE5UI5#!POo=Me nWKt1u%Fu$~jFI$D2cZMhq3RHCB|MM8k7Rywm&wxLA4>lPWZ^x$ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..267e78385aa2df9afa5ad324b6090abc65a6912d GIT binary patch literal 3068 zcmeHJU2GIp6u!GGYZrVOL`@AE!v;$(w4Rmamqs6uq~w)C{zP?XK!bQ&dyBd zr)%p2Ubx#!+mwyv0QF_?+Op+i{+J=%~>8 zMYnY96?fnF)mfi?iRbJb{AJViM(yS2e|zuF+(*!2RE4(PI4!hy_Pm;mM_%hGs@}N! zT+gohy}f4!viskNbvSSEu)nM*aAEfQKb9CzleH($duI6Eo2O2_uy4+h z+s*|WD)t;|Us&|Rm1M;`JN6vkb-Lu1aOcgp=+`T}^Xfi2TpjC~7r*uJ#{+M-1W&#vp;ydxAStid{*#OOZlQJKfgGmzqNAk zldhaoEl!8**-E20aZqSnfu}2{dgvpvZrV%KwwY3QVSHtP7N?I)siTO6-l{6k_1CXb4y)hN%$=H<=n1 zbzD+bP*RQ!F-3KpCR&PW!}|jV65<0uE7jm9P+}TllPTeZX6+W{HA;b9QJTjh3URHe zL99B2I`Z#yK~Yu9V~)y@1vqLZG^Mqf;Nl7#4u`R$Zwx35Deu1M^|QO;^<5i^#8S> z|MTehwlh9gD8u*Nj9CZ;g8odlu5QJe%Hfmb-|Q^1WC72Dth5@CN06_2;}M3Z&(*wsh4}XswSx?4dmZUchfmL`uW!)|7_1 zw7m-Yl@_l}4#t6REnAfywreYG7(fJjpo~C=S6Q~k8W7-Xh)B&1k($=TW2=pX?LW^( zZ9Z*Ab4(47jbd%8_xTj?;P%JFsDFc^j4ttIwch50n#t|Pjl@Z;5h{{%m9Hn zNRWg%h%mb^f+SF&DotmwhftZyhNh&Zc}UR;)ehtIz*79ySESPK{RMg zuYgQ+EqUVihQ}lSIB@&^*rf@Ts86DLFIs`6{Dn);;B`PnKtWIeAV;PGnmp{Cy8RB% z#1_S$grE}L(2f7viQh&n1@Bt`&?UFNY zm$ojtt(D>tZiIZui}NNCUsxY9MV+9TgHCXD?T)g&!64fS0)JwT-C%g;Uz&yGTp_ol O8wNhkqX5vrFZl)Zqj*Ab4(K7jbd%8_svn^E-#6&YV1r(OZGmYLrk|07$*$KF#%afM*G2K!!NX zK@teeLxkA_5hQ^ERcR`XJ%q|sHZ*0L*A(fyQiaqnQA~ys@go2c9U@55-P?!tAR08L zmq8}FmOSx$!}iE;2X5aVyELH^^+{CkMJuqBzi8=cybh=cXb2JjYNX4c$-~~M+wbs9 zY)Skn2rAJH-T1Ga_-({e@V-R=-Ov)uarT>Q(TduukmD$M4AI~QvBTJe*&f?q%wv@> z#cZ3gU7oBkw#6C!DWg~Ap+j!)h3xVEO+E+F;KdQDm9R1z8a|_V3CTY17WQshMQ7Y9 zZeMhptA%6S2=!1G=S?ELus&3ZIzclBo#5))9c6lhL8cP~{=^)+!SKw#H1jLje0Etk N4E#8c0ze&q$uINqc(?!n literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/metadata.json new file mode 100644 index 000000000..a0b26f931 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", 
+ "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 4.5, + "gpus": 0, + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50 GIT binary patch literal 3984 zcmc&%XH-r&>{ns3qCWSl>YEfiY;u0EYF)g*Cg|rUM zp*e<5V8I)dNqK|X03aBJnN^4|Q9#Jsks(dW9D$=Flk%gZl4It?N2SNpSR#}(B1tQf z$IE$tK9>Ya!~>Ct=d8nMv7|e&v8EDQ z|F^8Ayg`YSNBpWIVJi9KEhE&j&{Tp6BJV7g_mJ;Ou@J?i%*#xQP0pAT6O|DqN?;BN zF>~gnBxWWhYl_=|B`=Zu8X-gszsT=vGQob%EB8+bGY?0g!fVI+c`f1aNk1{`(j&s<%y90=+$>yWbCau% zbfW6H-%*LS;lz1Jhe;*#;N!>M)X?RN!RoV<9Nn7;J^z^*wy6DJ`@s}Kf3XmP`_>|} zq851M*Mu^L@=?qgXX>_%E!rI61pGohpe!P>xl1)Q?_31sdr}Jd)gdU~ih|MC^r?dv z-lD_9E)Z6Sw-8HidBNvtGjN@s7Yf%a5%kelsDB>!r=qG7bt(=Y75gm=L!V@IVq^IT 
z?i|@{LEjY_p>SAA9!iPC*)P1%)z{s+73SW=g~{GDzqmFkdcsP2{phL~vrNvJhJDO9YeH4G$ZHD(qT&Yy8Px02~W}VP2 zN-9xW0cKWhOYy@6lb~@w`!8k5Ih8N^q&#i5s$Ig-(*zXGG0| zG3e##BvST8Cj?H(h3Surh>Ks>qPD0P5El=q-cp1b+QabO^>c@~4c&yxAJ}8TPnRL~ z)fM#e-a-=ZUn1*vxlC>e97$;v84}TbF&M25;>Gn?!jnf00_#o;X99O|^P-}SD<_+S z!~Jh5J%_ERoNJE_3^o(o)lb2D*K)ATJ$B+qPVXy73qAsjY2<<>^Kp9S3)w{j}ReYljUdow!7~`kIkPz95O$ z6Di$}@+thasjE?xlFPT8`W;&KZ=ZPw3_1Ba-LRA5LvK z*^By$Q^_PfVQL1-r*CGM0g?}+O-d&WqKP3 z*JB6^zL8S~=d1D5w!>lwZ4x;CDCW$oR-wAl{b+sTTHd=k>UxF9w2>rB@R z=~X-{x?b?x-$;^ey_94JiJ>x%sWGz=Z~leQddj#kvhX^wc!f$&7}rQ(t4C?fM<-_ zGRCJ!ZTph(cX1En5A^jab=5kaTNqarjLs^1HITv`lscdfRPFEqkIdkA{z^QuF3;;J1@hu?=_2{{i@x{Lf`9_AxCGO%Ax&kSVf#j>}o9Z zyE%6Dsipl`qPk@p-Lk>RZM&+&e*WcMQ&vj+Z;xAW{YYffcK1ryPddEfG-`7t_1Mj-#P$FaZ;=r$zt3^gGtEAKJ zxv!(GXEzs5``UAp(G)c)3%c)}Z{}a>UJ~@ccl#LY$kVH2!4FkC9Fw}6OM)K-lulXp zhSZd)w%erU$g*)2O}JPA1#c}h(ggnScL4R5mX57}7$ zvzZ}{2N03>`!Ir^l#tA#qu6vZF(EyJ8=Dx*LJS5^)C(l-BPC@f@c!%wHu^1=>w-|gBjo2{+=o#2peDofwlk#q91{FZc8`NR((I0Oa2frdctU-DQ zM*kH&(tZb)5@VC&G6v9JFOob&@&XHay*UGr{^t7T2KZ^y4P-ZAo{f~k0-vmbgFT*SUyCzApZnQ)w=Sj|w5(CH#r9wqe(A3aL1A`?k zBW+l6R45e+#Wd#2lAl76tx(1(JQa#Kg?G)5#Y`8ZrSYW!7hwzg+j$H%5av|$Q8CMzcAW9z%Dv8Bs0cy;==H-pm&qC%-nGqH0m=YcqkP;E5DV>QoZJ<8i>F?*mG>f6%<-@j8nHu92lEDfTJ}OpU zqm(@uf0~b`-%PU^8f?8n;m#ht64@T4MtDwK5N*GR>B%@&iD_~++kYm-fk$EPlp0GW zbEY~*9;9gUDc#>x(6Gu}gYtQ&+OxdRy&;+SzYE;^X unXwtx^HNi-qten+vQ4vM)6x@Cl1=TL#@dawm8lE?-hJRC2@uHqFaB>f=8;VR literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4 GIT binary patch literal 4763 zcmc&&d036<`@f&(ec!XEsFQP&igZN#YK!TV_K<89(P>Z1NsDX^MRiKH2q6?j$Rssi zDpYnuri3Cf_GQL0Wl-VwzNcx1S^oH4*YEm$pX;3GeU|&aKlkT;-gEVQ@{)R62uY!r 
zaJr|ECxih2hC3F2a%Ycj(%o2op`t81$mz2car&$PfNzNDUVsQfVT~mZk+zT&+W<>VEaC>rVuI&| z%i_a@SW}?cj5OPjWRZmPZ?VM2bh04CKzC~N1Av|f2wgqtB$(vQNd#O3EH0Y_z6u8f zDjd6hv=!4-dmoJJS80nleLEr!`rSs8rtTGn5J?LOS;HEs2HjQ%Yzc5Lt+x< z1j`Vtc21QDYTpj1>*uEcyL_+oofx?+1U{gG*SZdig&A<<9$>$Mgr>v8fID0d)HUp(alRU?^RtD>EJv8sG9TtF(gkj=I^3LG z3VZq`fsR=@x>Bl*^iLcBjVxEtKXC}olJ?(+~-T;{{A{_0R)Hw4K1iC|vQ0cmdfF!^u+h!0r6ZfpqogDN2T^I>=dg39a>AjxLIp{+8w zcvlt9U!&mKPIKg5QUJr^e}L|#&Y=I)44MOT;O?gwSd=gewx;cY&kN7N7MK07%Y6de z--c25@+V-V7z{&8E#dW>R4BeV9EQaVK%KjPLt4R)P=XiHuY~^!?dO-1zVlqH$7Yy`oCA{`@qL$Gc$R{4B(IVBmXpP$lu(aQ^fUY6F^ZHiaV+)bZydk$SI$RDs?*` zR`ob4O`8Zh6Fs2j>R`fPV}`@9R|YAoRACcfs?2pj|XgGb!~`;$7fSvV6+@> z-!mKUtgt}U!>fpri=WY)%TI_2r-rcg(-z``11}Qa`rA^*2iFr(=Dx%Qh=;LxL9qOp zD>Y!nVlX~d!|Hz%p^o1O09g^3e>0xYS}cHq|Adk~!UpO;M<-FTBT;IF2TL_PB7M-l zg>6_zGL*eD!VvW|hzY6HGV*}jA1A$YLD#z5`@c#(&rNR(;C_dGQQ0);EZhBCRl&B; z_fUVsJY=D0y!}8X9a)ST6xJ$gPewx07;n~s z`_9N`XD*tzYCEiL)rI1<#-y&-5RTb~bv*v*YCPZX4Z7z(iZkra6BN!f=3OqgWzSrn zqQHjk0&(b8-_=J5mMJHTmEs)gX0MZXqss zWQloiuRzF$2K4@ZB#DcbirXh#A=i89Q~WL3L|{)aM(N(1u%{#8#gjVT=>g~Al-CZn zLKdjAf821ediV>aZnXgwur2YRLF)*1`YUkVu>wY>9b|bX%;Cg#e9QHEK8Am0=<U-s*sUjswc;JpG3i#i>r4-k~35BO0`%=n&5PWlv44ya;x4RgkifzuSCR0;L zPR9*`)2hW~-#<#+EN#G=N93Hr`-?#PR~7Ipyi8nbHDkpSmnnO91M<*Sk}&Vsq}5t5 zo|_q)j?NNQ?2%JmqU0C1Ab*oHF{(#SYmwr(7%+N`Ex*H}sQpJ57T zo0716>O3^bEs-a=eFq-uEF2y?tpX=xl(IuN>rzgkE?BluMjl=z6kOFG&z>0KMqcL@ zv1329q8rCoVC^ln8n(@AU_n!vp1Xk+66KvEPEIc(e6yaS>4Qnmgpa<|`kLpcXKO4O zt*!&%j{8vLlp{pg86dViFoi2i^0ALai{Sf#ahz}bw2*HU7n)Q{P_lY9VSgB5-p>+h z(1miGEPggXVe**USRR6d$~{r_h>vJ>eI_Ss`hL9dbR_F8rG*1|`oxvA1h|oZ05wJg zg3VP=^48;#_^$04bWNO!FQtn4Sz2}!Z?}T2m*d~B{$VKMYgD0Bl?b?2aG%)B(p5>* z&_cNh8;NCm@1RTZTUbVw-e4c_fK?!<;O~8u2gW6%__cQ0V8xq>p1t`MM|bfdX@ZL8 zvQ0wN`tAk(JU5*?sa-(4IUkD#-`kFKr#awOe+junV-jnRh7$_&{+6Pz7PSK5dV#U9 zmiOnSh-6AHBAGv0blc6+)1e>4rymgtrJ!>*bBn3vk=*5gIU@VrD^du~07IYJ-6<>C zYC~PO+83mz@kN&5Ewu$JGgS2)6e4Nis!Vl5@BKb?g=wn?ng?FkD&3R5X3*%kr!94R zGS=zZr*gy&dleh>U2_b5>-T1E9PU-E#=qFCuaBHrI_E^H(Bp=&UuD#t`dp8jrgOcPeCVQ# 
zx0cDyWQoV68|RgWT;4WjDa&MAMMU!f-|BRe?cYbYRW0sjneI5Y;NhjbaT%uhRf#{} zI=+-`wzFn&=kwO;46|J)lY6>8bhFKO*QStsEjxvIL4BHF(AZ@hi^4OR8b*FK3X47G z)@YAg(!;Ubdtt*ck8O6DmPMCxjb~Ra<60G8&a;lZRg-D8uOZ)VX;%-|djF3F6IW}E z&$2FQD*6lh;A=FxO>!T4d=`zqPl>K}fJWcrKE4lq2k&e2k|jMpHhWzU&N}iH`avan zb5{I8CHh0&U(lCy@c!h^pr5o&X4~$pDGPkj+KV34{h`}MbAD?|=ojdAdm>fSIwP7( zW_%ZV=hoZEmLnloi~Vo^7TtC{@zvuyx8KR{)upKppWF6+!NZHWW?}7ZAL5@j7EReV z_fA*h&#jg6C+&B-lU_?YEwqKWR|mL$eR<|aN5n9iAS9lQiil5OheU-i7i;$JXI==FMnMBcvTJ%PX0q3aH<7BA+iNiY!1i}mjgkxK$!RYt^XDC^Q7C~ z1p@7+Y$8sd4I>c#zDgT?Ef8vK{!JjL{vj8lLSn)adi4)25k4Wwerw4|D_SD_J^sI- z{u6I;whf3_@j>As(J~=P%OnH*h1xRE_B^C+PYWin!J@B=FJ$!^f308`yFZUbh=nXq zJI0$7DUXOzWSBpO89a<{&WI6|5W~$tm znltmrK+DM%fwN{N20M?5^!4!$XZEAL==>JXr#odlM`aK?3A**hQx2KEul(cvX4%99 zyM#$)jK30|rb9R1zPa~|8O+?B>3XRvQ}4|ul3sI2kj=7@`}$0l`}ruT(|A*R`MFR3 zIv=`Nbo(M7rj!n`!7eism<_3$Cu6TPO6tR(>ZYtWU2MAbm0l{H$P8Vgm>R_TIggni zXt{{4i8W(H7deTkKb>Olq0nm%N>4g-njO9GD|JZfJtt+dj2CSqRaUasNlCJIT}jG-rOb2Pn{AkvGK#Vtq-NiYeDjWv-RKi7BbssI20 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ad59ee31cdea4b0c0bd77befe4a698181186db33 GIT binary patch literal 3695 zcmc&%d011&7C$rh-YgLI+!zoQMFJ|wq99sC5(EUJBA^f`#jpsJEwYHE8W4-J3gU)! 
z0|mvUR4LdBwjhYM*lIgxYM-s5zPU-U^|k%uec$_D&qrqF%$)6aW|DIV zmWQ~qH0#4oSF!>Y%mHkcR#63GKPaA$mVNuf$Mstt<&7R5b!y9$`%7=R4ZGP}^kW{IcLt z6zyL*8#0^1a7)rbx_r-bc-%A*ed1JOC>S*pc8%OcFRwJo;2pj{h=Wjq7eG-4X{v-6W{pZ*%<1T;6nN!fS{4(mTyNBF{Xz;d-c=)-k zo)S8EGkZ6kfu_zUXcQ>%lw&SvPoNQ++jRs!mSv*uf(G?KFcL%}|5X z;a8X~UH3ukgqZD5g0N^JrgGSJ2(bMLK6ATnaQn_AMhr&axwHd*+8YQ)DnCda?u75Z z(FP^G9n@E$-cWpWA%57-ptG}1fR~pY-8$kSw1^ku_ll;&@aZ$*$mwB}c=c0cx8egZ z@p}s2#54irEwUMq`Ii6PazAYY>I>YJ2M#bg15l&qa!@C3kW^@7kO;z z25^}+m9lzE1fc^Z$Zlf~+?v^qvdx7kzQc>T$qm|R6^$zS6| z->t;=7Ut8fvx1S}mKWBT97aRZGH~xjXI>WdIpY&#NAEvPQ!WqJSzWA~DlE~gL_Jg+ zf9#y^(6W1%VCy=6YS@V9z)Qc+Uq0$MyqA)R4^+N}+BTRPd7Wux%;t@Up1xe1F*gRy z3d}K({q1YGK76sgCbR`-uX>NaaKi}3Z(#sdFILk{@35lNwp01j69Vb4gbjSn&ll18 z!v)xSW2e~TTm>xZJ75!JHvvgE9j87CYoJu6w^7(ITHyUm#Z(=+iw0^mbehF*Nb=o{ zQsx|_5|073@!Rq6Mc!5%;o2`cG$d0nEz%09QiafGvIi}*D5rdy5H|RSj2ZfAGhSwS z$Gq6-rtnO20*-4|qV_S*P+3=rpfs!zFYZp^U1h9rtbr}{MNu}K-?|r_Nsa~g(@Of% z&9V5Z$1!x)vJm$cS{jyGc`*jH#eAFUXydNy3z4C?4HX(C!`ZsOQyX|AjEcloXnpot zYH9t~s5fgPZ(M5#_$>IAS0`#QtiQ1d9QRBz?DVpR2?q1go$r6ZX-^Cx*V{<4bRCN> zKE8)vys=U^>xzi_{)7e%yS5pP2vy*V(K0$uJd3we?1vIVw9M1VjG0=&7cMw> zMttSP37+uT1!3xS3BHD93fd-4l1=WY z8(T6_&%Z<{kAVkegsBzL!hFC2sZVXea!O$C5Ybt?d8P|wWU{YQ7MsvO}{D6mx27q79MQSG4W zsxMhNf)~s$~HPL=qk4K-BrG6LQ-F&s(V+(=1Fjk^Dfkiio1PQ z3$?WGZhtLES8pYrE~{9r^>X{lZA0JAysgjzJRoArh)pA{%TL7K$?e1*SXnfhX-)mB zh^ZY!<{dXJ(4u!4?d$sdRg@{Or)KYnoI?1z;x z)ABj@k19j#3f%I6W~}#)`8PeAJ&D=U$73VUG1})p;(z^!Wi2DES*$PkGhVHb!nEXcuJ|Wp z(W%K<+5Cjm1g?6LQdujK+_sUNw~}>9Wmj~a#cyBQAJ#)3*7IfSB1pi%Am}Hh|33FH z3EkFD+&g}yX^{&0xpGcODj1aEO8@V7iO4UF48~NLvF!TjKdbyEFZ+Phg!II0eRFdc zNp2!ppRMfqC{p`>Z~jl5{t;Hoa(9sOvf`2w($p+XdLTRen^wxsTDjlChx7#Cj-{_A zoaO0-Uu{h|xnHk>#Vk+h#c9)0-L-J-7A77BugA`c*tg;A>m2_lSdbW7T3agr$9>*k z)Mx!)9`9fFk|YOY5@VS_veHHjU}-HC$wNZsgip~wENLz6%#piXA(zYNk`G5Sv) zB2n%ymnX`76LotEA~1_v`O4*i+)VhvnQ*#g7H5GoB%hNv&e_?MEc|BA;CSC$ZuH{@ zVp=X&a)Uy}vAT1)3&*F#y3KZt4WFMA?>{+36%mreou>tp_}+Sx3_5$hIwK?qLVA0; zC3p6UepY0-dwP68qFl}C>)aDKgj9ok$h>(RAH)rM=(^DL@uZMFMYcNJJwp{SJ0mhe 
z2c7tvtH&1<_G&yNS%iKQ50^@XCO%+(Hg_NoRC4k4@tlJWjlZl)VNpdcie-fhpQplb{r%56w=*YQBuE^EjlP*|Ji^#}znba%k zKdefHT${xSnOVsh={Bw& W<6XzQSSqanUjD%m8X%bbtNCwBcqY97 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fb2a0b13d44e39718519f4b481204d15e4dbf2ba GIT binary patch literal 3684 zcmc&%c~}$I7C(0~NhX0nSSBWrR@N95WfNPa3lks-ii*_A;(}q3ogjd4Z)2? zsen}$#SN@Jwbr;C)A^E+HjNDgrF3er64Q#h!5Muw)7vq=YTnZw}Tgh%L;ox@>`QjP{6 z_5kvZH&Dj05Hz2JOFKP4G{^-U9E0FuniN*Yj)e5(!7#2f4W`Z+2%_~4@cH;A*kV%v zFMIDr=bMzs{YWD?F4u$mk@JwfZvH z#SFQzgm^YxOl0d)C zy8%OsUIMo!Ke&5uA#D465O`$SqrU2YBd5gcC{K+bJIftv_Rl2EbO(Qb^Ekd?PlmL1 zdz0hX^k`H!`%B_wm7DN`R5dbpD0qzjdt{?=8d30*4qg8Bn%$2JPm7jZoFUqUevqtl zJ}DeyvbC;!+>Pwi70B1IR3MJJLuFp0MDJZWL^ZzUiVmOkA*O7y66FjZENHQ9BEGoh zNxUE8XqRoZo7gb@78O;Z;b+Ke!P9Gwa5jH7?0LL_m@>CjxWtw(XnF86S+M6F)KN4- zBE2yL3Pwc=X7y-M^atxv+~SQ;e03mfEA}D>Mz~VmrK`nO#}5%{vG-8-m{%x|uWq0e zv6uMV-jTw|B@3jYLvIl+*N38qM>9#qCtpIu>xB^c(Wm^g-yTF44L?Ej(RI9ILnNs6 zQZ#YNHo?JGR7=!c_anp)&O_4A9q3_CI!SDutGM>Yd9ozJoo`j9aGPhtUt|q7Jd)SG{fh8G6!2$gM#J|lM&ZSh&*0E2?ZlDNi@e=$xY&BG zDkJ?ScJXQl>3JRI1Jcp^ilv)nLHy&|M1gZa5xNrkHs82Wub6bJjF+4OsC&f>n_&B5 zf>@hL4EvG*ZJypb&|fF|?qIDr#-TzSrd|xqis^De+cyrx*&-49828K;Dgn!2mGSTDZHZaSmDPD>cRTWQ)FVu0OFcXjan+T zw!;%3cY?)}G_x*Wy7j-9dTL-P9lNa*_>0 zPO?8ZOjUdr1)g}1E+7=ihdvv2x0@nG^MIh_LDlmY@F;tA^s(v%i-b})eR)vL!a^&# zUrOh(nng=(T{J~X^0tP*5WF584% ze_Zi}oymH%u5sOP6TEhzy>k7|4a4&9bXvSgOjD~kPr0YRui`u{(mnAcGQmANGuhiK zGD+X*!{8()^0O((tejQyJ%?aHD*6-zon}Scy|OFs=3g)Z&!)yeB>Z8s|ok8$`K{X*0KDv3Nz`a$6?!Y$+=yP z=8>+O{C)$E#g>#*`4;kjo0L)0oNT=%EM5OJA?IuMFIAPeDrv$}-u2JdMNwvI7F+6* za>1ly=QDf-kQYv3c&fcUuV zC*ZOl|D=}3J(*uD`&fe`In^)2mi-rZvC#8nzr*9N%D(N>CLkjzD>={7(cr@0hg1jM zRb6hl@V{vPFTDO4WW~w=kPC7XQj#(aG>ID@PvV*W$x|7=!yyPa1Q0;TpU*ojun0fj 
zm#}hw+z3t3g7DW^ZBowN4%=f9*hBv_s^K)mbt72)(|t{-J#yzui{1McsJtjIbg+0%V!12A4h6m1GFlPh@fyrWzTe4@*>F35q z1!N`ak{JW5&$-8NFvaxqk#E1v@?+S+0-X!p5>Gnb)8rYV0*o=yES#0+gbMooPYmPojgyV3KlKa~!Y*|gY!0$xwWHGF z{pR4Dj9`Uuk_*`U;}9*Eg7-9>CXO7V!{;%KroeJfT(GPbmSMO|T9i1DmVFL-Or|y; z`{8i1`q(~JWv|8}338WgL(Z)Hq`YAX+1bMkIXT7xm)S`b$sao9kHMMp`-1XS%(bMT@N9VqmqEhvbIcLuFJIP(Y<@bH}p8HD@ zm_$FCqFreZ74ED}h@9LZqE+8+iZ^?B za0w6uc@t!f)T<1N`NJGY-T800@S+y@S;4R=Sppj#Il!#d3@XmIA#x?b zuz#BZj;v9Fo7NW|k9vVlHw=afsTg^;cmVZ44g!4~gq2WmuE7JW2e^WhFbvM-I>3_D zK$yBP0!Es1VM0+quwL#2m*eZ9+O81#%6FiX^-|>Cx&?#_qrtuPBp7xL0o~6cXtA3B z%ce+S(eJwM!ZSYXD)$CiIUnj)s^RE0dpLZa2j?plD7JPzco>hv^I1{gepe0`(ktNF3mp`h zl3-tTgc>v zz@T%7s3q|*=Jy4tF?%~zUi$+6*)Rm{^Ih3f;1&lp{cEVnE>+->=1C<~KSk!CrPQ6c zgJ_`q6#03KH~3w;1zPGj(YkyOGC5i!OO{QHM?Aitb8sN=Xs?O144K3G7hKm)*Sc+Vq3&e z^(>aK8A6Z+bTcU3e}kj`H+$T@_W@4`wh;7j8~nB<25eIzA+N75@yo;}Sa7V3yf`Kl z%Jxqoo;35&`H_dAV#heNtKSo76iz4VOGiN85pmFRx)13&ZztGCK8Nd*8h{K@ka{`{ z1O;W(-pR}O&z2hDcKclLzTm?9x_UfO+31Iw2Q-njM_-}|C+?7;ZT+p>OQ#cCosW{A zqy_SPwthk8DN@KIV1(dR8BlyLn&((D1AO+h@OxcG=*h1n%rYxM@!@dNWrhtbE*ekl z%^yn$eG#uYPKmxUmXXeBKafTC*N`^nCt}*qpArwI6jQrKCLqC`5W--;7ddP71gO7Z zU!Vh4^ckKHU1Ih6#iZEd7E|oy#&Vr>72=M3aR#AK9z1%8CnZ*tU>YETqfT+YcQ+a`f2o z6DLodK6Ccm`3n~>T{i!4j2_v#SdA7%S`OKHMFL($zI zacgakj-{kbBbAq9G+AZlWwH-Ee$7j)MDv|RYiL;wE!)7+-J6H^&L_JYe>U5&M+{(Q z$pj0&5Z(AwEa=h@3%YbzN_u-08~?$n1k0)BzVBls`&aywm#ND#S$Osc5}iS^BYk9L zeK04z?fpG5-<6=GJOIReV@7tSR!viwjy;JtSlv^qs&fj%#0&w1xOrWFjUlqbyQ)2EcSmq~igswC3biw{zS^r$%KPL+!fkY}%{f~cg{6+l#f4Q-L*-a8% z>Mat;1fpjW{QOCzVjCtgaa2;Uy>9l#)T98NMw!K^ zS$i%#P6ta$*F16TSau%EHkQ;m=oUOv@mjb^ofM!?Nsia2CUdFdc%v=(#eUQs4=xs# zH}J5f3^!<$<4o)Z6Qg4FIiHbByCPP>y zT;xKw{urX=Q1DtfXNi&H?C^dH6JBUJCk`xYg>@LNk`^N_Ny|EydWtrx0LS6-WbLtk zY|0*uMH9rX26g_lf=rWdhQZ*g&d=8ux=zo`H|FSdu6}`o{RS%}Dk;FL542DK3HVRW Fe*i)ig~9*; literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a46cb40fef1eae7a5d0207565c796ee63a4a4fe GIT binary patch literal 4466 zcmc&&c~lffx38-1o_%0IW-C<+Lu zfXF5&TO?qV9RW8mF1Vm^2b1^|OMU3!0g&15RLbs@Auckw^Pp}^Ozy}+TM$xJ?S>=&VL5Y zKa-(DF(Ke>!%>WBFrO&P?gf=i@8A^RhUT`J!cu20T-OL;X!PK-K;|_|d--VAiHs4BrMY;#2YO0y1dt(&gZBdonoKQAOkw`k_O$X0UN; z17=Y?iY+_;1ela4XQG^PFF0qBAV#G5K*Q+*f}5vbF+iN#z7}SJ#if{sy|Ir zyAI*-ujFM5ECQwrzQ8osaX|j575E7MB5|r_ArV*=#VaVU&~T0PhQ(`cqedhe zpl`I6zJ^!>sy-Y*e*bEwZ&4*^wXfge303Rh#nhQx-V;NRFvo+o`o077KC~bDZ_WX` z`*pyv-A1^M`(%bm`W_a$wGplKeF5*e%wl|g=LrmF8L`e+ThW)MCGnVzo~ZWGbok~> zG%mVy8@PXw0zAI`f#`YA0DA*}1KwvcuqHz;Z0_?$eX79|RG9@R|B*Rjy}kfK-(G}o z?nmON=wG6N`4{jscRhlgsf`7Vgdn)ZgAw-B5IlR*%xYEd1ZUlI>8XK1lS<}I1r`r~ zCMH;yDH$2nA<5}1-euS54zWT0cIEn!56R(NX zE8$qGn=idPsRVEqbl@R&pP>Oq16X^|LDhJ163!UDhB5jzne_WL*!7BwNKh5Sm{?K- zv|n<8Z^3!2yWfN+$IcTrF8cWKOE_jaoT1sD{{?fKYzu718t8_Let{dF-2{0V4%kGU z-vMpab9&O04p0#pi%#yD2OAFXxYoUOgxXg#Kzm;TidpOr7dpqY#JBE%hm+P!m3h{p zxUCiR6$f+(`xTBTa7`dyxtY(or1u5gF4P&n$}FPG-uA<5-z6gL%q9)1A9jJ&eJ6EY z^evz;y8}D3qzLob{uC~mh%@Go`VeWShu}z-43C~L3544g!^lNdSXdjtG9SzU7rx0u z-exyA-+mU$Sm3J(eWIA4Pqh?on6MMGsf392lbFytSB*A^2GvuIA2WNaLs4+G8*H35 z3U{<@V{Bhig4VP~((V$PD2SzpT}X)o*Ye6>Z$uEVyyS-8d~AsBTD8F|qGZ&aEMjlh zv?f?bQt7&Tm#efqTmjh{4KSG-0j}iV#}3eRxG5T%aDUuBZ2i$Yuv?x=Q?LC?`v`yly$RsJC1Lc(t-DUun>aM%_6jd>)f={DWGZM+w7idJ7op z90ENpRk+Hb7u3(@c^>~4wh=#3<=?;jICZo_oVv)Q9AoGjkRA5>We9f@;FSs_wL}%1 zxe*|S(B?>D62?$B@NPPiw297};*@2RpPa%Lnup(L%HOnARacrSloV{#cFba~;bvW+;I5<*tMKD-RgQKf5s& zjjLFer;H@;{3%ZvX_Mn0k(dhbLTvTq+q-ec5kv2II zPT-ChAU)@UGt;!2U5Osi2yX5~rX8#-&d?FK5ucoEJ_BJxXucQn4r?&F+TymYN3fgxFEmqz>S5?@#N7r_{wsZB-u6+icWy`v%i+VCmgS&H` zYmQyWnJpi>-BnY3>9EZv)Z4I&?ZlI}jIzKEne z9qayv?vgu|B4mu)P;~dNv!v0-bXQY6Wcp8ZSM^Ww8x^`LEN+a}=OV#oB)(=MemsLD zynpro4^jUPx@e~*5Yptq;i1uie4J$e`skyV2z_n0;}dMi*8pINgrC-*Pa8A-l#f$( 
z|2`Q$;?vx$sol6xc}moa1|lPjpO(HR>0g)fuMGS@!2$tKq%Cs$KYqjc54~Lczue{j z*i8~-m0o@z+f5{Vnt zNPQ?(xh^rK^vEFdFU^9ymd1xT%!%~z_6Voequt5;)~zHPr9WGx6EX=>#{4OV)Y>Qe z<-T5)t3n*ZB!Sd^Wq1+}DL&)6$5&rbx(n5eL06_emQN%(CyfjAvW)TZ{xZhbTM3G5@$EcUTKst9)Gd3vfgB|Nf|G_L}Et`9iylk z$b22%?e=l}o! literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..56934cab8de9fd4346c856340c7d0894b2552537 GIT binary patch literal 4906 zcmc&&cT`l@*5BurDbxYxjx#_6=_pDOupwdSASwdV)W}c-q)Nv!DvE*~qI6LZr3ngx zg`$9>Vtrr(`!gCWF>2H()_3plOyZa1k8iEF-rI|N>fZae_dfToIX^tbUgi`*1yP|M zlmZ2+0A_7I6-xar*H`oX^po_q{-mV8?|Jv*%MX*5mL00-cz_?ZFQv}=`%F3SbZFunPu6C01VzCO^Og%0gERR za)TwYq4Ogoi4hbgP!&8zf?tq97Bb-coj;KY9XL=J&<&670OS>HVIEkn?3n$f2W^iGa@S&#Uwn&qkIT zMvy!UpA-(gVsMcBEjuH{(72G~n6TKS`Js{|iDD!@hzy+{5*M8u6D!O2G$ts)3x_&YAw=V#bvN7<(6$N;;Ngp0{ z8$-xwKH5_m2kXD%gMNND#Qjqrz*G%hC}FVSX(R+5_z0IjRRb%0GF1ADp!_)xmK2H6 z@&pBJp=dI6?l*(eB`%;7GZtRV{DdynkAqxIAu?{$1LBD(aN?$c-3kKw>h*wYs10g- zJLp}=gUx}~5Vg($ruQ#|xoKL!-KGZDX4JthrBoPc+KA5AX(HYB8sM*U1>N@Z5Z`PG zu|Er-O=&J{U7!i;KO6+7f-nf$s{M5{kCLcumTp;uD4`^Ys3hXEJASROw zxh5oZA2CC;Gx&=f8Q z=fa&&v5=PJ2L+kC;B(4J$akrN9qzVpuLwi$mk+|2tZzW4&H|pl$bdc93_&ke6}{T| zD^d@AfRc<5#K-D_^X3_lQ162^MKll_sz1S-dP{V4Y=JySXBzAowTB2&-vN5_^a;;h zuTYEIS?HgbL)6UglOJt*0pI9m4CYrKKn!sfYu|Pl4H=%pEaD!s(|RvJ_sW~-OxY7; zF(Mu-jSGdLmI|C}WJ}f*_CkHfJEQ|1*yIytsM<{t&FX4{c7p_Td_^C&vIU1tX>+hX zZzp(a{Sp)Ib-{OZDuDF?Kh#vw4R^A(!RHML_$brmn9amVWMr-e>+#7(WFC2pD9vQC zyrtjD6~t@EMUT8qmUKM>{#Zorc;}8OSYmiG)e0`UZ=uKHzTCq{6G%QN!o;P=;BAc? 
zC$ENPpDz7XIFc|MYn|qT@)EP~5%UJ{H03)eCh``x=;t-qlLgC( z=INe@GiZawD<454V&br}Lkoxx)CKa;f$5~AA(~fSTgRUk<&P>;Zek-1bl9blM(D!@ z5iTCTl&FcDho!!8L6_d&R(Y0jio2#ah`SFxRotr4&32b6D-?aci&TsXkvTtyCFl2q z6LXuxU3#^ZQ{z1fwVgG?=9I{D6UG^{nw0CX8@KhbWsZE6c=`QU-gl2Uep~FxXkj_% zo20Q*$P{S!oQKU>(#c+F7e1r6O-|G^9@QVlBGojQmTtq4}$eVbg#X z?Ac^OXnBs}nC5JjlRw^yEew2t?z&Il=-nDb5ppJS=NhfqzFSuExUJq;(}OYS?a>%Q zbnzy5PR#_bgO|y(KeV7;$s6!Lx)tvjt%y4N{INde5H)3PIac<-0+V}v9>Rt$pttv; z2yFKf(QVuF#1>CoQa)c35B?B}p*3Ed@Q0(}$zZ44an)0B%(H}@B?%r`HN_Cd-+xM~ zjn6@4Yzs_7V>8ZP^9)=|R>0`YT9!xBTu%H;DcAGyB>59M%azDg3ZR4&cu(zB_G>>N zgX|~6Pfc;`-Yr+4bwUr;p3{r(x7AkG-~lOmU@1$GKV$ zI(g&{fgPE1;sj5_Dhnkh3glws592MR4nMkbN~F=z>a% z3zQFzNZ`~q&%*;kIPpRwv0wm9GK^cmXaH;Gbp2yNs%;c-1ZAstbODk@nGl}`EG0k33Me(?0 z3Qh7A4+?>^_p?f~R~2WxoG*Ix;up+gOc$iuDhifvrO?2eC)ndtk74?41^jGR1EEy< z0<^sCv4MF8#A5z*)-Jvi3ipzd^u44{m3pUOLfuvV{fb2}r56&+j~y-5TjvdiFz>?^WzbX{|e0=g=)zc=(O zcP+Hvy>8nW|Hg=cuHEbNO@q3!L=Jm46poMRs|q;2XJhe%_0^_PXR&6PB$Wk>9JW-Xi%Vmp>TajaR?(S&?7LnIaRDWMNbEkc_x>L>8YVRtq z>cH-rZMA;;gL{gcYV+#m98P$2y}LGl-~5gYu7>m8f`*XOxyEx&?kzkJ{*Q9EV&}S| z=BWP0$Qvi?iVwxbcl7**9?)S}B12c5Q2!_N?2>F6y}F;0>dn4z^N9CD26wU)%y!?9 zfqBZnNNGeh4K>h0Lp2m%t^WdbemetIBn8dt++jb8QledhltNXX6yUi?s?c5RP&sOG z#QjHf7&qIPBB;9TlZA0*&TtWo7gJK9;SwhG+2{jj`J3l6J_50ApY=z8NA&Rd8YZLn zZ(qNBqa0+n66Xpm(|8j5OnH%Y(pz4zW44;HZ&R8|i1S8`Nhz=1s)V}c=(^=urmKdz zZyOVEFlk72fk)x^$g551YT+}sPf7pv)sR|*XPHB`z-lo+(rfp$LhIyr{DnT%-qpU% zi>WBzx;aNuUcaNF{To7h@~oB!Vgj3^hG=vog=nsnGv`p8NUGP>OhcYLtnna4Q3`{U zl+%|yq#~UdFoMB&`s=Io5fPqk=@wGp_{8}z-L3Z&l0IKEM0-5AZ+`P~#1_~T#7mB^ zRWbB!D_9hAYV#g=5U&+XElP^KR5`2Cx4kGi>e@cZsl2Jh zDKR$=EgHPqUYr_t=h#a22-}jhML(R%G4wlHlAbttvCJjkcKhPwC)XOnuN~dKB=y;y z&b1@HEnS-aa2nIS6R?xA^6e6;Ix531)Cu$fBYi<@?IW9&G?OFaat9CL_`=y((I&PkwZuj zmuD0FZ=Ugx`KD>1KmD($m5?!$iIDR}hoK~YUZs`(O}8+bZdEkUjt%-J{2d`t{*9hQ zhsA~`4O6gOOt2a=IBIQh*@~tle}?!Q;J->;w8095ti+IruowwN&_qoi`vc+A*Ywz{ zW=AtNu)>62*PmhyD}T+H7`@+5hQcV8hYe#*2xX_la6*hm_;OnM>a>4t#=i{s|0W9p zRgtF1<3G(t|401qzw$Ew&t4_L+lc~I1CHRW1s_19DO3=9dCl~jJp5(}cM_(ISc&b$ 
zV(~2cVPu+EoGP{t7dwl^;bMny*_u7AFr67Wh{bM9PtU*f^h`EPWIQmA#4{Id!4@;jgZ+GyL!Bo@1^9bKF#9o{bbRRx z=|<+yLFR-Gf|g-_vLUngm3?BMpH*zAOSo9V*vsJQbZ80qGWVK2o0+>a&2Z|n(1+uR zqSx$`Bz{(L0sb@M0{vyF(|EIn^K z5Zk8?pOY+D#){Sv%PKi+BujF5U6y)4jB_%LBg>Pqr~T8W%+U;Mf>1kNlCUT_ENN^= peEe8RLPA`sc1l=6Vq{#bwz>5<^KoV(4^4pI{-B)z@TC7s_AlGe7Q()!s=un9a5mDN zAsIKOznY;L&llyRS>Ml zbh12q95TENf|vf)(zVP!m?)^9AleNdNmp8(5Y)c@)@;q2R@l{HHFURFMvG$}2s2xR zqUnt_!t&9dz=P-9(BS<`%yL}%&{d`t=vs>%`by?Y=}h*Z;_>amm?A$aLtX~#oTdnl z1t!yb-xQ-yryUU#m{PQrXCZ71h^A@_`%BI4Is(I>NP2QxAdKCx3MR~53zjt5fUlQ0 zk=bj56|T#cicOByph@vhVQc6BVejjAVXD|ke7@34Fs5j>nW$zos=D15{(2~rRMa(t z@S*cSRC@_=?q)Tt*SrKNsQ?~!l)@tov8Z+7gf9NwmLRku6Nz730NPiz@Yl9<5^b5L zXz{r~7KQhsOiGkQ;_GAt3nGOn_Z-24yGO*%%?SJuzD`i6N$jz0h&}MQ^@uX}SOx?2 z!4fO0rNj`AAW`tTS-^4L4qB}rE7U!oDGI+o$mCXanV9-q0=f_+ab`!O;6)6eCio2o z538~T^+lIK&44rL`(^dS9v@p%yTv7Bzlf{E&+sB0Z#M!x&!B+G?iW17x;>*qx zXPaE7MM76+i)Gu(xgR5Zu%0R-}=6676Jr>OF zc@k8l=b$}HhrsG$GpTp|eyVG<3pmkWK-m!qFf1fbth#m`-0G2UuNz&3^j}m6CKlUJ z0TY9eCSODDn#)M)dJPo}*M^XnMOy^AS52_-yIDwCa@fM_$HicB!yemEI}a#deUdm7 zwS|aVa1TaVlR}^0;;5p7_u=a=b!4V_50L7=6{e5cOQakJM9EDTaAE3t6zhIP@=f;~ z;jnm17?&Xe4W`@S4D&^V?=FbM4^@=a=}I(1(cZ1l`HrZ*Qj3x*)v%_|Z*bw!`N9QJ z+fe?obovHmi4w)Vhzs-dpmBXUtWQe>o^@*S>K#XP!|OP_q?m)w&QX{wu=J+HrG*09 z<>R^>y)_Yo+CcRNoWwQD9 zYT&eefXQKRCGZfBf$dM9q0E1p0E3TIHhm=nn_fOZZ^jph!dfK6(^EQV-MSXqjP^rK z<5c8Si!geVMF32RoJrv-b%kOUP)5 z9{oMN1`HfDc!>8kBNzmi~n@&xbX=SG>J*c z+KDNtX_L}3GPAOQ4oxO0Yn?Kun|UrH)8|Qbw)xnG!ETEEX9}HUDpoGiDW`For~40A z=*Su2BHh55yuQDUPne}ioSm$d>Yn7x$@+Zmyn-(Z=Py{eXz`MzMa!12Sh=dW;j7Y(Wt%o{`FiWN?d3amR_xlnXK&@cs{P*_`1ZT%nuFgT`r+`AqsNY)IC<*y znX~84U#P9SSl@8z$IFdBU1_>{t@(P(jn<#rZr-|m=kC4s`wt#I`sMMHr_Y|NPFpD% zWNbmfovXMgGcAj2N7`I6BP~}i&}L}43XjWqDJ5%TWE&XOKNwXx-|ODZ%m-_>8{_u2 zF$t3(prCgG$BcLUGMCf9DPl)W?d$3~5HA8o26KCV12+b616Iwl zYHsk0sHwiTy^pC&rA)OAZ{A9v2&#%JDP9F}}|x;eprZ&nv+oFd6mnOODs+ zer|k>XI64h3ajDn^X9QTnBqG2kC7N&RbaG~_mB?pbsa||{_&B^mZh7S7?A$+`XurYOno@Iw~%b*M{2DYbBxI%iEzPPV}|Uz?MgmYrqm?&ad{;-*k5 Q0eJfZ2T1_J@gJUl1J8q)CjbBd literal 0 
HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..842ce2caacab4c58384e71071de8b40ef03e2e4b GIT binary patch literal 3138 zcmc&%c~nzZ8o&3xmlXntyqI7VsuB@dL~inR#F46h%sF$W-%0NF?Z12TeJ8PO zye~sDuFMcMo`53&4nZ3{?e-qKe>b7w(9xn{j~bI1z&J8ePt12<_zYhW0NjOEo=j+! z1^|d$5gus>DWu#76$~x+2Yd&)Oq8L?*QRA_3bPp`rKMY-v=%BOl)_h8k`rHmK>!a9 zcmm+hA==?HLiU!42=)q5CbV*7LaVG4hyCIyHt_GT5ea)U{VZ&EiPn%=q}Sz}(zF_r zMv{XUIoh;LLtc?SpG)KnlwN?!a!T2T6#s?Cm!?-@$CTNrnP!T+gc89&-=?Y$&7l7d zFYpf?h~}lbO+s6D)j3VdO@`ZxucCp9{FMf8gsDR zp}>0>zrnr^UFmT{A4r3FEemQv<@7tkq*W?jo@^_yop72y5vJv9C-y^c%sD3L33rG6 z{Yzn2=u}=?=_rZAZ4Y1$jOWj~83vO#uZ3xgHh}VO7w}QJC*80<%-*MBrP$^~6PlU& z5Z;IyEgW*~Hp~`#iqF*t2ohHkhkZ`Ap5PmX;d>u-ZQ?H7Ir&}Y^oGg##1eV zOX7uD-+F+1w~vXvt|IVR>_&m$<^ig9>`>r$>w8{bzX}*;im>h9e)&Sl+w!n^}sk6Jf1do#dFI_bTeAi$Qw6D4dnns^RpH{R} ziXSDwZABHGy7e5@((S`9q|Whzqul7-=V@xhqczUm)#F43@*Cji-xn+xbQ;v<7NC7A$HK-+2T4%-L0+G!-r!V+85!czV02`WSb60dxHYhN zsBuaIGQC?Tm{IA%3!M>;G{qWv&mzY5{NQndiMmMoqNqk-{J9%;eNuuHRn1NTU#TP>|2;HfEJGbHHEk8$X)pKeIu>!I}l`tY=^lq`>CvxfU5f1 z8??^egpz$P*?!!wKsX`Q8K&fkK!^PfIIr(=DtHe>;_sBa{%7jZJoz2RQm=1B?e#j8 zS+9mo!+wR!jxQCKCDfwg6S@5BJZF?49!#|^Hi529J7IfH2Jk0k4!Z5wR1{bL>Q+%ejwIY}Gzhj|S5^z!x@;XBfAlz+hJ*8|6l4H`FoLNKcenHU-t9uXNeNgX{oCN?g9O2X8{ zq~w&;H~uzldRn?BBU7uJk(He@GdEA4Z!pd(C^Quno8NqE_MEx%=D%ICVBtH97B6|X zbZObLh$Ml&bEAUuC?v_h4zllFE4g|b*cOEm8;i!t}7qN6b!P| z_lVwwQl!txC$(NzNay7gngqH$9Vw!?Vm4FKOODc8j>^MM%H>=ic^=G{mc?F;DGd_H z0l06OUQ+d~5;CDxhE)9*PqBgLtA3f{RaM{qSr5q5<^Dj%FU-u==`{?E`GvHfHv4AD`FpB6M;D*@u|P7)l0gnq}2wP^FM9$7e^7eQw6c$-Ws$ ziACD5z}%GN_-w-0$Kv?jnu!O_UI=Fd2Z70Ak6RL6Fa5&QB>#MEcowT6`kZ^54yF|A zK7Q&{vX3HzC3P-zOFX&wOl8s}`5RJ_CmT|exzw@0I7@y}3BBXNU}1VL9s;Gxs0~jv z5e6HnCi0w++3G(olEWK=jmZi<%T6TA@H~QnF*Pi3X2!@jF(#vk5C+*y@W&xqE(M>d 
zI87Wm&JOcaSe4mwPh7A>3(K$^k`^T{Ny|BxdWt@*2>apkB>LDsRwY+skpwbVqo!b1 tkfDMrWsiGnvO^pskK&o^_N7105SRhIj1S}K< z1;Ij5KoPqE6~P*f2Fr^*v4HyS9byu{B!7Hsz4hK++*9}7zrFW4Gu+>}N!(2-f?7lc zx>AZ1r~#NX9ZhSGjO^{q;M=S}{<%h2IQ;TO8_#zy4!sFJ=h;?d`Yh{Pt<-rbHK#tB zKDawUao`{YlqRKZL`+zhlLTmvn zzF5TdmqrFI43Wl$P?%64e1e3pkX{3wT%;+j*!mpudDPX z-%6ezMv#0n&xBlhMejKIdxi!|fl&eR5kZl03j?KbQl(IO5E{5JASygQB2u1n4<_7= z=@k=t`MBo)1Nd(Xaa4rSj$`+hP4uSYojjs(K{Q)4UjrO2)vC*A(H+ zW?gvPZU_OR1!!+s6m0xP0J=Ht5Oq%%z*rq#Dr1oGb13*7`UqD)RRAk^GL-pRv;tvkuIvE&*|+DHLM{P^eJ{D?eAmLzoKPjudwP+bOVlwgAKj93l0|k7!A}DjXz6 zK}0GSvW!S*KXeF;Ryu&&2Nu+47=m5~3-;$q;p|-%=(s|{l_C@5Qc(hzsy-N#HVj5qnZb*fYhdqneHay~ zhTataiiQS0L~#ZPq9b*{VaqItsrJO`LTZSNicj#i+8iApo2$SXIRo~N*h?%Lx)Vk% z&?Vej}CQ-StOJTI}C47^!A((bQgb3n1*1Y{Fdar*0GmCn{PU^e_?J2j= zxss>IY)CX#6cq^X8+PMd18cG}uM?_UKA@4{icLOcf-0Pq(44j=Xx58CCs%i2DGfMm zPMU{xdDz3xmaj30;lsyb&%bGa^m8ZhYaZ5+*18yb-AJMk+Q#~j84EwPuYvs2HDtlAd00`M zDQeVjz{i(;#OGh=!>vz@VC$qNU^T|q*|G(eWDENl8E)c>pMhAIoEHGA9y^h0tCxY% z(I%Gab%b93iUUaZfXTF}_|RpF;4zSn`evL&acW!?c*2UjVQhj(g9VtrbuAgt;YYeO zi6E=i3mIM=N>-fxjP?yViyQ6RiYNCt!m*gSSmO*wlpULf4_VNQC#l>;5uvxSrO(!3 zPZzHu>SwwkPM;MPt#S+viHO2Vjw~hys7vILLo-QfO*p@FUzK1+m=7vTxP=YZ8_6yT zH9!NG#JFVq3ZgP<0hajI5nbuOqxyVJ2X|fPBJKh7v(h$=cD9R5MKS;L4@lJ@519%w zSv>EjoQOLd?uzb4PNl~P)O6kio3}%O8#B(3Ri{#g-MpiVt(-1UjaE2_WqTGxu2B*@flPp!&)L|#KK?DSZn*$G71vqm@u=mk-vZ5}>ZaEyYS6nqjQDl7&&xMd1z z$mT0qb=H%rbH_n@S0c9P)D6x|=Xjpp&D-!`c!GYkM;#WozKXp#TZ^<`?1)Jdq{O(! 
zT)~~s9W3t0>)dPHJ?!ZBy{P-xYD_byMPPY#6D;jIs2%=5h>7w}<9Fk-NZ$>QkmoRh zWBt*W+}iX64dg}>5$eOCc*=S*Y<3MEd&R8e2PSfpXh}#YjQW zfYvC5!j+Qycs5H*DOE5OZIAm7U$Of(Iv1P68dL8M~zZQHXk z$9Pd&n%Hh{M&9_4u5#a#dp8wKh`xU&&#r89;gqCTk9tp*ZP__(HO3t}y?kqlU7DJq z->LGeQ1JJq&IJxt`SoEvwV^jpR}~zIif%dk zJGyU+{th|1+Jx%Apr`Lhr_n2VDA}mFmu?;Nc*NjNmVwEhn{qHWIT$Gmsi2{H8)&GS zg6`@sP!|gssC*e{R%8wOQIZkO8l()$yJP?_#4^S90=u#i%R?SKro*_^#1uiz@qj#x zYx4$+V7Q!;iS(11)aRlB&Z@UBW_<)=`vJ?30FUY6ixeiK{&&B;eyg&`CI#mT&6D_2 z+f)UyW!yWy|MYZqL$A6d)c}W08j}*V3qsxZ%*eBh{~%c6S>aLPRll4H^QxM6Ea8_ARJcz~ zz}akzWMPC~JzWveJwF< zL&9^R#V{EW<%wj97nA2kkuuKherlWyh!yXZ24E#Qpm@*k#47OsfgMHzibpi?M0)ui zHNfE|yjPX+CyD4-#WF~DkpVTJBzv)un`4oD_@K0Yy{f)fQ|{7$jx8e`v#s)Cg3jld z26s2*#RgyADOoR^njaT>rEE@_S95-R*!2U_j_j!g2@yAsEbZ%VE=Y{Jdm@EB#Ck{4 z(jPlA^}UbpSQguNxx_KYx^Q{?)9W?C*N+z_CqBR1x_-#Cq7}-}V*NnwMm$LVWMz>NrvWEfCTED;_W8^;a` z4`TQNO%7UAg#+fohH-jZ_4W3%8A|l$&-ULkfN09gl~R_!{!ipIDgud!AdIBBP>&La zkRmQ00%!u{{~N1$#C${0LVuBbMdn0|sa!;yFFFhX`s*sK^c%s$7=l-$Ow*sHN&iHD zRDLHz;X#qXaf76<6cc7(dZiQf4o;v6&|e|`0r>CI7iU<2h!qh&2$~w|Vt=BT zx|**0)oo~w2o{*=>ocHOgUVm?Ge+-^)1femnE7REeGE@1!#DtlL`J2r$_4oFQ4|JFm=Ii4g!t6)5 z(eW)?LN{`Mc5)|l5VQ>XlMk7_uk2&}ye%RF9fKuO#$FCjr$dYHm%01gxy;;!X$Di5 zhdvlj7`w<$oyMCpn4gR1*YVKBqUBHVFr~DO4s`U2V>Tqtu8h81 zDe(*b9A|mG>0;CJrSuZXbY|!n&eR~<&tcLMf3qaICgzM1UF1Zj{&a|gheEH}$}Q>0 z>FnrzUx{tv;5o^IWvpl&iM*17M)D*F*X61EMmWUNIPyFhd)hy3${fw0CWy47r7=t6 vgW|>pL`RR6#>7M=Y9|E6#D+#iYMWY)GaY9lcGU#<{STT6`nN0kce#H9l9NZ9 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..84c399e67f3a1f307e88c5325c849ac627b14610 GIT binary patch literal 3138 zcmc&%d010d7C-mC_d)_h5qKeqQBWjqAfSRv5qTgW7-Vt5&>{wifNYTjQ0s!USVXK# zaU1Kxh*n$}rPxtK!37uUhFihbIy1Fe>rAy{zp-^%oqJyU*Ws5-wgSf(Jt~`^NJFA;GKDeH{m<+5?-PSgaJ|; 
zPbsxY1Axo}laWP;!P4d)Z2>w_av3jfWZ;IcA6k(V2!9KFgq3?DnJvd0 zLAJ&P8fseMR{0uuR$0LG@m++2MvmgLR`-;Ad#WD!XEn2%%OnzRKr-$hT+gMQPU1d1 z{s5wzl(6MVG!|M8}=z#kpXmBmUd&|Z_?{O2L>0&QtwKoS|?|cm1!~Y3? zPN@SX$dAeALqJ+s&K{akC3&*S40l>fVc=CYw{hE>cyoh4I^4gJsXhA)y>;#`6WrX# z&a4Qa6G-gwTrWG;9gY?Ac&453H=VBqbJNbnmw zhEdOTgtks!iJ3Xag+zO?doHq!-=ivZXZ0A_QqvOD$~4*yiv12PxPJpSRYfqp+@FFZ z?}6RozNcVsb^+eE;tkZe)=3`Jet_#SWeBuhF2?!sDJUkY&|drbO}O2=sK04a1Gc=k z*KX!ocP@NpBsLTo*t&Ol$BUk0?8X_R*lV)wcBY>?(e*4 zq}*_dIXZbeqp$b|P42}?gMZO;E1SMWKd(2jxh}mSGwdUj9lM{&I0?+UuZF<+d7JPw z{|?7b+zO;)lhjC`BZJG`YtaIiN+zTZVf(*oxt?e0@dDL7=W?Grvi5o-POI0U!vlXo z%Z@LVR!rWBi%w)qZgFayYVXOMFSEe)O}kKgRw@Ks)Uh}248pepPogWTQruRma;Q)T zarRrw?L1Z|cRPN2CUS6XM5Xd9xKjNWX060sUgoMsYb+ly^J{LRHuE~k-~$N|GUF>r zwPS-r&DR^iyLN;_OOOVJ*-u6HzW+PUec}Mc!E(j?Djsz{zK@?xUm}b7(vkW8vf!9-$}8NIDs@jyuiow+efoO#^V0SoFmTXd zZ=WH)e*Qy;4G#z$F>=)CH-g3lj|~w*!^VY2M2?S&o}i1F7#kO#Flq9Xsfp9{NpHTD zJUwNGAvG=CI5Q(NYgTqnZeG4=c7fSaSX4Y`?!33>FIc#!WO3;`@0KljuY766vdZP} zuUNTi^#@gJ)~;K>;ddK9+_ZVi*6QDHtJ%Keqn)+8cJJ9+_lJG^>pyNd@W~$!9%?+? 
zbmVAr%dz7pPPU#pedcW2r{~UJxOl1k@|COCu7B3idE@ghZhm=7`%tOjv9-RB?^-Bj zxmkI%)*H=iPL|nXXUs9uB1$S|3!~_8R@`>h9(2`Kihbn8WWKU2c4`2H;jV3o%Kaz*Jmw|3AgawO#k-&_ z6%!A(JMmHz`ySN3==j&cf`V0PRJ#9hFaM2t?EmG7)&5pblA_#A!D^+7J#MZ5Dvi=n zNJxlH9BqBDlv`O}N&!NsAPDi~OKE~26bqplLWCe>2w@rGStt>Rp{p=Kh@vyu+h(#8 zmu6~#8WLjX4fgdNNfzM~$J2dSF&)F{KuimQjt-%Ex)u3?AKhoC4xQ+qnmDyEJz`Y0 zep*5%<>$r`d~;`!L9`bp8X+JES?!5S%Il(UPD%{OOOMPD3{+oqPr@Ojx9td-kPC&Q@_NpdmGKY?hy6mk|SY7*om zI>Ofrp~cpF62Vd}A|r^Iv?_@~TF=GM^|=v+#E%#!)hG6eD!m%3Bq%*hhJx9JMvG6H p$>d`wD9A7NC^8n9v-0yi`~!#h5Ajp!Gyu;(&_sS$$B};z{{p%vL_q)m literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79a6f24b35cf797574a2db8d32609d38de32a0f9 GIT binary patch literal 5244 zcmc&&c~n!^*5Bt2$xR@TKyFMR%!8mPgMgwq1epazL=hAv3^|k%uTkEa&_Trv#@BQ0*pPNH|2~mVP z(*&JPtAc1f8VmqN+?wBe_9GwOz$1;jeli?I8EGnaxGl>J?%laV^I5}L&hX1)>xTCv zXuRrGH#R6z$X6SzH>Vdh%XJ>pK+9=!2U<)EX%5YiyMrC|eosoh*9HJRSBz=J2q*K7 zmg@o7uEpduA#($cP$uQYC{yERCn_@&X-p!P{E8$iG1~%UOZ}5SnFEsm&=@d{km&%J zc@d%$gP2Izj6w(mJOnt8O8_aLfWtgG%%jUZ0_M>ZP@Y1@j1^IT$NDm-t^deHO1>x4P`(+w@I!g>T+ZrJ4o(+JbAw1W^K>aKUB9Fa+t8e!M zCt)OfIb8-NKMG;d=L)neLl>JX8woW>oZ!NCKd?+00?#MCMVAi`h0Ss)vOjGN#8XG0 z(k8&TViDj@`F9%E=kD^Ou za%6Sp0EpN6gVmW!kY3>msozWBwBAg}pCgA)Umpjbf_R8NWC^WN9N5)Sj=J~=G%Y%C zQ)CFWH5NeZ@CGN%3&;(yf{BNVL3Y3yiZDCaVXT2A?+(L57!B9`Xsm11C|Ey946?8M zAot0AG&f5GM+j?3$>qUj2ND{N9Rr6Yz7X;&2P)L|V5{c9p#mi|-O-1}D4XCJ z7lUdjx~?Ou(|t z1%7WEA%hsq7G>r&@vUhKZ1II1JMymJ{mzZ{g)(S9EqrfevLk z0rpz#C8CXXfpwG(5wg1noeF4%juD%P1GC$8206Ck>jLb-`T9dhA)2u>TTY_Bfi0Lz z+7s@AwpM6ZaSL56eu`WMq+>hN;-K$TDbBMSLmnt>gTvLoB1;IuM%Fu_{Q&|rx%M=i zvCTkr%iFOPr*K%mU%F?m?f^=hkvgvJdd+iS(9)ieSzXoEut{9$0SAn0^FVN)j zfd9CAI4K4Jj97dQemM{Tf(Rc-HXVZfFzXboxp)r0Hf0Rt)y%#TN!PI*KoW5*=i`TzGHXk1YeVJ&KIhj>nec7H 
zO8ESIIr+uz8Q4yZGpZVR3Lp068+>NVBYey`3$9geHdgL;1?hh zMi$1xvTlFUVEICDIC+{Qx`EJ(pK$=?9&j2v8aG;~3sYXNL60WXA(a6S#hvpcD>J5I zq+Jv?aLiFMwlR|2d~_PJziLG8Z+eFg4rsz14sOJ&CitO69e%JiGY=mS)rl|Azk^be zzQN{yzZ!cwXBklu9D=Avo>;nmB^r>Dh83TfL%gP2$rHzdNo9Gmu;gHwc*4Bt=*#R| zn5nHLcW06vdfh6+6~h)22hyUjoR@y+%B$O=A67Q~joxI)OQ znu2K_4o1J6O(A5LZ$ZfDTnIgWm2AF$3biR;!t}HGc=aFws%f8&wd=hnsl;)gC6 zzxNWv_qC#5y5`s1yyPYOp? ztQYQ*j3euO<2c4{dFZ;&R5I-ge_2@fHasB_(A`g?^~M{l$FR>*upzfF@KyQijvD62 zdvL0RA7NO?pDmlkZyM!KlioAU9@YrSP4)G{%uESCMdu{mt@fe^ovjrLhj~-?(AS(u zn+e#9WD<0~>Lv58B;t8Nk=%HnwOj4dkne5vG@}C3{S<-@E>~!R0;w|L3KpW!FWdeWYL5ezyNAuRCuSQBQE^pAJ z@6nQ{H(^JXS0=oULZ0&m2EYnXM#-RuzbEziZ_1-rV3_g`!cPkn<*$ z;|cYEzgkiU=N|c?onI8_A0LoG9ju7LBa(U0uD>5GF;wH@4kL{Joh@m6;V3q#?VuRK z>-cUxVovPQAXH`l27Ow)hFTl856iBb$GJlqVKICw{8Fw8uJ1U2+LB^m&zlh4o8Jz? z?zq>ZE3%c?#g#IhwML#KzbKDuzBx*-_Q4#aBR+*z3XqCdXQ5p3zD z1p7~wkxCtN7pBACevQy^5_d1+j}^1LjuwD`PH zvgY1-1uYYCahYV>cz1w&KInAx45*p-ftQ9wYSwJ*Q;T%m)ttyc3*P8rMiDQ5yjtxZ7G$>ehv20K*OjgPs3BTZo;v8WRt9o+D+76~ z)%80T@>v{G+nv-RzG!8#w~$&_?`~uO4KJH5M{4Vz`te*YTe?V*Oil4d7gHEp0uw&uRvc%#w?nX&_3bT#o?%#jhXkJ<6&qm4RP431i6`k3} zsg>QwjnmHb{cN0G!y7K2-yqA8XEfQLkY`@o;bm7Co9Mg}w_n$Cb^sMrw>Eh>2!_AiTcIKEZ?dv{aw(L#cOXjuVk1$_B8Z9ti zX`=m1B?Zdgf1s3sZ)1q|7{YeDhn&XxiJbq3G>4axl**F2_)H=>DO1IbPmX6P5W{z> zMUv7HlDp2fN9=4*b6J-6=WFI~NzHOZ&kAC$C6?WdqpEa)L_|DBGW@PXKoLsB6GALQ z>|z*VH~TlX_L#k~7-2594@jz%HP?!idauJ0yT7h7O1}|Tj3KVB*Yz1%_uBKHXr2D= zBriEWH9^%+W;K{(D`wj@%J$ol46*ww#6JN4T@7SvH;{5NV-w?3lr+ImUK{LB)Y(QJ zbjWZVQyp-_q#qsx&FNSEkax0re;g5w(VQSp)|!xNkBep37=!Ts*ldg#|K_ZJE%3i5 z3lgzRE(`ik^ZNf0i~g^?)&H|sNpfJUL~Kh*wvP}4$mCL8MQG@x@R9v*mb8d)WW`P4 zrBEm)GY=~h6p9>$SAxPqs%_qrr}j zBbb5DM1OWZK8J06*oJYfPz14!R|GrN?kk+w`MemHiOwd z7S3fFt-tYFCrl8G^!w8e*}V_!Gb6*@Qsev*6iU`!3(us(NW}Yj=+vp~JdkbrQ`d&x zAJ05y%}b>WcT0nTPy__A`dX!& z_xO_owDo3+&B*)GD-_=B&@Y*-L3*UG$J`i~1x!s`StX{(Ic)uz5c>~>S@Y6bGLbXc zG5Zk;ublpK(gw>~F**uuCHsxEN%pU6Q;$gT&0=u0d9wD5f5wzOntn}?nx`u>=4ZvL thQy|)4^d`hq~(}r$7f_FrKOrXyAO39>Ld%21N{C6X9(suIp%lA{{vSbUhDt> literal 0 HcmV?d00001 diff 
--git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e67164596565a48f5cd69702114b2db7228ee7eb GIT binary patch literal 4782 zcmc&&2UJwoy8ibmGiP9E!r%-JMVg2pMX(nJgrTX3QHr2cDbl2xD4-Y+u_Fjb2T_V3 zAZQd&0Rch93o4>fG(=;Iv7oW*J7_uT{nyN{fBXJ@&YJy=lgQbGNASFO zevUjY55@o(H#@%FRrYIjmd%K#?Xi>L&tew-bhaqo+Du9a#WgNb6yAD~oX4)=XLV1rqWIfjO|` z1$>T=cbMPGK<}tP9;Pg-dLjA*{SNeHJd&>mTgCM_^zTqJ;pOm0(F`z(^6E$zrhWaS*s4`2d$cRsbttCLDAXK*>{iShr7v5+k`-uwW*%)*8dv zJTYj78p5+hAJN5{$&fDOqlwMhKs+)6D%=4qHW1KXqYa!18lb3T0e!*pu+x1itWL9n z1%tuhu~r>8nTl|AVKtP>#(|nqJ-SdWL|QGCpp<3{S}hkKqR|Y(eo=;ISr5oqC4{Z- zkAii!KX@I|gkDb;6b{y*VJQR+=cVDgf+DoFjt3%tJ{UKQK(d_{EUYO8L8S>4VS121 zt^wjd)xcet0|R0nHvH`@*s(|n1c$|t{P0H<9HRiWgf@gGb0FP-gpMOez#!fRoZhma zaoa@D-Nu4L+1}87OAfj&k#MQN7}-}8gLc$8c(=|5wC)?hWuJ7o^)U?AM!P_Eaw&X@ z?Sw3GITYH@h2eb|dY5<)CZ(!^X0<6ieU=0Vu1)~$urcUG(XU9w?=FhgLl6p&jWP_2S#@ zCW6VpT?i%4VJ(^8q3AIjDUOYAlwh$oY;?Z^yZS4Iek!b z`YqA~M{MR9V^m=$gBG_nLyK-CYTwY0Z9IX)bt7Z+cC(sK&GvSco#wywfPnn-kM zK^LUgFGG#$Bha7}i&ZDj2etVQ(0ox9*V%Lg$l3?Ox6*9MUIabhMf|@3rz$>~xA&>L-@VTePv$c|Av8u7%c!K9t(x7S=9%Af< zpM@xxnd1wI4{XUX8@>jE@0wW(R}mWd6^C_$VlbXJ2Uq!;3p>^>$BsuX#gq&Y+2kNb z#m80=?|HrCu_Fse@468AlB#MYht;m=VC)S{O;?j$5Tu9R_X=>)l=VbqxF;6(N{lYO z8&Y_Z)Wu2Z^Wq#vk7Y8(b+GLdhpdDt|v3_)A}-~wci!%m-AJWvxN$picFGg5 z&D#L_$yF@JXb&o4B!T1faEA05%|uyp6BlH00`IOGV83($(#vut{M-=E?%Q<*nx=JQ zExY^h+PNBXI@_}d)5X{Ek_onW@B8oMXEyGTFI2W5+im<<0v21_n0Kw%l1$#GE@PN~C^3|6;l8IA9|oBuF2ANv2*3#8VyJ z**zP}fjh60@S8st8xrfGhJ7}2hKn~6)W`r%T~pz(hmYV_t9vo!x^POhybOfTWx&1U zJl=E7h!usOCoSxCh(i|%{M`&v;ac$=&X$N2eT}5T(#g=#ABTCJxlS#xi;>d({wCZ}i=7a$qydZGT+Lp! 
zSDmz8CC0pCy@|<9*-As7x>%eKS20`I1o^bH*sSV=h$|$eT;Gt`;9$^o=9>RhlE`F}9j*u;Q2I@;J zv1^{X#5YO{SfxtVD8M;^q^}yCvb?v_2D~EfA2%g}DLtQHe#B@gWou~B9scoO2z^7s z3o~Z5h)Zx10i*a9MH@Ea)EGV2(?uIMvE?;wvn`5~lBM~kfrF=uH*J>FuuSEPO1|Bq zsOMbn+FFvlb*!<^*=$j1%Jy;7BkvEkmTumuZjnR@Ecd7G*0N34b8FkbWzPhs65AZh zvb4-euJwV}+RCNMVJv%HRTL+WBJ@lFlvbh9n#TkQ8vt~i}xcS(0){_J>+XMb7Gxy&)&GSmKw z{<3P1mL#6TfPs5s$j!D)hpUrUIDPZxEoqQl>wP9oFe}9%r{4ekz8ULShWi?VE?2rW zr5NTO4gJ4I&mD~}lVA{oOCk?3k=qi0x>f1MNq8E6yKT!jaZTRwsQdjD%e!jwo8q1h zcjt-^7qqN<{rtiAU55)#B?2xbw5=^_-9#zsdz`H;Zr?1gWuIqTSJJVSKPC9a*}BrM zof=kYLc99?-Fx(wRC;vRmz~Qp_U*~DJ96Mc?)0dKH@c4;yjWpdX`Qc-z~9qDN2>soAly|+2di@TR7fD2?NI`V1&P{qX~LYw{dU#M zkDDffc9q2Mi=L4xGLa%38m1-!m8R$*_<$0%J8kmCM+;i&SB=OdY6sBCr|u2DzA0EW zk??RpiDZIscQu`G-BH!a3BYwYX6%bTP*oR>(Hn|27Yo;HlFAe$Jlz;wv*bj~Hp4-T(-uyXMP!_eXg5rD&%3KjKKEG8#E_0D2!nT{!v6oP*gPAKg6H8 zUDH?TBChhFq4M5Ix&u18NoW-oFOjnU z$dmkEdz6&#Oi`BArIa_AC;Mt*F0vGfM2qP!BMu@_oX9dj zWFryM0LCdH= z$&lIm!amB~#XQVU93b*$>?Pr8IJCHZo;xpH%FOMVW)!+4_0fD*(`%N|-Y(|hZmtW% z-CZTn>3EAr@!5a%Wj=JVX!%n{-L)Fy_GBf-ho)Bb5w=4eJWfv*wa9l0jPKibeYBErx+GBP|)Bi27M dDkwZm!({4YlgY*cMP%$Vdir5fE zrHC{^nu%hgs$d0IBUaX+f^MSG>{_C6-y4F7o8*uEzWp}$o4NO%dr$kFdp_oehuqVW zAsIWy!JVNQ7z|*za(T^**yrnePfWR&bgVbyQOb~p1?LXQf}d`xo03}~*;L8^qs>Un z7$Kv{@ED$+4VcgaIud%o3IK#7F*hSj^nk1r=`o~qJn%H75z>n14`(;w z!Tv!pkqJ@JDqwGs_!*L2AY{+5-hUDE58*b5nRc-|bA?hN&J+E(pK{8a0dbYHaa~w7 znNj*PysR9DPL0_mqK8j|gToGz3w8Fx$oWR3N8yjC-t|1(n3zSD`FDy;%`Xv~Tt|WB z^#>3|p2sJ597V5^$PrUqGtpCIx%=-lcMzo`UumMD2g$ zhSkPlB7r#$&Ldtyn^ljnw{HTa4eBs){r=5r^QXOEZ!!$T?S)b6}{1Y--TMHVFKf&MqD}k`JAYvFh zpcB%_V+*(Q-ef7^VQUKPK2uHYDV&S<)mftB#`T1C@jGJPg-68H)5G{9QWJ5t`8WLJ z`8HGo|9dLZ!k0J)N|>Az1grX7sKKk3f!Wa$yg}a}^x|J^vjr4^#b;9pon>m^^)?;t zRHhL_=64ZB*{$$sS~FUzaSz207NDr`J9x?W>u{!@4G4Bno%$$Dh8Lfy+`Fk+6c4qErhRDj_@sWiTb*BIu%eIsaa57 zp*=0a2NfsY#zSSp`To)c!nc>Cgxq>1Sr#)NC%tq=SAV)YsKD5TX#3QKp&aHJY}ak& zyCrL=?S9{l2ASj_OYJP4(EAA;b(a>byndW6^BRU8G-^_Fb47xl(8Ws4~T_e0- zY)8e4s&H0Ii>CJ$1r;eN03)-d{6s1ds^9O#b5}I-*J)5Z-SKHCsd^c@lr~OX^N}ee 
zP4eU|x$lI0_U%CaU+#g;T?U|3iiHLq!)WuYOrfaxI9}}c1L}60K##oh2!#pFgcoXT z_;a?b(ToxuCF&lGMz2mqk4UpBF8suP z@MfIn73OWzJI) z7brV7Bl6H?lCXHOU8gI5iXc699cm@&`KB|!M_)X-346CY5&8x{126h1f6eeSP!SP_ zt1>@D^*c4yZ99%ogFYV%t({3YW|lvi?wTN!-MRxkLlcc-z3OoM`U?J{od%TSB4-?s z7(iBj$*5f(F@^6C;!0i<6!BwUccJUYR%5;F25psB z5fW+FqZIXUxSD^T*vT_cPu12zJL0zyD-Ya3?aFN4=p&wBxA0qDzFM8=z=IqxE1e)} zu+;-=;T+WW!*d+Ge2YQzs`W08LM9%SI9z+LlYh0I&N8PiSw`ddkIrI(OgBe7#IWhiAA;^!*}c~bCfBP(>%q+k_b zT4I;CdJRDjHt}i5TeFt0IoxHJU4BZcNMaRsqalCodJRKGnnYgkWxAG$XNgZ^LF&dK z7J=t>$qUzQ(w!LBd!w;%eWrn33N2L}NXr`Gvem@5=|K86V~+xt97WNF9ix3}!n&G@ zHfEbIY)X^bAI!+H4(lxOZ9ceZ&jjdhk=o^u6Xsl@7CGF`%kX|La66bmDjkI)-s)Ut}4HI zz2*qFT$k#C){PSD#kbE@7oN>DwBMlTT63Unn~7K1ytbO6^Vt?b?YXYC2QTiKsC<08 zt+x2`e!I0oeYd)jjzX7hrt{nDO0O1s6uRwoJ5qM7+^2T&o%SQ;-yB}pyg`53SBJW4 z0Y86*)HK+qQLI2+( z878hWkk#wUE~)6(Z(HgQ$%7p;@dLl3LoiSXZm>Kl5OFOC@U(y7c+yAAPD0;MH7S6E z9?;=<((g}M2fwgAtYUkWqyCI0X?!3}ks;Bc@%;?&F%mySvRo6{Ib)V6{oehb1pOnd z(v0yS;VFZ{LZSi~k|l~p_&0phNYDL{mK{qOVLX<6TyTchulzBOSzuNpeM~+(|AEmD`7^&J?V|bZ%!a zm%DN+D-W!!q}nPu51b?U%;lra%_p)O#~Ci1Y@fuSq-mHFnZnHnmhb8q5I2I9|hRn?m9tT)|AS5P$f(rqWT|uQ_Nl?~+2r5f!l}$j_1_Wz0EEbB2xD*7d zsGuk=c$MO<1*{70A_^*kMas2Ww_00q7xm5wm$rUw|M;Hgd%io#nVEOyeShyVlPA9< zP!Z%Hrp1BcnJTf47={8^ZgM&lWX{bGo_gdJ#2!z}nfWUJ;ZCpEnCX+;UitY5M9Oh* z&TsJPSsf|{u|#ZSE!HD1j+kTO43@mXJt=Qc8vq2BnDlyxqM%Y=eiNC9X_*sn^kh2PFhRvp4%?4vTLPQ`HTgJj$4pKpTmbs4xtwoE!Z*T9(Pe&E1bybMrSMTBgbK> zcymfLyr|z!@hzt?yEnAKzN0VEC{SUSc~fD;R6l6EFq|?h zdWFoFeFeJHUctZS?gPrzo=OqB18-pg{mr~$&darF@Z*VG`10fdX5+RG@a8%P)L>Ch zjoe8BE1S2M?%Im-Q+Nwg26PD2{FY>0xayWY&u>`%b@P$NhG zDnflv$z_YE0{i#fD5Fn=5Zu2UnU|h`+kVX`eJCGAA8}=_+1aDj;m*J>F#yJL4z`$5 z%S4?DXM7u_u&#DCT4&9`*vm%Dt}}1Zo?&Mw>pg3zPdmNf8}%&Q;OB)ZzMfC_i(8qm zzw&1y4kYST?x`{KO9(|fGrI8z`6%w@cuUmZDx(zRm(aUY=HSd{Ug*-R9{q>8r}+78 z^Z5Tj4@AYrC%C>EUE!B+dy&572IOG4mZKALpO@6b<1guG;O!0`i5kz@;}13o_-fm+ zoI2ea{9TVZUOLrKKUJ_Fum9*SFJz69Nt9NCx%DD$29p5?-mb?Ve0q$Vugh@ip1hzl z4}5}J3nqy4Zd*a-B#_6DqPl8l(Im zUhEw!xPSYYPV>-Ha3pXOw;&>NMD=?XF#hHP#$fzfRKaz`#>T5DZvI2?-joeid3!jj 
z^bdKdeHwn?y~%>(qq2rD%Y-n5qNy``I=D|lfSIRsfuHJ9xNU2$K*Pi{`0&~`YX1~7 zUGr6?wBw9xRHcPC)!KhZ&*fl|-WG{F)9e||F?K3I9n)qpDI2|IGw+sCv2lQUSI!&a zF|-KdFOslLHwMpiZ{d66z4$llD|P;bE^@-n{cv9*1A>1%VG1tAQ3a}S?%AAb5Wats zj-EOU_jp;Nx-UI-ZD!=qyuJ>K*JZ@#-uQ~TTGNUp2U2*$tE<4|kqE*oTd1>LcAPY- zg>m;ar}tiu~z#HQf2@O_^!)y>LWE z1ikMwvGBrZH}2FJANmTvikteP3w3;xjZI3A8aiKI1s}KXH}f?gkE9z;Qs2(3qQX|( zK{JQbyeY54m^F>}P=8q}on$Zq;yiYugy{#U*yBKz{$LB|7nkEuhfd)?hpBn*hZ~`= zL_W0Z?m|lq3Muz}2d>mbp<>IrsGQkQXS4L-R0oQEZoFPp& z=Ock(J<1iu!=;KF)OwDoD9_Lcm85?`E!ln@olPs{j5`s=EC zM_o-|yv}TN_t!@_>7@WNr-&p=ip8kw*?s)>qkO)9kC6KHR4N+YyAhcND{ zpR>(y8j20lFy!m6MNfQM$6A~{3;UFKeqnOGUfJywWGb)S%x)jU;c$fV5Rp4Wz~w3aiO(w zT}@RhOYP=0705hx6mA$F*IpggykphIiO|b>=WB!o&F)2f4ehbhQv=eXa^k5wOsp}< zu~@y?_*3;Ar3T;^qPCP^iZu4m3EsIa1P_c1s`WK`o@)qHCDHlJQ3JH_OGXpW5BaQ& z`jO_BrFp{P%3|-fQJ$+z+;R#z<#n0URZ#P6!@M~9cZ#u8PhedLM4;U4Q%U5@k5Z*p5E@xL}|q!i>vTy{0z%CZ#tmn*N>Z}FI|`-E8KmZ2<0V`6=Sg#6NHs;vfub<3~p zMXz0ZomY@Zjivi;Oj1kEmp8~XzvSrt&{$|RQT@}MUq_!_GsuY7_51 zGH)q!jBVc=-zRU~LMbJ-GkOuJ>pz*T6RM zx&4jfZ_=ozX_P#FoQW6@5K8ZN$rnEi)(y6JwHN(+7BTA(r$b`5_zmMQg$^|BU}P zUjHg>S)mh1IcZUGF-Z|(n$T@?{0G)+l;0)5tlsaZBF18l z%9XXIrP>o>nLQ>R22aY|h~S&C_*%#RGg*-6$xLLb|Kpz&|0Op6|8kH2Yd1+!Y%kH1 z^CUlyH3X2ENQH`^py?ql0~brWm9}G>lR~LbC}xm{ZGH+xra~F3@Kh*b6&|tLH6>B- zXJ;M?g%3Ltd2l3>cACZ_ut19Gi^tj7O(K(N0p6_ak;(Sc*p5Ie6e_k;hOwe{Ut!P6 z36YKg4v`_V7e;$dP6!JPievYa0!e(IEFhg0&qE7Bf*@@GPdjDz-my;$4{=J4_KHC3fD0FP literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909 GIT binary patch literal 4341 zcmc&&X;f3!7T)I$$xQ%>l3)TT$Pkek1XMsuf-RpYYQp@b*jap*jBCfX$9ro8;Y&3?T@$CTd#X1XWGNr-##a~U*ak8G8Yg6 z2f+*v0VRMj09whpi^mVWOwzBOKJL|&T>m{MUw@Ije0LtNw8Ca?-;#A#&j>)k7YL1L 
zd!B$LU};!`J~g5fQX|#?z}3a*mO>cb3*ru>As|Fnz~YI7oB&y5;F1toY={6;rBqvx z>SZJj5mSE`N@PUm2m%b~hDUb*@|>rB$eBRELBQg$9YBfJGQu0~7?+Q-6;dM_A%%Xk zQKb?5d6icFh5(Zz=p;TM&{qnKS{5G>6dAW9P!=as3Z)03flHP}g~vxk$}#mL)$ftG z6&F9pH2#Y%{}Sl~bwR(nhu{rMen8KaUOi(uWvql1RvV$JO9EX@%fYC`5Wc&m2(Py2 z!IQR0u*`su4jqnyoW*?5+t~(D-|7LFs=$j;807vK3VtWv!Oi#OzzVj3!#*M?ea?e5 zyCi5`j3Tx|WCP8|%;4f4XV8u?h8J_*qpzw>AYVg>CN=5;@zfNkD0gs3C7`2P7dR8O zK!xuB?JIb&)z2EjvK?Vo*9usaqzRk?6}UaS3J#1)gz=^|=vtKq(rKyyezptfG+l$} zI!lQBNfjDLErRXKH6Um3B+M)ff~7~b;krKy_H`Xc54i|xuPQ*#XccH_o&ZGg3^1#G z32Cl6FuS@0L>1<+AJd0o^;$?7s)k1}9Xg!_*uzb>uw@P(M4vlDTK{)wMf_+uM(9FB z8VB-?NN7880*q3e!1FB&>T)N6IF|)S3uSQWzA{|6LBfr_X2`9)1axDsz`z|jI4C0YE|%1O9op94L6=LOB8#!n z*xsl>Sou>r#?iMYD~j5o`s`bz4IY?Hs~IYHRYG%H8lg!XgU+RPVC(B~*pjpm`_cL( zCOqnl?`u|s+JkLyKeGUaa%1obrt7e$%hwU1`Rc6x^EJpk^bD~#jm0AM{jrJmHRQ4j ze&n8(XHYlG6H$G3n4%?y#|s+3S?3L0vgqYLelmsR)1S+!$!FkAg)1oe&V=ys#@MsP z^^kS>41Q~#J!Cd7$6hv&=*FxIkYBR^)oH$jXysF=Ds2XgpWzOTUyZ}{Hl6_GnFH|c zl4`*1%%P$R~MOXW9`!f^R zI%x@5<=9L3T7PTOsB#+~Zsv<$gjldCS_bQ$xR7H~SAo%&jjYkP5qkMA96(kEW}i&Q z)mABjuSyYjP;j06;>0Xcc08O{T3N++5A#8X6YgN+#oFv8qQ%_7>mpn-Ihm-4^2ZWi zJEI!|_ePgabips#x(LV_i!`^Zx3S&UDl6_9dVogj7a?Yr z3g;TI6@D+!1Gg!Z?%h5V!ZqSvt+8e=*tVV*q3|VM`^XTzIUPZWzPbaR)6-!7lb!gL z@9I&z>@@_306J@+gqk~ium?FyM>&q!f|We7z_`C$gP>opqc;!32&`<4=$`#GVwC>YoCXzuo&#vR$ElRgQsNIUPq4p)CAf(60xPNJ=84Mc&_-H zyU;s6VPf>WS}ZQ3ioJY?COLDtGbT%r5he|V{Ch(eSe$paIkz}v?C4*+QRk^tOk?L+ zzV*$`u(IQrR(P)}CM-IS-;c{DeX}2<`Qr%6{+%zmt+5{s7Df{hD&t|lZ3Y=O=QtkR z3iv?EIXKhjjP~v>R6aR2hN`Ud$9=;&(4kz8QdDwrhiZgze-M-E7i+LN?Uj7+KF76s z#b+(6@jwle-l3e9EGm2c5iH?c80$W%h6QkS@M~#t&{*>( z6cJi{y2IapjgUPFuZEb_R5rm`2^b}G*q^!{r^e{}oZY{EBb%q~Qs_{!Ax%MO5z=+G zWMhW1mNZi+DczK%qVILYr@1sOXRKMk#X`w}^v&v1V;*-kAIR9M>9BzkNe^b`>A2+U z`?eg+`fQ?SsY{WxEW5zarzWJkr7UNs>C%==k>jDJ@wyOszi-Q;QblrE^ zWm%M6E-+eHpUjzj=xUK=SWjctS=q;osD!C`_rob1Lp>PgBm6&+j!Q54?S+ zF!k7j=1mIf_PJA!KWxiYw_U3^t)}+>SRy<$HjW(>9>mZ& zn!Ba4RR?ud-{^|J))u$Q+3g>%*^m@`7;@% zt>q%5Mr;@|`sY>J=p!=v+~n`bsPYGD8XgoG95+m{X*$}E#P^KF>Bcl2{d4%g+4Qf_ 
zigK+$$ckMS5)>g55Hu6l!~Q_%^)x(=syNWUBftt1eq3?^YuNZ>w#?Z5ei#Bw!1AzT zya}QFU>MSm(L0PBke(VHUyF$^-}pblf+{7_5PAHMe{G2W8S4LkxxV6m>L#h?8>muZ zitgxdZIOmhQR3w_$J=K3VhQ&Xri@rgq!Ni_F8yZ2T_Q=8NP{I#5=pScF<8DPr442= zBS(qEmFemENKen?!&oK)6G$>=wV|o$RC+LTwhJ?NOk~=bOhbp3NIaNE>dQ>!dWjh` z4-2rEZ64sgAU@D(T9~hoR|unz@TBuwwSsQs@f_tr=p<+vjwc^7x{v&0{k*Lr1D%5< zGR9xNJq?Ez-;uf3r=K!&H>MeeE>C?ppD=n&8YlC%it_cD9p&dEhfeP|cNm}B{EzdY zi$%*H@?lCTjSh5P5XUGat{#lN+$eEm|GBR6deg?8*{ye^0C8{rgB z?<2>__|x%eSLSGjEkURiEsI$h9~5W2EIQg)784VdsFe^D6B`;8sby|$Vs2t4^3VYI N@PbAHz?1&NkVV0zh$QTbia~{75!r;u7Fo3_1r&`tAd0wy z8>neokpnB z0fkXQ^6y;nEvOs;7XfO~q&5JQIZytOGmZlm1Z)=52@Ip8G-;?d+T^2bdE}6UN8-Qh z=uj2>^_CL;UI9{8(1U-dz?2gnAC?>&5tkGk9+DKokx>U(cyL&JOmb|TvZev3a}Nre zG2v5W@Lz=dWTn}<+?~2U+&hGFKD9CRCy_0kB``%5^+^Q4X(cbuQ@TTS<#xivzN(;Kc(fYU082F&6&DOBQlLKcr)`4$? z39u=>8)ejB;Hs51+!AOH9@`HhUb#EAuR$HKwv7PQ2V208if!Qi#sqA%)tAU&@)Sar ztIrr{t%5ePv-sXD27@pSLMDz?L|9uOvAgjJ@T%j1eQ&*x`b2~!b8CV7nAf1)_Kw&6(US!=U|q*q>^nb4plDcx25Sj;c}5$^t(pf<8V-VL z?G#jzYsmy= zSM-Qo2j-)_)i$u!qy{T#e-A%De;;!^JDNErD+QJ5wqt2Q_Jl=QHWp(o#m<35Fts2I zeA(|oXr(U$7N_eN!@q}c@D~g$?{x>()2Cs2%hW;PsYUoZZYOc-#0(;&GDfqgtU}v6 zIshI?xsFB&jhTC8rtn=S9}`VjftSVyq10FI@ba%WhCj({W3B00$T|iebMo|Cm|kfb z>O0@xgu_h>ppAA8L(Tso8GD0dt>~#GOZ`T}x^Jz~{M~F;!gw=AwMGTHc0+(xx@ZrV zvyY?g3;Id_Y%vkTD*^(GrOXr}1ysJ@j^;0KV6M?17}YP|;Hj0%V5edtNAtcpNSz{K zEV<fpfwSlv~ zyP1lR&{4&onScp*9up%b4fq7OL!zbx;Qa+f*@*SAMZZ`gM z7_mwn48w4&y{w1%(jO2D#Z$qf>Ud^X_IIFmQak!4rwcpoXrv+7l#koazKZ<>FEIiI z-l)jL0~?GXKt!{?3%E#0Xoes(aN;hC%^ zy!HdQGb+VI?pKYH)>beVZ8s!b7rCR5lo0&rYOealG1HhX5uW&WtV2xsn?Bg{bviQ0 zuh+J}vI#8dK5pbCm;iYNt=O44hcM~-2XM|voOJw`l*q0dfbVw5@z@cgK$Ps^(qGS@g-oXG&P@4z)pFKyhZ`mN#k)yLBj|=-=Jw)#p ztYOW#p^iOklf#iWcR@oxG3pBv;$LXbU>wkPg^`jp>Jyz?MRK!A3vN2|kFQglZaojD zhfh%fZlq0Z*l(90bR@uwVUXg|nK)|+KqSv;e|iQ+YMBPq@6TAp)HL?k;Z&HJ#pc;Y z_0|`zTB~6sR`5hctJjS%l@tdw6lHDDwGKVELv&!xCjH3?4|*F8tleVhlu7c%2NgME zJaSE?jR)6lHSsO-C=ef7zinJVRa9T&p$+*~3mX-D=ffKdCPZ}?OPdaF+BHdj>s*2J zkLE%Pg?DIOl+Swz20*S74uIg^j~c$%Rd%epUKj9E8kHWb}rX!ertKb 
ziO5Sup1a&CcAkvxt&&}9t=M%cUS8kMa#uws($Y^)PjAi9pK~;ObmDz2GjSUgJ86K5 zUA;M|*8Arh4cCUJsjqi0&Eup!J>|8Rig~YV*W6>xY2H&Fcq-xXz{_#H$3xq4Eu(+2xLUpV`>NQ7zgS*BE$kHLQ6`mj6`M!{Y77t!T0niA5sOZ0esTZ;>cf zB#sohiA0ei=Sbz9m{OQQpPWS^PkN{Lp`GHC$3)r#?MO81i*Z&~lc|I2Ob?oOPNi2@ zdZAp4L_YK)meQL%d|}T$=Zy7gg!X)K&RJQ+MK|5P!RPX~4|* zzyM|GRJ_@$`n=|RoDbD3ivE-j-Ab`M+vZJ(c~YnoZGA>qR0L z`sf}*_dp)#Hf3?B?NX{I6KN%?$*FYzsT5U1q3*;=ODc1!I!Z1TiBnZ`QYK4VQ92@J zCsjtuB31Xw(xtI($y6L=owPmWpE9LKqtXOCBY8-|lH`aa%P_gzG9)1(KGi5CA|X*0 cA7^A^Ki+1%HQ&bofDeD5j{5cLOZ|NRH{sIBfdBvi literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f892d384dcee4c386d866900d99cf672068e5bc4 GIT binary patch literal 4860 zcmc&&c|4VA`@imIIU9!~=ja@SlC6}bsE{1HNTRZ}SVPwAGZk8tN{VSAZA95aOl7Gk zN{faz+7~9YP1{Tp>3yDK>YaWw^T+S=c|Y%aInTXb_w~KM*ZrK2`+Ho(t_A`^;3Eid z5=aYR6o7t1MnIyG^~8rC6&vGskz)kU);Sbfn--)Ve;z&LH#i0zHP_R@#J^rV`m*)S z^ZY9d*9btMAW+m5@TfnAfT3&*+N>`zgsd-K03fA}Q6&%Ic-mBB&38yyK!}Wh!4nDD ze!dX_OM`u5f(4iYPoW1XJV%;oNR#zX!9=>$u0enSRq&_^fa<3pfrk^-rD<3Y2-pa4 zHj@C7%K`@VNK+4&%QE9p?zG8`RCJ7Q2nbnUW`r#ChmQh9>aU~J_+AvXq5Lr(rx0&-8#5=?E4n1awx62ey_P$n!0rV>u7Dc^O0KW-FM} zwHy{DssekbJlt|FheI+6prTieu9Pby^~Mt5Z?*^Z#w!q2I~5{cDnNtGBG|D^8MX}l z2sYV);8QjZ+7~n6Kvxxd%t26dSqg5;$wPC~SRnRUgMQ6hNOe#L=ZZWKl^DQ5OdIwo z)xhe{70?Sa;ijDcd%WHhGUoC@bkq(~2cDqiadJ>ejEC@4He~6N&~oe;=&rT}myZmn zP1gaL#_sfC@!x4}e>kiqehv0L3 zE9|x_f&;V7;qe{}4JJQ@iD_eCT)83q@_H>CzNH1@BSxWj2Y*G10lg?z8$ncrI@oS= zhUkj_`R+GXKS02 zB{>~Xapof$2TqvDIek>@z(w<#8=z4$8l6q)#Mac|kdf$(b-LNW3*)z#u*?oW&?F7U z$J|j(K?^)c+XK~xZu!qyvKaJ!A7pAW|4IfGzfa_@kVJefbS4-+()5X*H z4L5U0Yg&fAttZj7Ip@L5OoOOZeG4`Gc&t3t8dR(uq2cNnoS!j>G}avhS(`z4w6p?n zGkrW#U;(VSG~%Ss4#vmK82GtmE$nHnB6AOUWBY3iP`y?iKIO!(4RFESNu*I z+=e!xr;cY)>?k%0IBiCrj9!3|+KVwQ^J>!nycd~Oy%6bKS0szuKBLmnZMbgfcD&xv 
z4z296gIzIc_~^ymc%tkB6drO9Tk&!O_H0=)Q9H*4v7VY?QL-n|=ZD43(mxm<0`T)2G=k6rJE)$~q8A5Mi6qN{hoWo9bO@86AIdQyive1C(WU_fUk za8XmI2ljA_kBrr*3=I1r9OJyd0)fA`qYsZm39N9X=)U+J>7>v18lsO@Rrir3|Oo zMXac|No<#a8B#rU`5bbcG|1ot-d1{(`Q9B!A4?N>UK7de*nR`*r?p{?nH_kgxtgrT z*4>2RyxX{^d=6)+XbGpy)LuYZFH)VIBrDz9#3Q};O0hG~o#Vxm@sLpU9g2xj;HbKc zWra(f!1H2qj~2Z^r*yZMY+{ru^Y>o%XX=fHp}cV-*{hQa(y=c{^#=y_NQCy8|*@6 z)bHKs=E)RHdG{H<@%62+qO(#htVaP8=CtAuVzWrk&3$P87=mT~$&=jPFo1@#qlj>M z6*y?Rkqn(%g$JDjd~o$yIQ`TP?a$4Y{c&_OtF(48?it30PT69#T0R}Os6ZI!xhAP} zp&FapQOXDRvmB#$e1?Cu6ROwwgtjzqVr`yZgvFl?Wjr7iF+YwvekC;)Ztg2V9U*>D z_{l~3_RkZr2gc{nHPKq^;#!f^W<@iSb1;pmmbF->xn~)Y;@6?I+z_~y_ZZ*BQ01ob z717SvAMjNL_t3?d-HeH~u23Z4vGb&BqzZa-fK~QcuGY1PHHEVf^}l|DIZbSa1aq#! zsvQE<{o6Ba;QRp0xi5`hYOW$=_P+*IH%qL0u_p07e-7gi-v$M_CXv)NPu35+A zjkC|T$XlB#B{U4~I+M3Tr6_l;F-H z&$EZO=1z-xd?Cl`NX9z3DD#lr_Hhn-tc$Y> z9kUHXZl5du;mF*BmYWo9N_G^x6}c9BwUq2Eb+7bm+hbF@tK9oUbpP#^(%nawo>|LQ zvMtN5^1qO!s-amIv0wV6HaT35I3KP0e(k~lm z0>8g|@#NOk2eVQTDKp;Vr-$P8Y{BG!uAd%RrO9bH7kG3$wn-NmMf7xaJh9t8)+S@T zXJ?PY4xI%R0o|Rwj@kYOf$asJ*M6R}cShXHp6+W;=N=TN%4m39@0(ZPuyb?H6ia`JRy@|IXju(R2wGvF_4=qUANl1 z110kxQJ@AdQ8IB+gPY81FT8`2$t1XCND+A@YRx)J0^OcN}qA;=PP?nR}M>?)bw=}IOdvfHK`rA zQRZEr#5Mi#d3R0dol{#)kH5UvuySyiYj)z7N9VRE&B`!4`KI@Bu4%H&%({2|*Gs)m zXUwes^~K%#m7itI8$P`5xs*F=n|b5M_ff;|u{ZBA&6+970&GNI*nfK!XNQMG&{`uf zh6oFZiDd?c1=81a>Q0^_qp(X~p-5LVTT?TgNsEs^fBpXU;_V}F{wkeV_0m8hA`l}f z)zQUe5&nqHGxPgHL-f&KV9Nd~H>hP26SBVe&~oFiqm}_W%3r3 z3h8btCjZY~C50Me1)e5LVXq+{K%^{`7Q4F6bvGG#vxEl;J=%=KmSVAZ z9`(@XC>AG(ErZ0iVsVhzDo8SBNjc1+TUKJR16@=7FE!PZG-K!pbRhBE?)l8_u3mr(Z%w~eE4Bs}T$RD5bFeKaGUAXJO;jb0HK7(2;7 pDr%B%baZ5bT6|!1Oh{yent}0TgUR|LCuM-IA7~^1T&Ul@{sjY`8d(4U literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..1a786300b7ea789c918d25ecc282aa7737b43bf2 GIT binary patch literal 3554 zcmc&%d010d7C-mC_mYcLo_o*sJNJA! zC*gdA8%J|~+!!s#a?l6BwjgJ6)wJs~Pv+Az>f*J1r#!^yP;%e0z;n$_KNtRC5M zz*%w%7f!~Rb0SV;;|We|uZ)82l>z|iU`%Ei!qgf{)sAd9TIm5IbEQHOr%O+mk*v#3 z=CE8QZ$|PnNR_N&-=s?ELVN@c1DVXp0zl|kE=Ws!5PVkC6i5)z5-|;oDGMg1tdERv z3B$s;2%>VX-|t>4*xnljOZ~^eU(K&uBMs zc5S7W2fqz&9oHd^K8qVFj-khn=dgRmZSkD8RyeV+3!T}12f15i;w>2o@VIs-C2{g) z_OEY)Lyb?6J!rA_=W4Vs*c44@u7?Iy7HV10ju+NaP&Q{OZl5?7?s`7JijM-RtxYoU zJQ|5=cAkLmN-E%aX%=PYT8MqTMlmU?tVOp^RwK8R*JyDRuI!?!CY`TRxh&n2B8N(lVsEF0`9398#j-;Ab9XMzi zIy(yO$?L)cRQBR6DNd-TRY~!~3+VkB({cWzK=k#~D;D<_osukRiD=FxxWA{(AL-l9XcVhCP12S= zr{H+_CUJ=_ZouAm9AWsiyG-BVTsE zK~!tcF>~*a%gndR{g{@31d+8z3F;U-naS7~q?~lCl1fSjbah$mTmF5@FkYR8hjw8I z$O)3YGdxgot+q-^e(oA&+%~Zb*C_q{cbXR5SI`188QVm*1IR)>`#VB28MwcU5RFn= zQV5t;_-$LTkYf8dMKx|)SR^*L4_fE9eNnNA!acdOaeL8+X11CV1z)vzX6DuzT= zCtqydwXD)Lwz)*_>Wb#{ zHVYkBY_Xn~byK4UxKG&T6P>1dbvMy>b}P{b{rc{;(3=OWB&wXW&Jg4g4*l@mfMV8WEtE(IHba5dh?p#uT^22edc2>%DEaE~tTw*>>z1&<8`pvNS8VY{Fa&4zN?nG&S|FznS?lZsKG`dJW?$U@^7e1Qa zvUc3Jp6QqC=TlZ;U0zw&&u#Xv47==|ceksd$ZGr*pAWnVq=?bPXy`t8NSS#3t z^fu1l5n~VU7Fp=&?<8Ry0Bq| z@GKwyoR3u-2HJ4gSYv};E=-cNlysrqCT7#ADcL#V#MDHgXpsulD3xF8FaKPjsfBHljOB!$dB`S(Jiov7#IYdq~hw3kW>KizpW$%U+NPSY^NdZ02FL5uTXJcJtazh z>m9J6cQ|AJrh;qsqWq>NrYGeX`-$5~{sU4Svr}EOB_;P&^=~lxI;_f44^W7*+z|+Tzdr?sIg!>!kfs%eXAwFaOf2+1 zk3&DA-&W9XF#MljK`v9;D7F9N*6L4cZ~iaItNODtNxo{3T&7~>KlZl-P}(SDd_=_f zNN;0eDYntB!tmfVJkL)cUtxsud_J#9;sba-iT6)3oM{NdIAQ0{^TEPO=-!o3hHbW> zK~Thx|6qu#s~6df4GR)z|9oK{D@;T+&ufKA6D^R2dtNQjv*X;u+~Ojq1P&R|y>!In$pRfB zOh)Ji*Ny(nCTE%)U8F}wbW~VIOq2mSu{XhpFJ#io{*Yji@dtl|P--$00;l8%7ksc* z;5RVk_u8KjY=}1rHW|I4=lLB^I|iJBJkd`?g{Lucg2Faam-T*b sV$RU`%*>&>tgMWD+q}fA?39djTQ|>PZo||{tqs7757g5D;pE@luYn)u8UO$Q literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bc20a7699b64f53ab5cc24074d8f61c1e997b00d GIT binary patch literal 4872 zcmc&&cT`l@*5Bur=?Dsg3?QJ$h|-%zMZ*Bnn`ori06{>ibh}`HL9r20P(g|)7Kl=! zQlyCm5Cd4iC>SiL#2Cdq6;R*3Gr2Ozgg&OlOp5Jrz)9m1N)uYmgYUFOYN`5U7^|RV|ul$X>sWNr$?SF zyRp0UY-uka_=pIt zvHg6*{Z|J0MhEdRMV{g#r1%RGijk1@PtF9oRMf!7fNJEZ4nUHH-RCR_1Z)HhHj@Au z85WF_VVTQPTC~VXJKFS%xRqFAJ4!6{hmInZ%U@Ti%|CL%q`9P%pL5Zc@sC&)8x{~A zv(n!;##bho8U*{VTon--8x}6fE*(>RhlKBN;XYjTe}Vb8Dc7s>ThyKSzcD3e_dxbd zAJZmvEvzxy1|{7h=&oH2y7@Zr@D>l=?9zf~&C_Ak3~ zcr^}#^cTV4UH$=Xemnt;z&UW-Qve0O$-#z$BD5unhpiFJfrc}NaP_b)ObOG2SFRt? zjZ%HcP*pm{m4v-Tt?Y_|i=+UpQmX$s-LDniY;Ww39xD(o2j0TyHhfY0eE z(7v1jN4n3Vehz|)HZI&9FAt3klYz*w2E&RWNVeAmm(qL?oHT+wOdE1kDqz#6Qg{sW z;I=Iv>;HBx>~d8A!S}Y1{QMzW6FVNx5K|#6nGG4bBs7iElZ7i?By{B(B8L>6Zb0AHyLb#sjjF3*l4TWjJ743`ZR1 zL;pbxjcgeJ9q}ZXQeq6hy-I>(cc#JA@CoQm-oKEt|6>%RjUX~y6Kr<7z@pJKv_D#m zr=R!;Z%a+lMZGL8YlO*zS9^&8$0n2@+lO}APY0vhk0Fd` z#cDIpqnHV76c&6ROXz5a=EQrbC4UeZPmIKJBm80QuPGQ?dp>zGy8}w=htU*p!sc8u zL?`TJ(89(Vs1-(`rmdY=VigX%5|&~wW)5LWr)}{g4Lqng+6;Z-O!$-@g-!JqN@q938ZgIrrWhW}uLfJqcW}-4 z5$EZ%S)>BUz*igV;N3}kknvgop=x^A%N13S)>4PxS~MTT4Xd%CY7%uhUxAFWC8$z; z2qI-Kpps;3P_uS~nj4dFt!?EXyI=%`>uSEVsf4RZpwJPh9(imbDGw^Xn!vHIw78QxmV@o0rcdbx-ZVLk+#~s}K!yvRA>D zXLjU-t?NMdd<|p#9fXGdg#-8&f#KKlaOHJ8@RHBwjPl#bbLGyY@7Ykff>R|5jv=1t zc-%cqO*n5K~566~j5E$oP*1sTab zgYCU^NzP-BB^jzz09v{U%s4U*&VJgDE#1(-OpzrSDrRC7e|8;e7n{n+4a|V}+3t+B z{Wi!mHxsS=_7LpqQHSVgMUJ}LWR}6+-5hRHHMYk46?)(>i#7HB01D#ha@xvfGMDU0 zlndjY$15J|pm!I;2*Hhe;5IKA7C$?Hw?3>w9lmcNFbGin3>nnW>4`nq;WN%^!Y(ZT zu`$N^{W=7^Z%6O?LkO&BgP?c*bz+a3CdoaZiu;ZFV<^R)75H=p3=TAK^zLC$=XRJW z_VrUMo;wXp9=#ytP4=RErZJ|XvKwcnyac<$TVY1>DTY(bGFIf!X13e&+1y7}1srl4 z560mHe(lt4=IWOP!ifl&&zSY|D6s^RZ)Q!aAvULSmt0F?G3;OQjK%x< zGSOt?&+xaNi1pfPquUF3SB`OZu$g-;!VR{P_3D(eU9Q!T|WA!Mrnf>LsO^hAy zPDwkacs7DHskjJ!sfb{zzLo*4od*c-f;POR$AA%yw~-bOTEyua1a3IAPq`<59y={E 
z1vTSU%o(n~qD_N$A!nZrK1ux}Fv4Fl6Q^8;l8`9u% z_n_MswqmLW>J?_*+zD$t&uBPknINU?%lO5`MYz}Yr)cpcf;Inx7rCeAIU3E1B*Ns? zAjs-C3UNJ)2VMgFz(akwzCH)@H0tL4FfoetwYM_z3S~p5>&Y>2JLe?o2=)WB z8&1UCCo{0VnU_$PAPH+p5^%RGo0FV8F;gRB`MAbMs}WbB3MI(|Lsx!3zMr8kldPbO zGGo5OHy*u@TA~jybSmA!!sj6)pI5;>`ZybOPt4-ho2!BeX9;@t>NOTN%!T;*GKw4b z@lns)LG06t6t;6O4}Wzf5>0w=2&pf!#CnzsiS-K3j6#J4DA0W~IWn6Z?`z3q`&_!q z(yz`^=>2qs!TxZE9r{QS>x;=C^X8_KBb%C_Zon4h9?FtERk<8@?Kq_X9!(sxHwuiR z4{p3-z8sr%`qTJI_hOclP9r?O@*G+l&l2NuU(wf=dLBQQr^~PB{O86+&_}C8&_9Tj zAl5tRX zeg3voSq)3ElBnR@GSyyP+_3x4K0VOJI3ayjRmM zL))wIXxewv+zRZnEsM5i>Ufp~^)wdkIAGw@C>B^9OV2h5>MZtZI=1uBtjPYW*;dDQ z<;+cZ{nYDBs;m0fTY#KkDt4P}l`5y@ULjtnX}eovL568sMw-KZ{S)>3?7M_6Ip&)% zzQavz_adhp%S?w`GnSMrt4-oN-q!W5487l&>3Bzfh1>e~!=!H38Q)9W1#?q$v&#b7 z4$j`d&^uTWeDkDNb&B4hAHwccts7zJA3neKQA_sRRQ;T)*kA5m*uXT%ty%Zxc~5n! z!I6ubMu*>zFb(tS6A7-ex!5qjF`1__XCup~;8L1`j(3gNsPOVm)w%0OS;j}N?w#s* z&^*n!s3lW(Y1Kxy$+5O<(~!G0X(q?pbIdmkkFrgRf6TYup*(N9>50xFr$g(bIW)R! zs|0;c@xmgPO}1vaH7A!0E^ppWqkB~jkDSX4xs$kpyLsLp=t(l%F?8*?(=>X<2Pt~1 z1l^&0485E}x87k=(w?(s!|*b$1ignsuPFKHT;#)+9LIB|_p1|zp5Jdh_kCa8R-CKm zbpFiGO=5YS6>aCwKD@F=(=pfSLfPZiER&GlwhQG0H}b4@s5w_vJnPaaUUX{3^{UFj zJ7-q4<~mpZ@ZzWQ(SyC$tIxgctKY^|cd0r5>fzPzbXK<4To@YovC#2|OKsKfgWcsJ zeeJc?Z(q{z+>536fBOPn_u?3Qg8BQ`f55+Q9gxBYzR9yX?ooHqbfim)UOV#kPv}j3 ze??cTyAb_;sL=7aD}_GX{i08bE^ISVJUW4F{pjB%;I`)1f)YUsoxmFFDA*S^vaAWIr>K(17s37%9i1Sja~t+&WXZ-;81* zfA#+lsDGEYAl(d<7}2YO0>XUx1jWR(us<<6Emfz}@)i`c1T#$O%i{AHQspm660P^2 zW8q_bhLbsMO(;o@h$ej)g)nwhTFR7v4cfm1_*ZVLZ9l<)uLNf!RM z9wo&m#)>>4OR>UO0YIRt#1px@yL!x#K3Ph6gaIvPB1@4-w2=DJ;wTcui!1|0HX>1= z$SP2>W=SbH(<3X9$e!-0`B+cQB*SRh1MNuUx?ab?;45mdz{QT9TgB7u0=l7Gi$qRz zW9datCHo>ndLH6u>|*5Su_V^tW_E~|r+W~+ALd5Iw{8v9Nc>qzoKQhfBK0R3(tBUn zM|*pih5OqEihOB%2|Se!C0=85_iw(T=MHouO&Q<6G` zw@{j&!{RUFp^8PxpW>lQX&LEnyCjC*5ZOD?`Vyu1G5m%0l6q6crev)2B9S#cv<;Lt5&&-0e_;OxMuIRh literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..151008dc482639c3df72873480ca796f9ef31d42 GIT binary patch literal 3553 zcmc&%c~leE8o&3>OlA^-1kJ<*QK*)P>`GBVOPQd68WnJX@Ser62*@e~sE9eBfN>W$ zK-7l_O5LgyZ3Ul2#HB7M+M-xXU1(LbuUhM~ty+6$Qmj1nk9W>Huiqhe`+nbk&wQ6) zE_5hMvVQCsEz7W=Hvsmzt*PGE9rjKoHH$vr_uF{cbxzIf$k#s3By=9 zt8~G587pE%w!?uV(~VU!-6#Nn ziYBNy1=aCt=1r%(ID%JCG4qcIkPtx|`62>G%b3*Y%;eY^pL{%#((g->Rqos^pO6?1 zw`W~N1JncQ%?Xb1X^V>BJafq%sWVXa<3M=vNr!di;2@%?b*A_u_`T)Ye#hvRr=Q9; zJii959n0WQd9g?mc9%)+V8pp?4a|-yZ^1*SJ&@1-6|!`~qz(yxac;rGjy0&y?FaO>v z5L%OrB#+L5*k4-U&o>fDR6S4C;d_=`6+DQtD6u7?p2i?p6w1Wib^>>A9g(Bu04#ii0-h0jI*9BCX-?Oxi=EIJnE(qO-nI zLM@bnUIa;ezO#*f6b7i78Xs`4E|qRw^(AN+@j3cuaVxRc*WSuuc?mfzeo)<+p zOh6BlC}8o?Bg%9!o-k=6=u`PMK>FTsGGox>fxrXGE%)>THy}#+fzBUz9PCa^M|)R}g7xcVmZMwuQ`Xbmz_AP2C^aM!P8^ph zQGfL{=p2wWIBiNDGA!Cn&suLsjhz*U^jUgxPXQ}EH|Sk@Ozb%FOK~-w_Def#JD7)T zOB&_FzgQ0DT-a+r-oXIc)CHfGrjf~g27q|~?J#lj zJ|eCe5G6O=z}cKlD17K;>4CoK%zF_wP?sbI7py*n^ZP6#{PsX3xu>T3ovcOkRd;%u zT)z{y*2bdfS}klC@(eC*TFfk&T7$BVCW^kHY*3VB5OKE90NOU~fUOBp!0Viryz-qB z`et}Dyr^1$PAyPbEU_6)Nh(aV{puOLnmT7e3wb?UV3`0eZn;6M7ui`B%5C5}!x|#D z>T7r^qeSGqKNR@QyeZlut+S~5whXv@IKrZFv@P(IOow+K{D_i&wE)?^mWtf9ENp*# z7d?N!NIbDaN<27`2K!$thjvpmsC|Z-%#lwNZIzFOaiK=)i8p1f*U;jb%~u$=hEjRQ z^AjTRvrFQnP6hhLu$rnEwTOB$|0wwERv_G5UTU>-{|sVs>rOccJ1X(|NiM2>7A&n{ zWnyO6bL;(~HH_A|33Q$40;ikyk-auQ;J<>k44X&0;NPF0ZIUmolH@NwrZUoAjwj-` z^AP${;Qms)$)ic}9Do?5UqxO%!Sr?vZ>-2)NXrHWmHKU2P-vkX7Js>M%fdxg_8OCt zt1MXD$1$`f{77Zt(!L&1Cri1lMa%n*Ouv2k$ks(G?fe!nDovHCcu>%4M_p6Z;x&VV zD}%~3)l1enhu6lpH&rh!ai7^_Qu%LNR^}Ohp+ zYbbIpKahO2e(n>I+s04lbe<|3wa9H#edfI@2j|i5n-9%}M)?@`;>w(5Zbm@HxLX@# zT>}p1R*8+$!_%HbP(~LQW8LgW42h(GXA&L|QVVf8g&BrrYM}HH{Zz1yVBI7{2kodWu0k8t=)0 zZ)&_&FWPWYY)YKL{G2ei-UDx`Z`kA3`>Ogk7=2x=s%2h4DaweBk4@IIBz|=`pqCym z4z}7|ef)4s23|%IJDQg!l|otg2M)p(y36BWgZ10<`US)P z87xpRDqEHIfBdpi|25B?|I6i7|5Z0hvD!|-s2N39fBc12*(#-6Xz1iHAM?dhR*>#I 
zd2t$!<3jL{rwJUF&1vGe0FH~}{Nsc(4F*i)SN*buoZa0= z;>FlWLHyo7n;*yW16Iv(T7J;z_^rU_JotTL)UZiIqr#?V#sqjL>cT_gd46&*_HXWN zJP7*y1tr)COlEz;lIQi1&xi=~N{I=K=8qN!Ps(oi>f%kLL$-=z{BJe-iPT z#-I=LO4Wr=N{t8?LdW((%<+w%+S4DLEKD!?!>3Y{785w#z%#gUS{^Tevb*hvj1%&W zlZ~l6^&B^bUj`=eIY^5L@SYtt>;s&WcX$v^ayFlT?4r3S_)H^6V$X4On6KkB+2%S4 z&hkfF>8aWF hS+VIE38^XeLx;N!b@NbZZ2@@k2M&<{1ml11eg_+(_4xn* literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b485d3882928aa5ca25ac11f280ec415a3432930 GIT binary patch literal 4311 zcmc&&d0fp|8-LDkxl8M9`Ca#_AuYEeS}5z7?k(*|p$+ZYyJ?Scg^40jBgztGX`zU+ zm8A%cZDflv%otu`3`UmJ@SfjIhS$s=@8|P=-Z}L<=RD^;=lMS0=ltq^%TwxQ%}`7b z6XwCFF`x}#)p&aDzJ-&YT<8v{NM4#^?y$LRI6rGhcKO+pf#1nW0_PWQbvXU@*XEzE zTr@P?r}08}IRlK4>2JXZ@E3>S=-YuAuP=s(*XIQQs%8Z41&AcGC3ccdq|Z=dTi^)9 zB7R74d{|&)a8e{g2nE8sNcby~WFrah?}CXfaMr*OfI9&m0I;2j7+();vNY@|1bhS} zpGyIy&I1}h9Q>%N^Tw%TceZKu4s4812#9!nJ0c!>?<2&dK3~O~?@1Au)X6uJ68Zxv zGxaceXi8jod~#q|aB{GE6rMzd1%}FFQ{v*4G;0XqGD5P9lB_26{|n5&!Jvq;X7Tv3D0JQL_>F^AB>x@dd39G1`01(UTc zAph9}z)}aEX%LY6I12pt{sEWYRsttt43ztdq3o#u=B<;W1&L}zw0I0O?Y4r`8=S!~ zZYVsP{1#oP83qOVB4plZ4Adh_;K|*=K9hpB8e`y_8i0E|pjzx5`p5PC5hK^PIrwfhAlHDS+E=;~_oS2a2+`z`N9w zu-3T(Ho1+5JL?G4z2H6!$^HTit8L)vvqiA|nkg8^YopH1FH!%ndnnlqK|;I{IIWxn zi8WJ+y2x58xAHB#su_Wf3@uXS8M?#vLEEXI{+qyfrU~V_r4t=;Jp=8d@~Nu8Hr2tF z=gAeW=3ssG9>h^+h{J^kP?zah!bbj(n||>;v@E=dzAJr%Z2Bb-CGs%nI<%GKn~kTd ziZ4RVu{X#NJcuzTtWc$^I-1hl2!|z!=y+xuvG5QHdFcT}oA(5GZ1ch;dm>)QYlK%Q;8jLi{8-u z1ayZYdea*>LTv;=rZ5fQZ1fshZSJZ(=om@sf;x=)`Y61vas_q22@pGADDh<0A;|ge zD0#(uJY+Y`CZ0FY=+eYfFm9X)RX^}K)aj-Y)me@(z|kEVFML7j=5-^J#d|?>LO1*z zSOerZD^kwb11}|;IvBK?^CmwD{?)Pw)}5@S*KY|RO6sgpgXtl1c=;dXjI;O2@ka-7 zjj~dSTFYx(#Y{W;822|i*2<4O4M{MjI20CixX{{}8DMdsk)w4Dq3172fZ%Ol_0^YT 
z{|q(o?#V&--H)SWZ9WP+I*z_!X@zLBnS|;1J#^?Pf7-241OB`o3Xh*+R z(qhLda$dVL988=>G`KsXHA&fIznRy`bj{l+F6u{O&TmVIN3$1D^%Ff2@BTOo46 z<~4DbXwsaz7hP0ZZ3a4@Jwjb@e=wwt_TtRBml#@K-ivVK@IdA#Iwv_ zsiJzkfr$2hhJJDz$us`(K8jSaP&vEDjyrwTLIJd2FycYHIT(+B8FTH2Gyj={*}l<;6J&?>diO--)4!ZS%yp#-F2B zc^c8GYxT*Ho-hI}@!~~17z~f@H>n)gJ_ScTH*m9qLk3igHHG1KAJaO+^HC|+hS1Yn zNphDw0hbM#FgR-m$0KIuUI8uVf{&>$(Yb;nii3m>3^WMkmhIyv{^ zsw>bivXwZTf05ih-ayl2#ahZ{$_>&_r&uLWJWHj_)P-yv>%!2EGX~yk640B3_H_P< z69PTkY?PEFREbkPKu+ygOGZQj`e}KPhJ$tE6 z!1SnCFqs;Eek@aLSP{kkvczpW&%g{15^vM*hfr7E(CY(G}Ic&Vm=EL$Wk`zA-n%&Wq;sVr-GKdX?_Mba%xR_KjN ze9+#sW$DU+_KSF8+1BiQBbNd*zvitu-zGD5zggd8;9m?}Ij!pml!S&r?WN$~qSR;(&@$ksOZKT(mt#d(#X z>pI8Ef^F_aHc>ZDRDN4Nd9y61|Aea5mEILzmHsVNg*$w9hqSJnuwzYiz=6b$8!bE5 z?g~7%h_C0gv#2)obb)!m$(_Y}Bfc+lUGG%Au0E!HPt*@5tJm+BCmd`2fbMt9bb}II zdt}XL=s6p5FnVP>qcEO!{^mjN2Q2Ow1z2tSK?&xm1fvy^l^E*!Aq-W!{%TDh)VYl; z)H(&|R~GcfQCCoh^=JiDv?&0diWO=t>mAAm&5gXeEmUFZ9*ZNq+!UxXbPjfDSx;x`bua{rbi8SDHcg3rW6kM7khYmZd7TOMWc~ zam>*%pI(=)73#D?Z**$sYppPsd?VL2BQmtZ-3o{J?Mv>`p6yXQJnCv)hEBw!jbk%j zc6RARdX_rm2yN%;MtN;>FSbj0qZ>W7(z|kc{ahwydUe3T)E94I%5eD0potU0RA!e*IXC$1giG`bBy+X4oiz zH&s5B9e(U)kiwt~2i&(Sus-4K3*z?|grsw@K11~CN0(2f7C$a3p4F1!NmOi9QZhF@ zHk_4VSaW4+2y4uR4~!(c2TF3ethoI6N8&Gql_%rq!Kf%zl^L%=4QND#6Es$p?N~_$ zzn6z0@5j%TBtD^RB3_>_t0X^P#ZDh7$*tCsJ;VM^Nos!3m9gRR5y`!hxd#b5k)&j( z zf9zEfHd+b=5}vTlTo*vBFH)0wc}?~i)B9$LHdB^tvz5xEQt1@@u+3d6O_RzZq)t+4 zgw!EIxhBI76WN)ARO-r(*zOy#rJN?Q5!gV|$#aKTT8_e#36oq{+aZnZC$Jq3EtPt( zoy?ClmHSdF){Y6WnPeT}Gd(5DX>^RAuU90yALohl%ZSFEGMyZ6cY zB!3^<_%P=PX)t?U36JUE=GSL?O`FErZfw^}U731sJ~4PrmK^M3EBEuAB=`4KQpb2x zdil9c{WKq3EZjcIhb^TnAK|1uk=!>BRh4DWowY& z?=(6(#3mirWCZJki=4*RAE(%RDR@n$Jc%>M?C`#yRF>9zPs(K3Q`kqUtYq&YCCT1( zC3U|zrxc8%I7|U};(uxU8z)yG4gdfE literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..0da33db3c308003647b082db9d88177f6ae33a22 GIT binary patch literal 3765 zcmc&%c~nzZ8vpKn?bU}~p)$}v-dD@r~;I>zdxmpQ$?rsM0{*lD+e z(|h;ap34EJy2kk9?_JNV33TAdP}_0AxOc3kfGl2+3<{8YBor5-|-{ zau#HAc9fiG3DT{z6HH%8Tg7(ms95?J9VLPC`YPG{1qvomPGYBa`Xv;NJi!>BZAeVZ znx3G~(#uoGASGdXyfHP~kY)k50xO>(r`wd%BFg^1i21vauiJ1x*o1S3NHGZNN9U-llfcAHuQEaV>f;RE{E#Ho_-P8R*!;3%H=3g0J1QXmzX?Jn(A6s@=iV z=4Kg4*7bn8?I+;dl6CN6MF!Q+Egz2>>CL2+*@=EQxgWWw9HBQAibRZNDjqa?KNEi{ zmZ>`W5W*T&u(>@9%Z6hrn`?mJ0ngyH$4%+odm|WIki*CYN8njaD9CkzklM!;Kb%$% zOV1pkzMME3N}6Zlwgv`W9DfSR_D@C!`nExx?OeR4FaY`ljDyC{dsAgD3^*?S5Uc}# zg72p91!|O*GID;vW|z<(&sZyJFU^G8ClM*b`6!P)lr{9Cis-cL;k%Gru)KNa5>Z{viS<3wjU|TyAllA zF>g@4a%HeSvxMq7^(r;b`deg3`3BGVM-g5%ZwkIYGoL;%J_50KM&Wepk5Ny95rcjQ zZ2f2^ZRT2-k3JmF==Y^6s%!Vyj+-5W-ksZu`#AL%Z%WZ1bBmhdhb^FMj8k#W(_nP* zr)xG37oCz6ou46jA3cz-wL2jWGg-?jUR*~unsVfBTPl)9-)9ZiSjmFR4Q$QCeyH)R z7Eh_Nl4J}WEUL5Kga3BT5&tE?)+XKR1H69PJvMrcKa;Ae21l29;{;= ztO3u8{U^E}Hi8}a%^j31b&;Oi?Aw(U##GL)U z9-Y@eg_y%@sU~MRYQ7MIFIdO-u$ENgt+za|^vP$C_`C%@yD^)_JMz@mMt?@Hi5S3G zZLp`}%n2A3MX^bDo#FnSX6do+r{G9LmAFJ7*JqdSAQ*P@0n=kxDcUObz;<@4DRI$5 z2&q~K&V{w2@T|Al^fr?u;sS;A5!=nSf$vk^ka?%C5Y@iOVH)ONsO^FME%yi z4OCJxpzF(LbQ{}!6~^xvuxl%ZpsWy?@33IW&H8Gou1C3ax_X-Qv~LK<1ian%RZ-tPWB4R`_$yhmR^0!vfgv&=;au$8oqVl>w{wpD-mClc|#MSn-*HT_78C zl1>QdiLV7~P+dikwQFPn&9+^p*sHxH;u{}QU+rnZ%6&$*_pTja|1UYjR-dED&uEj3 z`;m3wnxwxz0JM;XiWU+jvPI<3hji4>Um!G-fk(^9q)AFk@&U6dzby+3D7L#MrfEyT zVzHus$VR`diwdn&9?4glwk|HVcJMDz@zqO~_RvJ_ifOJcT;5X~_vuD{ThU6pkr{Wd zG;b?j-PdmstM=bsQaT`{Ortxxed#-cBC12m{dX)|Hza0%^3|g|mTz#IakNA|cIS%n zVaXSE>5lDOSvdl5liF_!GQkbT1XYZQ*1k)cFRu1B!79$gTKQ^iOl;1hyJ04v%ii@e z0orCGCuUm{ieEYXx!2d?PlJ)a(2wmo#Mv#x&0;CFV8uky)l zr=8dCirgMn?o;Tzu4dBi*pKHkL)P!s)y=wEUpQpL-l>iApSLqE8$Zw=U#9jgaw*@R zc&@@bPvlxrmvX5_*HGkIdC<^WpZk+&Xw{)PH_w#&77zVveYTGYR5lGH858@xc<4|9 zsl&z$KQm^9O+!A|ab}unL}pn|6+K^Wh8vujl`mkBY2Xkc;Xo}(Se@_H0g`;KL!Pgd zR`?dTTyM@^plAP zH8RnWW{(b_mz{IW%i27AIYD}{_(AnSgQ_9-Ec+7agl+XJo%Z*)=Wr)Y2zm8wl^9ae 
z1j>+@NvEb{W{DG16NPt|JhDwv<+8rYGj>joRZe#;gygsP@poU+2rl3y^I@-GU=XcG z451J4ggV3_;T8i-|D}JOIuHwssMszYfjYdtN|b)34t|6MJ#zdb>R|nnNTepFC1rIo zBQsJyMoz5*oy-FWd3fFbA4L5<$m$iIpb}-qCnp;898Cz6BmNB!bF>fN-NTQN3h>0L zSL@D+I+b7LHiF)-C&FP)6h2C@rd5{H5x5#A5xP#vu@~|0Ao#Zg{!g%=l&S62;s0Zj z_$Lvy|H~cz({7UT_CTe~iB*>MvjtGwt7Lps)SJ;hofk{Bg?1CfllSL&K9YO|8OQTE zynhlO#Pdn~*d)uEKT#MjjK=bOsL+#nS5Ib^VW!|gaKyhke~6pgNHPeV5F*US<_K+| z&=A)=A1*ZhI$>(r=e5FocAUoq_qgcE*$F}3vvo01$-=%Ng5;Mwi!>I0V=Ycd5=1)v zS%$*iEB2YO(Vl4u!AZPcu(yOKaER!-=24R-3G*4a`cy)&QLWX(TIAMI(>#Y`~9 z##o?}c#)m>!rpo{A5ttLzsW}^rGI)t@Z>CEgAWZC^esv`UGXDBE%heFCel@Uo(~X) z!Kp$G(qn_XXT^EUBQ+T=D3K!P2=yl^b{+~@^S4-%%n3TOujBo5I?u_HtYAfScuOTa zjVvHL*DcU>hM;T`$AVL^C;o}4a5SBopmIppXUxe?%yNxSPj}U4WEgWC<|bxjrWn&4 Z+`Wdn57nx}?EzkXpppDNiy(hk{|!(~IQIYm literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1e1b4765c9bc16d822b1bac7fb9e6d103c2763c9 GIT binary patch literal 4158 zcmc&&cT|+u5}*5(WfvA{`{6F4AZ4jaQ4kT8rB^{vDMsnNtyB{g6oZ1INE47CARwIx zLR8d<2-q7o&?F}Il88o6j2h*A-|{f}V*YsNyz??=_uf1A&dhIS?mg`9;3@Jlp$N*F zvU8)@6et3yuQq6?w1{^Z=^I`!;X>d@*WeNT5o*FZrT)|lKlw%4$~03z@hH9_#iisa z7R6FE2VM5KEuTGZ1pv}o7(H?kh8IZ*2azg82+V*bFW_^6gW^L2B7>45DNLEG+>4Y? 
zAz`GD{db`RhIEcVVL%V^^aMcbfs~UQodc7zF@b=C0Ov>%K+3W~PL^#YPg^mfFg7u+ z%9P5VS@YTBCww;gw~aDg$zONrLm6F3B^Zup4`9Lv#QRcoWkVA}QsTnmlLJD7l7nQU z=tWd$KuAJtN?g38xLi#6cO<-w3!h@D{{`kJOWr?`>YeCDJs~Ake<$^*!!)gP#eT54zwraz}emw}$=b_#839xxN5421AA>k)&03#)MHUWd&U!uT& z?_0S1t`S%fmQe32fL%}JVbu;1%1Gp5(E>|2y2lt!eB}rlans9d%IRS;NI>dC2oOhnP*a;64-$pRH5_PO%bPThIzM6VhOc zQ3pEPs){rZHvw;x6KEbj3*xSs5dTCO4o&zBio#W4^B>=UeQ6j3wrIe)Wh|&1YDc%E z5pPTU?zMKPF zJ$NAa+7YrI-A2(V3b2RJg19UW6d01wzjrSfu5kd**DUDD)dOKJ3tCEp;N(p?IDUzQ zOFNB`OJg-?C7p&hs~kY{fe~B|E`Xcw;$da750qxrz-a0KlsGm(rOQ0HwF5(MGVX&x z_GHj#HHD|oGGX^M9ngwbL@%m-Ln@(nQL-)qal9rt72n#6CO#eym$`!*L{ahRX;|i6UErhgiv_ZU5j&c=aEfi7on}^HPQe#%(BlIH9E_p zg}sO1urLuFTRVuY>&9X0%B9$#w>|t~{v6}CIO3H@xnRE62X)r=!_Dks7|l(@ry6Bo zRu;3#sDeqXM*|(mBpV&Wex0a}Y-gtlI;hi)&!~Tke|F|RKJUm>Db1`@tlj9ERQfV=vPbF(8Efo^ zpMWH=EDM2*hfbv8+Rwpo{~?ybHH4o3h64oEf${vgxXR~T@cttQ-FH2Pk`*~9^oSLC zAaOB9>Mp}{=5>%E$NkBIjwML%vI^OFauhXBJc%1NZ^I9|I-*rWj&N$&h?Rt&A+24s ziI(#OZv4DRWw?4SCr7*i_2b=A`W{cvn#b3nqR0WCto9CA@gq{}GzOqGCK20{HwSfZ zmzTA=*hMNVo(}zkX;{L-0OanRA}ze}J=~d+sw4L9#F96*N`-G%BkjW-v7po-qU}qH zdtP&{lwFuJafMSSC4M!Gt{zy6sh0Hc%r9?&6@z=!U9@K)e%S#2?V>u|Z_@*`XfnZ` z_tuZxcIXlMqf|`9DNTV$+jb*sqdvn;E7Byh0$6{DFNfj(u zS`$B;l?+!an$X3lU@$xHMqIzAkKHuyLze`Z*r`l`%qA5pQo1TzO1)s&gx)*hNQT#q zGG(LSQuQr-J4;PAi>HE$lMC_Hwcn#tNhK_UE-$bSyv?fScFNS=Ed#^GSu#CVsxU)( z33~YKS1j(e45ZDIRbE|0q2ZU0vC-ulIPO1i@n^@yX!4KcNX^>@8(tk@#Tf zXwAkvHS0{az@|1kU(=~T*RQuWr%=apms6Qd-KJs#-;T)P-nz{tMuENA0^8lWWiuiN z8~l##-cmkGeCtG+ZT;4YIV*pCFnp{&uX6rcjH6=Lu&vrQTT#!yuOYw2ahrzo4!g#J zI@eOusOx==h4mg)HaRNxO+}5~4PK4@{Y}NqK6`>s?yzs(-nw*u;=}9x%_Uz4^ki}- zIkc3vhny(TTRPBEwm0I!F6VNG)*W3jLmg2!23pJaC5U@YzDM`#(fLY(t~jgh6X-cK zdgG96x?%pL#pk{|FotZI4#w3tBygS*I5PcdZ5@NwO{2Az546!}zc6TLEA}Of{McVO zD{uMSRWG03y|gbcV1C9Zuy_{R0!8bk6oiZRZws>BC^yxi$Ra=3e)B|=@XP!2Lmcze zL>qXs3qqX>br$Ykbf6&2wZtfBplEhsxO@4m756S5D2(u^oWG7mSr$butZ~d2F77Uh 
z@~-!&b||(ij$YjC(;j}MyEw+@YX<#g@$$KQB{%yu|&QSl6k3riwmqmq)P!eYZ1CPdSo9%JG$Q3(l&ooT3uYMq-|H~Wx$6k_hi@q{f$X2e^;Q(wrR|0&kS^Ek;si1Z2TBiaxOAv^q64N1t!5hOHx7|X28bWdh7Ms$!NtW@VZA6kzjyXw+9NU+a?ic5fLgPs4Wc=y)v@3HpW0t^I z7Y8M-NC``x79ti;3rb8(NK;P@OH7JNh*vi;pKdbUSm34#@csjb2mnv|KL`H;bMo-^ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e9af93b02ee386e9ac98f1c9aea4130450e7a99 GIT binary patch literal 3781 zcmc&%d03Ry8h_9EzHjE6VMd1K8)t+RR0c$lO%|1uZ`c(LL1l4)VG#%rVnAGK02Km3 z7R}W3qTrgAhT#&JY@(S)38myU^L8^tE4`0(UBc{qLu6j-kNZ6LxgD5u&U?=L{@&%B z;rs~aBiuNebLA$6aUu@P0aOdToonMS#q#;GaF27~Dhl0~HoxCs)t=7*XUVCYI0g9; zaUyFkaA1e_RP2y807%DT(kl?AV4S8MS#z}76GRHNN|K;kk`$M!)2DJ+sZbt9$|j_V z*06sDrFJ5*0f&Kf3Nio?xe#&DVMG*y^`j|}AfP2;8W=eXBJwdIADNu>l@oQLnGFsC zhLRicBzzS+w4-9FUv-ourDs>krip_wB}`GJrypu=(!b^8AWzCl%*#w(k{g$#%hk!# z$RsT(E-@=3FLQ}8%Y#_C7Hck2nrqnlzwr5ox%ZlJUzvq*M@dyk_TREUq|-rFvmoc- z9Nd`NKv%r^Gu%5k5uJ42Y{HJ70efus(D5U-!`Ov(bokC6P*dnR==G_g_s4acIJ$IE z>q5tY+r{gUNuR^-Z#<43*q_JlSvSSYE_6X>(G~Pz-7VxkJR8?$CBcKHU6jOODzksf z1vuFH5RHZ~Jo%I>dNWjxX1AY!_cb}Fqp%wnHBnHxY(DOe4218z?qbzj!PNFP8F;-L zg&KEt!k48R;c-O{W$RLaeSIb|Y3r>-H%}i%ZfPxaZLvrcX_zM6oNXn|7I2ZjzB>7cK{<<0nLBmk_%9Z*tjmyTNtZ^VEpt zGKd@~Lw1`wp>IY9$~Bjuq!wT1vWqLKiuD4?MhjpZ7GnFUhnd7PvCQ-nDp-Gb9$N3j zz_?FGFmIlHj1CMxOF12=rdIX_!+SY%ar2B|^t!&38ouyzYMJSmC^PK~{L=R&_}1bA z`pC?1#P<2(Y}5D9@XRb+cWf~|z;!Xl-kr(l8Zs312i~@vksgiqF1>F;;SIus(IwL8$5HC4|zLURZ)hV8CFqOUW_!=^E*n-?FYedqh zTWsc4R`T-2W_Ew1Ejn?|70=&hBFULBPSj}nHvaonJN!z3rCGMgJNWfQ{cKdVmdQ}n zgPqed@ls|fG(3JC&tKUlE-__9jXyr1^Bb0dwuHA^@)@+5! 
z&qu+YN+)_$xGn2aQzbR&XvQzb{tI23?#+(6*td_wril)vUxG4TH z3Yd89Wcao*OMIdFGidfci{G!gK)o~7#?)@zCfa@WWvbpjgz6eNu9$qJQn6j>$8-cG ziL5+J(Z#^IOxD&A^_>1qR7xtKYvu97{LL#d-jIo%uV4tu4UtXp2$p=^R4=#b5Rb*qX^ zRPL$0t#zwkHMP-}s`&afWfl$*`=Zt)N%$V+LenFSpiB zTEQ6YqLUvhICl8zu|@?Q@w(wkz%wRoMitpI8=QH)E4)a(81{r2Pxq z{ff;!*UlLixi@@w#F4dg$44EC?_BR$KF@jnvFz*JN6MpI;#vwB3$Kc3x5UIV)pn6b zD`GrSKHd?y!E0TtXZodgl6#J>o9~_VWoxm8cjbaf`o2$gM;@zOI3@q~wayLRRg0cq zdGF5ko?}&U(+Y_D2?e0)nMT|qOS1x%^f>WYbO;q#r^geAtnyQ{0@}j|6~GzJ;4~~~ zLrHb)ku}oci+i3kfQ~UZq#y6n8If1|2-Q*8${*2YZs?JsXIhyB&po1N&q0r5MTZM6EW^4awwCSQ4OQKyFaEe$#WzWoOXOdg`H`(sYlu=I|ho z2zmO=m1L$Z5jaJ%p3X?q=Zcdvl7;t}Jidj)l$B$Y53Mv;RGMBRVfp=C{lgbFoC^r! zq{ipfflbE1An28|v<@+*evpWS8xD~0>-a3C=rw)^2#aEMzv>8-;@MRK@`O@^)SGFl zHGf7aOn)O78OcjhatEo?08;*dGy`Kb+r|=7@htwIc>SZ4)fJwg66q6DlQVT3O&FFP z{vA=XvkrU9!jF&*@WiU8>&}S=m7nH3g5Hx8;V>r(^A)UVmGN{0GKYzWp;NLOLGWz^ ze52$4Ocs<1wY56zf6OBPA_Dh+xx;_iO;X-;P%1R6^02KXfZAFm<0B$oh?+cju~a)~ z7omCbTAt@;laJ75@O(b6P2q!hK85#BF|KKe!c1Z2&-01c|J_&v@wEc+~-{dIX%IBmRmy9yu75KN$D}s5vjs{W;ls& z`HQ48;`tjvND!nA;u)vH-c$Da*eK5>Nx><+POvw+C+UzDGbBgMoh!)Gg>Eo)W9WnN zq?0vmt}eeUKV1Co*JRJ`y7HPkWM<}H>J1KZxuCT#}h6(ycrTiiP zvqO#bCdDRgsPsG^AWVZZgc@YW22FY~!F?I2$wWbk6ggk0KM8U0P{^9rXh|X`*^&Ji zUYkF7PR3vbE26_2D>-OnOmcADn0icRP#*DP%u}!@_{3B=nn6ua*<|Z-Udl_(bxzFA ocGl(OWaZl|P0rD$Wi7FB^P1o`!Brh*4e;A9oS*^1$v?2a0DmGVAOHXW literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d112e179eb9bbb8372b524bcc40eea2e5cae3f76 GIT binary patch literal 3997 zcmc&%cUV*B8h^iYPLh+bA|z=7Q79t}QBcIK%?UC^K^zRV$dXMEP`tIa2%@NniUSk} zLnQ2NTSv^_2fT!~I8vS#hZwu`H>4z^DdPw!R9u|U7AQamQNg z+o!bh&k8UMzXP&%;&la9I-$yt<*{LLNr9n3NkKZ1^dd4eFhm)%JT^{UQ$G@Yk7NzF z>=q{dFCgDpX_H8JTI4Cbz)}~Rul3{r%VqsCh;mGW${q@OjxPn>f4I|CTd#US&?NC>EZ0bjo?1$OvkDDjg(;X^*G+C!mF 
z6ST1?$z*7*v4>N8UBNVV6byL1MCYqUL$*|mtdE%k@xTs9$%1dNC1jZQGDCkhs=GzRWY1GqA~5{h(_VT4^B>a3I^v*TqT*ysjk$2%duaU8_` zB!XkQi(to6DQtRv7@Tv%U`d53oLkI>13k5lCm+7k6FP!qXt<0stUe?Y0&K|#QN4wfh;cpNUB{Sz5h0fT0RVF2y=){=R&qE z3)<@I!FII^c>m6Z#!PFFWwN0nHwZdz>cPoNEV#7a9(j}&fO+B>c(%#~%q4vseKZGqJ+*O2N!%KUA{fIyORP%Gn=jg`S_2;FQD1 zM456imi((Ly7cVEutG~Wd|lTPZZ-N*=Wz@R&OQH}zvtC0q*eV3Ym8tUo9Fw0%(xVR zXLts1%I3b0j-9c`CO2mB6GmIJ8}ur%Yd0*g10p%=^cofH`tUyKyH&x85f_5?2kk_w zODH>Z#(UVpRn43mu2#s%F#~lw&u5*>7^lO(YXixXeAvtST#(=XohWc^9%S_z!=Wr& zqG6AVp53-BJgwHFSX96Oy5%u}H2>xHKx{kqD-g`NmzIt3W#S$Fu{KztJ*oIt^ z+_a61w%}JP&tYzRQ@|#@oGq$VlJSo};d=K^(i*NR23DFj=;8$4QQpn@#TQsh6qDhH z1|_F!>t#4Pp#wXx`Z?6{vfYfRgrGCBdP*)jzk*~QnH+SSM!Hh(maf+4`}!H9zUOw;g<&Cep6ICc3$&?a1G#aY3R}?{&A!Pp#DaNdcxQSNbnh!eU6H}y zc;1t^`n?TybNmT(Ns@}4PL*hFG@Q!fP+P9$6^=)-rhjdjum zhG=KfcKqXm-=NcpIqZ>*J|JImn_ZyYpmp%O-C$cfL91!16dZU9(EWi&SnTgwkUU*S z^zjZM>izWr_UglR+*voY@qv@^X!xx>WIR`a^)8kX9|>l$iv-Rn+~*V4vq`LBK?)9c z$%(7v=%cxU8?R2Xxi7A8WA2Esn@QQM{3+{M&sMj>S9e{}{=8hh^2WuuS68_Jd|P>r zPXz4Z7v9<`A)iZDyc*W%qar;=wm|>MemK)oOX%(&V7_(gNZ~=!R+z&1>uZu=u2xJi zA854fCQJnMgulFi&~z3&Q8A|}9D=(HFj6efPg#SL`c{5T`D@ZRd{ehvc|mHrmUwJL zPg6nKdOZ_GhL|c`yTQQ9N9EUCn7-*f``}Z#RMEQ4MiUe6^)wf)-(oCJB_)c38QaX< zvaS4E4sO_P>0RiyTT#4m=SaW0h~AdsO*wW;S~4U~hcb6NL|jz)w;tM@HzB_7)NZGe ztbJ2fKDyW2TC(N92PqiWaE5AYfm4RQb-)SLwj$T9rtW)Ylx7#t%pDtf^+f6R60dy4 z21Dnv9i?+sKBWO|Wjo7#Yl1uWIG69LT=;py{i|)|In{wpsazwMirm_eQ`y!F+bee0 zhhHdk&vU8V(-_@T7kRC{GVe3xt(FZ&u2p-#NW6Ekbm7UWeMgfY_I2dBR_{N)>gl8V z*G^U+`07)@c~ZBU{N^;$z-G~@nu6B#d^3-|ZncGNo5T)L*H6_Jo!nyLv{C9_cd%o- z)!ed09d*TLa_mD+?{%*~)R{LivHyBUeaZO)@-*IXj|Np&k=u5g#itueFO@LpoW{Lh zsD_|hHV&dIZ7!G5=;)J&K9|#cHRw(ie6yvOk2IQI`4heTO7$D)sf*u6mmDtfstrlw zS)bl`YtUtxGN};LkR-RE6oVTZ8OIQ&utXvzGBJr078AzM51MPFa75RQMLou{ z{Nb_&H5+^LnZ1)x$c0m1Qz!DMHn0#8h*&gv`k8r`wA|B7?;uaKwOYjFpbbNw-ae&` zhLEQw>%SvUdatQeOjulal7RDkRr#U1zV&b7K7qT_RL)jf;_t#|zF(KP?D&tLv)fdB%RE$O#ydVoh zI=%@LUk&`9#e#^GNF|>CV|M%xQPuy;^OgNmFG!Xo&b`$A^^K3U}@084p&dflErYKKlQ1~-jwVtwP_R+y(XO9W?U9ddVWm2@ipHBp% 
zkM*YW`&Se_sN*@QgV0IPqKT(oGP)uDi2=TjaiOl^R1o8@hNsJ+#eZ<`Gk-p__h1H1 z>FU%q`9#xailiW4N2R~tY-NC-x^x zHcB4EpX09XH{EPn23t>2Gnl1o4AX=70GCNo!DCm_JsHOs(M?Wf`cJ2*xfJ?Lq4uOR zr>mp&{*)qFb5H7I886y~Qg>40q%Kl(t}fj_)@3=3qpp+jr{mME%++WtL2MEql(1}h tSkkDF`1nyl2?@$%lNDhJiIK`UlQH8*j~Q(*@st9*{)1x#fH(ca{0|mk!UF&R literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f3f7d2a7d0cc4ecb1608dee4e76b94cde19d2b83 GIT binary patch literal 4018 zcmc&%d011&7N41WZ*r4>M9obTKtL%0abXkOP)XPk6~PLq6vHAQTS{28?obpJ72E(p zR*_9m5LyM5T?My-&jq#0vseLN>-wI1efK8C>a+dhec$_D&zH>1nK|eD&N(yRmtQ=j zo)#2N*-;Me6iI;*02r)Z78|!}x1Rp2qp7})1!Ygd(-hlXZ{B>jHe|0&-nH;)HWZ)) zlxRG~rT7$!V(Hod6LQE#L=IU20JxtcW`qdq1Cnm2OW|T`z~YNVoB(-LkbkIL8A>4` zSJ(oDolp`gA^$9tcs!jWpb(%3K0N_I>mHPyJDmfQvk8s?4g{JUHV(8jNn1W`!N_p+ z#JCA5?cXyNkwYgU68p_YNLTUcE`7)|QB{F3EX4^(dhz#Vm}mw?2gXGPN5%RF$z$c3 zVe}#_$UiVTA}%sYRoELSd;}%uG07vO`(I%GVYy90s#EAreNU(=&gVVLBb;{31u<0% zP;F>6o?7-RcwRLLo|=%!Bgf1J2S*>omulyM&;5*WkD_O=-t`=~J|!J5_wVMJnq9=U zxsC-ESMPyH{2V&D=NNoxd>&aw53*NuT?FkLZo$ri2hj2}1uW~M0r*h zsA_rz#{hRUt<4;kxoX1sEhoT9NepaV-;FlZV_^G=#i-lM5&UfP6p0QwV|mRQz~-no ztUb^U?xgGi?^0v1(PrzA?UboR*iIeR;OQD@5!Q(3CbL*xYiDsY6*}Aqz1u`i%R?ZT z0ExU;Zb)Mi!s4g~;B5E;bXfLr`v)cy0-y<|tZoD^%3Xn`uOo=in}8mESr0aMHey%2 zW`dOFW$0-G0WZ%v18i-L@H+jcpjHr%Dw7?6p2KW#;?gKgu>CbON;(R(9AAUK`&R+X z)*Oqb>;M^;f*)VHoAoMP34Ur%1p7``6WK+JQEsgTY%s3JtV-TuOU~cNW;Tvy8z#r2 zYO`zXwSG246Z;n;!rT`-3zT45Rv=h6;6jX8zY>f;c7iqh8iY^(ML#xqF)*Js1Jhoq z0enYhao`E&{KAS#!R&A!SQ38==}E@0{l#B$-&_=9QmfT?d9)u&c$J1o)@o_&d)Etxn`A)?K{|`;{eX$OWZ6M3NM#Fo} zd}479j}tR-EUQ+l65YIQgz|)Tgo1YjrMI>5z0+hwgs2c0jbFizC*nc%yS-@fs%G{^ zErO+EodOf8SHg=alQjAFO+ms`Pu5p=ouE(d9_ar~HrU>y50pwFSKnhaX_mf&%WG{w z%NIR{y>64q&wscNL%HL*=WA@(3)436BYDTL+I!>Ri&K%f_|h%lF(Vl)7|6iR{a6pX z}cc7E$ 
zU6|>q7Czs~fwY`|1MBQDV=1xogq@oae&`a8nLpjF-BU1wvstkbwqx~d(|NzZH4kop z{kxs8QTo3ER@6iGhB2o>Wq1ravST`|-^JyI3oRK_Uj#qs{XYPFGDZ z3aoL54P)QJtu33$EelFfd}}!C4xx<#xQ5t;v<53jo%a-0>SfnW8&3ZSb-< z5p^btd0Vt?32uH0+hC{Pu$I1MkSC~ziJD>Ha=~3}FH2uDS)dK~#O}gYANT=wDl=H) z>O6tn(jQp`8nwIw_p-qFvdO$ATU}tqT?hvrKS7bNcpzb>rf~Ic3idpIfZlz%ku&GE z2KM-j0*>mD>X*R+Dh`_3>oJ5#Ho&JUZ{RUrW0tz7G80#@-`4~yv?WDse=s%6Dzv-QxAow*?8%C|XS3`}w(p<5;z@r`YsrqhS?du; z+o3eAz&>Tf*hOun=|#?IV_f$+lH;)*R^AW#U!7q@$+zT!wjIXO8R9 zgBP-=C~aZ%&aTTjUiuNbaorGlwCR;{8Xc~E zLrwXE77HpnlZV2Of@)>VNncw3RdC~Vaix`a6~XeLuk&?OA7l&3;XpY^`j&gRn+Xm z=k|xh!j7_i&;7_?1PF)+K!R-t9y0HfmVWibJm-&ipNO$miHIDsVOZ3syR^}7To9?a z;NaLlBT-uKDOE&pR7k9v9}Ys{Ae3A;k*qhNY1F6T{{-}psuibN0})FZ7#bWYr*N7^ z8lex^sFAMwp^?EXKd%6}1M{l7e4$zS!7gsH|tu7nh(8VLXp>xwj_o}Tl(r>P&7 zC?7Xt#9AtoN~QDZFC(+1(gdk2MCv4!hDhy0RC_YoU=Fjgmr7lknbwD9T2n2ROavy7 zbl%tF%*>|H3&*)GjBcO6jE>Aehn7m+nL*~usH$_RIirUMSkAQw@Lm`f*gEt#{A{FRHmt)qgRL!@%XUjLyPZ_?)k+RjPAw^ z>e5xItMdt`_hhkhZ|i7ZpSjVCd{m{=c=Ofuxh?oOAG%qze2@>*N|_?ad0{MbAa!+T z>{UhyL-_MuRsE)$P0LX0rBVlG=^VlIK(WYa>hb`~6?9J~F-CNg6PW(fDXK4p-jk_3 z>CEZs=yPAGEJ1xws$>~2+D5ACq}oYUqFEz=f88vc-tXauN0lK?s72p#_5j#fpN80*V`F zuvS0?>#B+y(dVdEtJYezxZ&x$H;C3}d(M03yz}~<+2Q3U1ysi7;p<8Z=bKfX zyK(Fd4YYvnZbR$ShBSxfn7e|6>e30Rt|$PYE5VpiAK|!A<1?15=yy)6`)9QasVLLY1d44yAf8RZ8&7>=utbr&xuKjM+-MJuB+ z6fr5PsnK$kTrYuHBt%b*QYL38QnU!yV1issoJ)w8$ z1fJ<*u;_6Dgzx<)TzaX~4o*KfeRZ^6b#>WyxL!oh(1Dsgn4Sf{0@NCRG zbasawEHf9Pe)U#BJhBH$=@0I?1YFo*1-!l%U}EYH=MxQKNw_N{E%1cVt%)!xs~7M} zOyEjj4XoGCgr4?$(3u)@WZAG4Oc(foWy2Xrt#yW!-vm&vKM9t{n#015eK4vt1|qii zf#%5^Sl7B6-O)jyIjsxVj7*@Zu{RLqBf&w_2Km005V)ffL|Yx90+T?wnFi*(-vRev zIJ9`v*qyJ3LD3jf5bgAa{9k@Ti5W()i?9MkJ`a}JkZ^SGUa*wx^WRDxCdDQKVN1(px&;Zo!>xb-dtvQ!~Zn!g_2Pd^4Ly*I-; zzY%a}6^7b#?!y3e59m|l1W%vMg^gGGf>nw!dS3Aob&tM>R1yTKDVE^1BoM}R6e3Oh zZer1vckpV5GdgHns!R3phmE~A5)s|kfz@PdLbm=nY8_lm-1k3%`r9|-i+%fnW6M2I z5T~$)lKn_!%tO(K+{mo+&2TjDI{KmV5pwF5img>f!^S@u^??9v z==Tn2i?1FU+f)w?;xu$5_X3u80EcV#4(PFK8z$WDjjwCugJ#1~xTP+E_lwf--u5|| 
z+u$K&!ZI_?FUR&E$Am+~+I$X&luX9@j@Uy+9SodLgWP>xc23F+78=18>XM zaMJ0n&Vz@8NK?>*!Ltv+>#e?^7d8r#d)i`8rW}C69}eM{$Bh7WV=UHIN1}_PkHfM( z!Kk)Z8)!_YV>S6Bq31|{s6X2Sx1P5b3`VuX?WsEecXPm%v^!84YT~G1Q> zxzO)YcXIXmiP&0=BdY6r0N--*J(_g-K0e}5Z?0wjbZlFJ1KG4ad9B;6qW?MC`k?37DxZBJ2FU(Ta36 z-fi+#Jj>t~QY75Kru|lcJ&MgCYDdcub>9t3HTVv7Qz$WzS3~8uu|x;mOn$p}G%4Sm zY*@9e#?(J46m6P*9qTFX!(E#oK^@H^Tsm+zu~j)4%Y5aHF1Ft^dNTJoui$(HZzp=J zx7_R~*Du?Ezv}&u$VgI#98HTkIw6lJ#Z8JgyQPlWI<7aWKjnZ;T%*fNv+Kvv7}Q|@ zxM_{e8EI;is=EtYG36l@vebi27FL0^O%``LIURPtUx7`W)yOR{AUT>BZ-~s@Gf=bI zSa`uMa8+(b?-D8=enA`!12Eema28%U?n0!R7ej{(7kK zLMV2@Aj-smSA|vHbHa39oPn4(&FJ-=Bm%3RCAvA{46#&ZN$RdN$0IwUF;oyl#XaZ` zkM1|>95Fr)hh%HGYI$VO&BOY_z`Ku0lYzyklIw(-nJvM&1y8_dO)m7$-^K}0O`=lU zvU##!hUk9ZCr6*0#|M3!z)x;#;l2m~GQwjh{HjrM&o8|Ub%Rb~4aMj2T_Y?EtQW5& zoW@?mtNQxj%^mv;Tp^(cX`K05JEl*s(zT&0 z_@BO&3APBFIa3gIPF1e?Q|#g;zHBwMZU)UUW zWaHx1gHrFDDD&J@R6Z>0_XlCS=8x(1RSuBG&T3h2&-sSdK^pZ~OYbEXqn0@rE-Um~ zVYlV*a^H*Mz;d@a80~jS5>y>f?or}*xqooYq=vb)e~V3cZSsw#68|f9Q)DyWyd`Z) zcgeqBAR1O+Q?@7O^r|7VIJT=a3758p)fL#T-lzD_v#;*Vu9t0-fon70WHUEq19fX_ zm{B%AceUpx)FEqn$beWfxD!N?7DVy59v!fn?w~fNl^b;sKQ{+v13y3Tik)2tNUA?o zChB-vFuHu7;!$gD`YZh@ZZp~*T;7{$Fm=S7_SY}kUKvEV=ixkobC#jpBVSkKs(Ni0 zIkM2CU$7?2D9UTG*^ueaUmHdH6nk6xu5g}V9OG9qAZ(xNjd5&1*}#Mr%?y*c!17@; zUOs8#q~IDheUBXBtzahYv=>G>!e3`H z8F%WU9P_71Ov1*3TB97ov>oM$vL(m&seA>or}xC3ZL!M+P>|V@(X+-Yv~ef54`sTu zFR5MhPn2zSJ?+ISL4^DMX*|7bt2Czk!LJ5PF3jkW4H{9l`R^%AxySEhFhPjsdR~W0 zA!}f296d{=462@8=K8q(V~v-UEr-NQ2bj}XC!6&7oPF^W2`MbIj7cYw6Vg@OnB*9i znK3l1f-l(BS5Ran-qA}um&;PkKYl{~lxxXouK-#{OSB|Z9Un+Uv|=Qe0y*MQZc!g7 zp(Tb4jvSHcfm#2);CwA#mWh6dl_gC!F;3L;ej!B7Abu#DMJ^!Zq({ZaDC9K3@N8@B4}9I)JYc(tJHxoa1rvVWc$(8`{5kz) z?LJ)zjnSL{H`bdFYOjZ-^ca(&>zb^)Gx06h_}aw(4i*IZB6Cr||N5VazlqHMU!LN> z?NJhJw-xA%DZ$fzrT`*yAzvC4G$v$d=aVI@AnaLjk$Om_(y`2&6@RHTQ|b{X^^!{C zq@HowGY`gKG`sSYN`2Xx*>}y%PP5QjcvbxXw z)5AktQlh=%q;l3@o1THgNLbfCXuefx)Wf#YBQ@GPSj8%&z5%Sg)+n4`II#56gR?LSju=cO=b9$HVPat0lv50iRicHWb=Sk{ZNk!m~H>7)hO 
zd9H;Xrtr#O@@R3g{!DzvmA#ryOAuP5%G0K0#HeheQd4c^X=%z#i|H|G=?Tgd3rANw VM>_{mfH}a&7c>w6GUgYKe*tj26RQ9K literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py new file mode 100644 index 000000000..294c86f25 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py @@ -0,0 +1,46 @@ +# (C) 
Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = SparkTransformLauncher(ClusterAnalysisSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py new file mode 100644 index 000000000..919857e23 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py @@ -0,0 +1,58 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = SparkTransformLauncher(DataCleaningSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py new file mode 100644 index 000000000..6d93dc7a9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +class TestSparkSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. 
+ The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = SparkTransformLauncher(SignatureCalculationSparkTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures From 77d85fde33e2905a19a5195adf64fecc5d88be9b Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 16:13:42 -0400 Subject: [PATCH 44/91] Adjust to file naming changes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 27d101bb8..1265e8ee3 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -22,10 +22,7 @@ COPY --chown=ray:users images/ images/ RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/fdedup_transform_ray.py . - -# copy some of the samples in -COPY src/fdedup_local_ray.py local/ +COPY ./src/fuzzy_dedup_ray.py . 
# copy test COPY test/ test/ From 310d8139ca2bd52afd2e987fc52c6b530d4c2888 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 18:03:34 -0400 Subject: [PATCH 45/91] Create python Dockerfile Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 43 +++++++++++++++++++ .../universal/fdedup/python/requirements.txt | 10 +++++ 2 files changed, 53 insertions(+) create mode 100644 transforms/universal/fdedup/python/Dockerfile create mode 100644 transforms/universal/fdedup/python/requirements.txt diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile new file mode 100644 index 000000000..a0a557060 --- /dev/null +++ b/transforms/universal/fdedup/python/Dockerfile @@ -0,0 +1,43 @@ +FROM docker.io/python:3.10.14-slim-bullseye + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest + +# Create a user and use it to run the transform +RUN useradd -ms /bin/bash dpk +USER dpk +WORKDIR /home/dpk + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . + +COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt + +RUN pip install --no-cache-dir -e . + +# copy source data +COPY src/ src/ + +# copy source data +COPY ./src/signature_calc_transform_python.py fdedup_transform_python.py +COPY ./src/signature_calc_local_python.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/dpk + +# Put these at the end since they seem to upset the docker cache. 
+ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..85806f809 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +pyarrow==16.1.0 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars>=1.6.0 +disjoint-set>=0.8.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 +scipy>=1.12.0, <2.0.0 From 7d97cef7c741a703b27b2d5f17467b998bc2794b Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sat, 19 Oct 2024 14:51:24 -0400 Subject: [PATCH 46/91] Ray bug fixes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 10 +++++++--- .../fdedup/ray/src/cluster_analysis_local_ray.py | 2 +- .../fdedup/ray/src/signature_calc_transform_ray.py | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 1265e8ee3..ec2c56f28 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -2,6 +2,8 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 FROM ${BASE_IMAGE} +USER ray + RUN pip install --upgrade --no-cache-dir pip # install pytest @@ -13,16 +15,18 @@ COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . +COPY --chown=ray:users python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . 
# Install ray project source COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md -COPY --chown=ray:users images/ images/ RUN pip install --no-cache-dir -e . -# copy the main() entry point to the image -COPY ./src/fuzzy_dedup_ray.py . +# copy source files needed by test-image +COPY ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py index 25b96788d..c078746ce 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -19,7 +19,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py index bc3c0d991..678d953f2 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py @@ -14,6 +14,7 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) +from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher from signature_calc_transform import SignatureCalculationTransformConfiguration From 87902ac1f8aa4eff51e127df76ea44fd86a632e0 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sat, 19 Oct 2024 17:03:09 -0400 
Subject: [PATCH 47/91] Fix spark image to support testing Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile index 523b94c06..a36a7cef7 100644 --- a/transforms/universal/fdedup/spark/Dockerfile +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -36,7 +36,8 @@ RUN pip install --no-cache-dir -e . COPY ./src/signature_calc_spark.py . # copy some of the samples in -# COPY src/filter_local_spark.py local/ +COPY src/signature_calc_transform_spark.py fdedup_transform_spark.py +COPY src/signature_calc_spark.py local/fdedup_local_spark.py # copy test COPY test/ test/ @@ -46,6 +47,7 @@ USER spark # Set environment ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} +ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH} # Put these at the end since they seem to upset the docker cache. 
ARG BUILD_DATE From c84792452619fc57ce2ebeee8f872ef2b67deb82 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 07:57:22 -0400 Subject: [PATCH 48/91] Removed file copy utils Signed-off-by: Constantin M Adam --- .../fdedup/python/src/file_copy_util.py | 158 ----------- .../fdedup/spark/src/file_copy_util_spark.py | 261 ------------------ 2 files changed, 419 deletions(-) delete mode 100644 transforms/universal/fdedup/python/src/file_copy_util.py delete mode 100644 transforms/universal/fdedup/spark/src/file_copy_util_spark.py diff --git a/transforms/universal/fdedup/python/src/file_copy_util.py b/transforms/universal/fdedup/python/src/file_copy_util.py deleted file mode 100644 index 87867e532..000000000 --- a/transforms/universal/fdedup/python/src/file_copy_util.py +++ /dev/null @@ -1,158 +0,0 @@ -import argparse -import io -import os -import re - -import polars as pl -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.utils import ParamsUtils, get_logger - - -""" -This class reads all the parquet files inside an `input_folder` of the type -`.../bands/band=b/segment=s`, concatenates those files, and writes them into a -file called `.../consolidated_bands/band_b_segment_s.parquet` -""" - - -class FileCopyUtil: - def __init__( - self, - data_access_factory: DataAccessFactoryBase, - config: dict, - stats: dict, - ): - self.data_access_factory = data_access_factory - self.root_folder = config.get("root_folder") - self.logger = get_logger(__name__, level="INFO") - - def copy_data(self, subfolder_name: str, data_type: str): - self.logger.info(f"copy_data(): subfolder_name = {subfolder_name}, data_type = {data_type}") - if self.data_access_factory.s3_config is not None: - _, root_folder = self.root_folder.split("://") - else: - root_folder = self.root_folder - self.logger.debug(f"copy_data(): root_folder = {root_folder}") - if data_type == "bands": - match = re.match(r"^band=(\d+)/segment=(\d+)$", 
subfolder_name) - if match: - band = int(match.group(1)) - segment = int(match.group(2)) - else: - raise ValueError(f"Wrong subfolder_name {subfolder_name}, should be band=b/segment=s") - input_folder = os.path.join( - root_folder, - "bands", - f"band={band}", - f"segment={segment}/", - ) - output_path = os.path.join( - root_folder, - "bands_consolidated", - f"band_{band}_segment_{segment}.parquet", - ) - elif data_type == "docs_to_remove": - input_folder = os.path.join( - root_folder, - f"{subfolder_name}/", - ) - output_path = os.path.join( - root_folder, - "docs_to_remove_consolidated", - f"docs_to_remove_consolidated.parquet", - ) - self.logger.debug(f"copy_data(): input_folder = {input_folder}, output_path = {output_path}") - - data_access = self.data_access_factory.create_data_access() - self.logger.debug(f"copy_data(): getting the data from the input_folder {input_folder}") - file_dict, status = data_access.get_folder_files( - input_folder, - extensions=[".parquet"], - return_data=True, - ) - self.logger.info(f"Found {len(file_dict)} files in input folder {input_folder}") - consolidated_df = pl.DataFrame() - for fname, contents in file_dict.items(): - df = pl.read_parquet(io.BytesIO(contents)) - # self.logger.info(f"{fname} has {len(df)} rows") - consolidated_df = consolidated_df.vstack(df) - if "docs_to_remove" in consolidated_df.columns: - consolidated_df = consolidated_df.select("docs_to_remove").unique() - output_table = consolidated_df.to_arrow() - self.logger.info( - f"Writing to {output_path} table with {output_table.num_rows} rows and {output_table.nbytes:,d} bytes" - ) - stats = { - "input_files": len(file_dict), - "input_bytes": sum(len(v) for v in file_dict.values()), - "input_rows": output_table.num_rows, - "output_files": 1, - "output_bytes": output_table.nbytes, - "output_rows": output_table.num_rows, - } - data_access.save_table(output_path, output_table) - return stats - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - 
parser.add_argument( - "--root_folder", - type=str, - default=os.getenv("HOME", os.path.join(os.sep)), - help="root folder", - ) - parser.add_argument( - "--subfolder_name", - type=str, - default=os.path.join("band=0", "segment=0"), - help="subfolder name", - ) - parser.add_argument( - "--data_type", - type=str, - default="docs_to_remove", - help="Processing either bands or docs_to_remove", - ) - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - args = parser.parse_args() - root_folder = args.root_folder - config = {"root_folder": args.root_folder} - input_folder = args.root_folder - output_folder = args.root_folder - data_type = args.data_type - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - if args.use_s3: - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - local_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - stats = {} - fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) - fcu.copy_data(args.subfolder_name, args.data_type) diff --git a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py deleted file mode 100644 index 
58a43a736..000000000 --- a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py +++ /dev/null @@ -1,261 +0,0 @@ -import argparse -import os -import socket -import time -import traceback -from datetime import datetime - -import polars as pl -import yaml -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.utils import ParamsUtils, get_logger -from file_copy_util import FileCopyUtil -from pyspark.sql import SparkSession - - -logger = get_logger(__name__) - - -class FileCopySpark: - def __init__(self, root_folder: str, num_bands: int, num_segments: int, use_s3: bool): - self.root_folder = root_folder - self.num_bands = num_bands - self.num_segments = num_segments - self.use_s3 = use_s3 - self.subdirs = [f"band={b}/segment={s}" for b in range(num_bands) for s in range(num_segments)] - - def _init_spark(self, app_name: str = "copy-app") -> SparkSession: - server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1")) - if server_port_https == -1: - # we are running locally - spark_config = {"spark.driver.host": "127.0.0.1"} - return SparkSession.builder.appName(app_name).config(map=spark_config).getOrCreate() - else: - # we are running in Kubernetes, use spark_profile.yml and - # environment variables for configuration - - server_port = os.environ["KUBERNETES_SERVICE_PORT"] - master_url = f"k8s://https://kubernetes.default:{server_port}" - - # Read Spark configuration profile - config_filepath = os.path.abspath( - os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml") - ) - with open(config_filepath, "r") as config_fp: - spark_config = yaml.safe_load(os.path.expandvars(config_fp.read())) - spark_config["spark.submit.deployMode"] = "client" - - # configure the executor pods from template - executor_pod_template_file = os.path.join( - os.getenv("SPARK_HOME"), - "work-dir", - "src", - "templates", - "spark-executor-pod-template.yml", - ) - 
spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file - spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always" - - # Pass the driver IP address to the workers for callback - myservice_url = socket.gethostbyname(socket.gethostname()) - spark_config["spark.driver.host"] = myservice_url - spark_config["spark.driver.bindAddress"] = "0.0.0.0" - - spark_config["spark.decommission.enabled"] = True - logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}") - app_name = spark_config.get("spark.app.name", "my-spark-app") - return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate() - - def create_data_access_factory(self, root_folder: str, use_s3: bool) -> DataAccessFactoryBase: - input_folder = root_folder - output_folder = root_folder - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - if use_s3: - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - local_config = { - "input_folder": root_folder, - "output_folder": os.path.join(root_folder, "bands_consolidated"), - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - - return data_access_factory - - def orchestrate( - self, runtime_config: dict, execution_config: dict, data_access_factory: 
DataAccessFactoryBase, data_type: str - ) -> int: - """ - orchestrator for transformer execution - :param execution_config: orchestrator configuration - :param data_access_factory: data access factory - :param runtime_config: transformer runtime configuration - :return: 0 - success or 1 - failure - """ - start_time = time.time() - start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - logger.info(f"orchestrator started at {start_ts}") - data_access = data_access_factory.create_data_access() - # initialize Spark - spark_session = self._init_spark() - sc = spark_session.sparkContext - transform_config = sc.broadcast(runtime_config) - daf = sc.broadcast(data_access_factory) - data_type = data_type - print("data_type") - print(data_type) - - def process_partition(iterator): - """ - process partitions - :param iterator: iterator of records - :return: - """ - # local statistics dictionary - stats = {} - # create file processor - file_processor = FileCopyUtil( - data_access_factory=daf.value, - config=transform_config.value, - stats=stats, - ) - for f in iterator: - stats = file_processor.copy_data(subfolder_name=f[0], data_type=data_type) - # return partition's statistics - return list(stats.items()) - - num_partitions = 0 - try: - if data_type == "bands": - # Get files to process - files = [ - f"band={band}/segment={segment}" - for band in range(self.num_bands) - for segment in range(self.num_segments) - ] - elif data_type == "docs_to_remove": - files = ["docs_to_remove"] - print(data_type) - - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}") - # process data - logger.debug("Begin processing files") - source_rdd = sc.parallelize(files, execution_config.get("parallelization")) - num_partitions = source_rdd.getNumPartitions() - logger.info(f"Parallelizing execution. 
Using {num_partitions} partitions") - stats_rdd = source_rdd.zipWithIndex().mapPartitions(process_partition) - # build overall statistics - stats = dict(stats_rdd.reduceByKey(lambda a, b: a + b).collect()) - return_code = 0 - status = "success" - except Exception as e: - # process execution exception - logger.error(f"Exception during execution {e}: {traceback.print_exc()}") - return_code = 1 - status = "failure" - stats = {} - try: - # build and save metadata - logger.debug("Building job metadata") - input_params = runtime_config - # input_params = runtime_config.get_transform_metadata() | execution_config.get_input_params() - metadata = { - "job details": { - "start_time": start_ts, - "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "status": status, - }, - "job_input_params": input_params | data_access_factory.get_input_params(), - "execution_stats": { - "num partitions": num_partitions, - "execution time, min": (time.time() - start_time) / 60, - }, - "job_output_stats": stats, - } - logger.debug(f"Saving job metadata: {metadata}.") - - if data_access_factory.s3_config is not None: - _, root_folder = self.root_folder.split("://") - in_path = os.path.join(root_folder, "bands") - out_path = os.path.join(root_folder, "bands_consolidated") - data_access.input_folder = f"{in_path}{os.sep}" - data_access.output_folder = f"{out_path}{os.sep}" - else: - data_access.input_folder = os.path.join(self.root_folder, "bands") - data_access.output_folder = os.path.join(self.root_folder, "bands_consolidated") - data_access.save_job_metadata(metadata) - logger.debug("Saved job metadata.") - return return_code - except Exception as e: - logger.error(f"Exception during execution {e}: {traceback.print_exc()}") - return 1 - finally: - # stop spark context at the end. 
Required for running multiple tests - spark_session.stop() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--root_folder", - type=str, - default="/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_second/", - help="root folder", - ) - parser.add_argument( - "--num_bands", - type=int, - default=14, - help="number of bands", - ) - parser.add_argument( - "--num_segments", - type=int, - default=2, - help="number of segments", - ) - parser.add_argument( - "--data_type", - type=str, - default="docs_to_remove", - help="bands or doc2remove", - ) - parser.add_argument( - "--parallelization", - type=int, - default=-1, - help="spark parallelization", - ) - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - args = parser.parse_args() - fcs = FileCopySpark(args.root_folder, args.num_bands, args.num_segments, args.use_s3) - data_access_factory = fcs.create_data_access_factory(args.root_folder, args.use_s3) - app_config = {"root_folder": args.root_folder} - execution_config = {"parallelization": args.parallelization} if args.parallelization > 0 else {} - status = fcs.orchestrate(app_config, execution_config, data_access_factory, args.data_type) - print(f"Orchestrate concluded with status {status}") From ba9b07ca0a9a4821df0f38cf488db4fc8db7408e Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:00:13 -0400 Subject: [PATCH 49/91] Add fdedup to kfp black list until we get kfp integration Signed-off-by: Constantin M Adam --- scripts/check-workflows.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh index d1f934368..d431f6fbd 100755 --- a/scripts/check-workflows.sh +++ b/scripts/check-workflows.sh @@ -17,7 +17,7 @@ if [ ! 
-d transforms ]; then echo Please run this script from the top of the repository exit 1 fi -KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering" +KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering fdedup" while [ $# -ne 0 ]; do case $1 in -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0; From f1879487bc4106f1b776ed6529e8e706096c4bc9 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:05:15 -0400 Subject: [PATCH 50/91] Freeze polars version to 1.9.0 for now Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/pyproject.toml | 4 ++-- .../universal/fdedup/python/requirements.txt | 2 +- .../universal/fdedup/spark/requirements.txt | 20 +++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index fa815441c..f46c8e8c4 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -15,9 +15,9 @@ dependencies = [ "pyyaml>=6.0.2", "boto3>=1.34.69", "kubernetes>=30.1.0", - "polars>=1.6.0", + "polars==1.9.0", "disjoint-set>=0.8.0", - "scipy>=1.14.1", + "scipy>=1.14.1, <2.0.0", "numpy<1.29.0", "sentencepiece>=0.2.0", "mmh3>=4.1.0", diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt index 85806f809..576c028a8 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/python/requirements.txt @@ -2,7 +2,7 @@ pyarrow==16.1.0 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 -polars>=1.6.0 +polars==1.9.0 disjoint-set>=0.8.0 numpy<1.29.0 sentencepiece>=0.2.0 diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index 10f3e129b..576c028a8 100644 --- a/transforms/universal/fdedup/spark/requirements.txt 
+++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,10 +1,10 @@ -pyarrow -pyyaml -boto3 -kubernetes -polars -disjoint-set -scipy -numpy -sentencepiece -mmh3 +pyarrow==16.1.0 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 +scipy>=1.12.0, <2.0.0 From 84b9104a7791661d368345d3c5b8e8cd02a67a19 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:08:47 -0400 Subject: [PATCH 51/91] Fixed duplicate_list_location bug Signed-off-by: Constantin M Adam --- .../python/src/data_cleaning_transform_python.py | 15 +++++++++++---- .../fdedup/ray/src/data_cleaning_transform_ray.py | 8 +++++--- .../spark/src/data_cleaning_transform_spark.py | 15 +++++++++++---- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index e5c1e5025..9c60ecbba 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -13,7 +13,11 @@ import os from typing import Any -from data_cleaning_transform import DataCleaningTransformConfiguration +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + duplicate_list_location_default, + duplicate_list_location_key, +) from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.runtime.pure_python.runtime_configuration import ( @@ -53,9 +57,12 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) - ) + duplicate_list_location = 
self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index e83960c24..5ed2cecbe 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -91,9 +91,11 @@ def get_transform_config( """ data_access = data_access_factory.create_data_access() duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", duplicate_list_location) - ) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 29890d05f..56c10d801 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ 
b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -13,7 +13,11 @@ import os from typing import Any -from data_cleaning_transform import DataCleaningTransformConfiguration +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + duplicate_list_location_default, + duplicate_list_location_key, +) from data_processing.data_access import DataAccessFactoryBase from data_processing.transform import TransformStatistics from data_processing.utils import get_logger @@ -53,9 +57,12 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) - ) + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) From 08ff0069f00d0a84c8ef6cd3e2f55eefc098b2fb Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:10:01 -0400 Subject: [PATCH 52/91] Allow input of s3 credentials on command line Signed-off-by: Constantin M Adam --- .../fdedup/python/src/fuzzy_dedup_python.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index acb1be3bb..054447e70 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ 
b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -1,4 +1,5 @@ import argparse +import ast import os import sys @@ -119,8 +120,17 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: "output_folder": output_folder, } if in_args.use_s3: - sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + if in_args.s3_cred is not None: + s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) + sys_argv.append("--data_s3_cred") + sys_argv.append(s3_cred_ast) + elif ( + s3_creds.get("access_key") is not None + and s3_creds.get("secret_key") is not None + and s3_creds.get("url") is not None + ): + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) sys_argv.append("--data_s3_config") else: sys_argv.append("--data_local_config") @@ -207,6 +217,13 @@ def parse_args() -> argparse.Namespace: help="use s3", ) + parser.add_argument( + "--s3_cred", + type=ast.literal_eval, + default=None, + help="ast string of options for s3 credentials", + ) + return parser.parse_args() From d0c6f8a72efe75ccbfce0d89fd56b9b06dac4cb1 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:53:10 -0400 Subject: [PATCH 53/91] Added license Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/src/Murmur_MH.py | 13 +++++++++++++ .../fdedup/python/src/fuzzy_dedup_python.py | 12 ++++++++++++ .../universal/fdedup/ray/src/fuzzy_dedup_ray.py | 12 ++++++++++++ .../universal/fdedup/spark/src/fuzzy_dedup_spark.py | 12 ++++++++++++ 4 files changed, 49 insertions(+) diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py index e3442ba02..03d5047ea 100644 --- a/transforms/universal/fdedup/python/src/Murmur_MH.py +++ b/transforms/universal/fdedup/python/src/Murmur_MH.py @@ -1,3 +1,16 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + + import logging import os from typing import List, Set diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index 054447e70..bdd78c7da 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import argparse import ast import os diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py index 0b9be33ca..0d4c2954f 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import argparse import os import sys diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py index 5217f2f7b..58688de42 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import argparse import os import sys From 63e11eb729a85f3a1cf349b21e19a680f300ec10 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 11:49:22 -0400 Subject: [PATCH 54/91] Use str2bool for use_s3 argument Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/fuzzy_dedup_python.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index bdd78c7da..7135054d2 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -24,7 +24,7 @@ ) from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils, get_logger +from data_processing.utils import ParamsUtils, get_logger, str2bool from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) @@ -159,6 +159,10 @@ def execute_service(self, service_short_name: str, params: list) -> int: launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) elif service_short_name == "fdclean": launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + else: + err_msg = f"Unknown service {service_short_name} specified. 
Must be one of {SERVICE_DICT.values()}" + self.logger.error(err_msg) + raise ValueError(err_msg) status = launcher.launch() return status @@ -225,7 +229,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--use_s3", - action="store_true", + type=lambda x: bool(str2bool(x)), + default=False, help="use s3", ) From bf550fde9ad3d1d9e8f7bd0f7f75b25df12d24a2 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Tue, 29 Oct 2024 19:22:55 -0400 Subject: [PATCH 55/91] Add overwrite output path argument Signed-off-by: Constantin M Adam --- .../python/src/signature_calc_transform.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 7c4dd391c..03f9bc9b4 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -48,6 +48,8 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" +overwrite_output_path_key = "overwrite_output_path" +""" This key holds the overwrite output path""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -68,6 +70,8 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" +overwrite_output_path_cli_param = f"{cli_prefix}{overwrite_output_path_key}" +""" The overwrite output path""" captured_arg_keys = [ document_id_column_key, @@ -79,6 +83,7 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, + overwrite_output_path_key, ] # defaults @@ -100,6 +105,8 @@ """ Default Jaccard 
similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" num_segments_default = 1 """ Default number of segments across which we divide the hashing space for each band""" +overwrite_output_path_default = None +""" Default overwrite output path (no overwrite)""" NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") @@ -136,7 +143,8 @@ class SignatureCalculationTransform(AbstractTableTransform): num_minhashes_per_band: number of minhashes to use in each band jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates word_shingle_size: the size of the word shingles calculated for each document - num_segments the number of segments across which we divide the hashing space for each band + num_segments: the number of segments across which we divide the hashing space for each band + overwrite_output_path: specify an output path other than the one used by the data_access """ def __init__(self, config: dict[str, Any]): @@ -158,6 +166,7 @@ def __init__(self, config: dict[str, Any]): self.num_segments = config.get(num_segments_key, num_segments_default) self.num_bands = config.get(num_bands_key, num_bands_default) self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) + self.overwrite_output_path = config.get(overwrite_output_path_key, overwrite_output_path_default) # use this dataframe to store the minhashes and size for each document self.all_minhashes: pl.DataFrame = None # use this dataframe to store the band hashes for each document @@ -311,7 +320,7 @@ def write_band_signatures(self): last_file_name_path = Path(self.last_file_name) suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) save_path = os.path.join( - self.data_access.output_folder, + self.overwrite_output_path if self.overwrite_output_path else self.data_access.output_folder, "bands", f"band={band_ix}", f"segment={segment_index}", @@ -470,6 +479,12 @@ def add_input_params(self, parser: ArgumentParser) -> 
None: default=num_segments_default, help="the number of segments across which we divide the hashing space for each band", ) + parser.add_argument( + f"--{overwrite_output_path_cli_param}", + type=str, + default=overwrite_output_path_default, + help="overwrite of the output path", + ) def apply_input_params(self, args: Namespace) -> bool: """ From 272be3697239019ad604badcaf4ae2d8fd3c654b Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 30 Oct 2024 16:40:40 -0400 Subject: [PATCH 56/91] Add separate data access objects for reading and writing files Signed-off-by: Constantin M Adam --- .../python/src/signature_calc_local_python.py | 21 +++++++--- .../python/src/signature_calc_transform.py | 41 ++++++++++--------- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py index 062580f22..2800c70cd 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -12,6 +12,7 @@ import os import sys +from ast import Param from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils @@ -22,12 +23,23 @@ # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "test_scdata")) local_conf = {"input_folder": input_folder, "output_folder": output_folder} code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} +s3_config = { + "input_folder": 
"s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_data/", + "output_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_output_data/s3_test_3/", +} + params = { # Data access. Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", @@ -35,6 +47,8 @@ "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, + # "scdata_s3_cred": ParamsUtils.convert_to_ast(s3_creds), + # "scdata_s3_config": ParamsUtils.convert_to_ast(s3_config), } @@ -44,11 +58,6 @@ print(sys.argv) sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) # create launcher diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 03f9bc9b4..159697d19 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -20,10 +20,10 @@ import numpy as np import polars as pl import pyarrow as pa +from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider from Murmur_MH import Murmur_MH -from scipy.integrate import quad as integrate short_name = "minhash" @@ -48,8 +48,6 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" -overwrite_output_path_key = "overwrite_output_path" -""" This key holds the overwrite 
output path""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -70,8 +68,6 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" -overwrite_output_path_cli_param = f"{cli_prefix}{overwrite_output_path_key}" -""" The overwrite output path""" captured_arg_keys = [ document_id_column_key, @@ -83,7 +79,6 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, - overwrite_output_path_key, ] # defaults @@ -105,8 +100,10 @@ """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" num_segments_default = 1 """ Default number of segments across which we divide the hashing space for each band""" -overwrite_output_path_default = None -""" Default overwrite output path (no overwrite)""" + + +sigcalc_data_factory_key = "sc_data_factory" +sigcalc_data_access_key = "sc_data_access" NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") @@ -144,7 +141,6 @@ class SignatureCalculationTransform(AbstractTableTransform): jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates word_shingle_size: the size of the word shingles calculated for each document num_segments: the number of segments across which we divide the hashing space for each band - overwrite_output_path: specify an output path other than the one used by the data_access """ def __init__(self, config: dict[str, Any]): @@ -166,7 +162,6 @@ def __init__(self, config: dict[str, Any]): self.num_segments = config.get(num_segments_key, num_segments_default) self.num_bands = config.get(num_bands_key, num_bands_default) self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) - self.overwrite_output_path = config.get(overwrite_output_path_key, overwrite_output_path_default) # use this dataframe to store the 
minhashes and size for each document self.all_minhashes: pl.DataFrame = None # use this dataframe to store the band hashes for each document @@ -177,6 +172,12 @@ def __init__(self, config: dict[str, Any]): self.bytes_processed = 0 self.data_access = config.get("data_access") self.last_file_name = None + self.sc_data_access = config.get(sigcalc_data_access_key, None) + if self.sc_data_access is None: + self.sc_daf = config.get(sigcalc_data_factory_key, None) + if self.sc_daf is None: + raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") + self.sc_data_access = self.sc_daf.create_data_access() def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -319,15 +320,17 @@ def write_band_signatures(self): common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) last_file_name_path = Path(self.last_file_name) suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) + if self.sc_data_access.output_folder is None: + self.sc_data_access.output_folder = self.data_access.output_folder save_path = os.path.join( - self.overwrite_output_path if self.overwrite_output_path else self.data_access.output_folder, + self.sc_data_access.output_folder, "bands", f"band={band_ix}", f"segment={segment_index}", suffix_path, ) segment_band_minhash_table = segment_band_minhash_df.to_arrow() - bytes_written, _, _ = self.data_access.save_table(save_path, segment_band_minhash_table) + bytes_written, _, _ = self.sc_data_access.save_table(save_path, segment_band_minhash_table) if bytes_written > 0: num_tables_written += 1 num_docs_written += segment_band_minhash_table.num_rows @@ -412,8 +415,10 @@ def __init__(self): super().__init__( name=short_name, transform_class=SignatureCalculationTransform, - remove_from_metadata=[], + remove_from_metadata=[sigcalc_data_factory_key], ) + self.daf = DataAccessFactory(cli_arg_prefix="scdata_") + from data_processing.utils import 
get_logger self.logger = get_logger(__name__, level="INFO") @@ -479,12 +484,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=num_segments_default, help="the number of segments across which we divide the hashing space for each band", ) - parser.add_argument( - f"--{overwrite_output_path_cli_param}", - type=str, - default=overwrite_output_path_default, - help="overwrite of the output path", - ) + self.daf.add_input_params(parser=parser) def apply_input_params(self, args: Namespace) -> bool: """ @@ -495,4 +495,5 @@ def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) self.params = self.params | captured self.logger.info(f"{short_name} parameters are : {self.params}") - return True + self.params[sigcalc_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) From ee411e1bd7957a802857b2b1ac6703f0d50c2968 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 31 Oct 2024 16:46:09 -0400 Subject: [PATCH 57/91] Define 2 data access objects for data and duplicate list Signed-off-by: Constantin M Adam --- .../fdedup/python/src/data_cleaning_transform.py | 10 +++++++++- .../python/src/data_cleaning_transform_python.py | 14 ++++++++++++-- .../fdedup/ray/src/data_cleaning_transform_ray.py | 14 ++++++++++++-- .../spark/src/data_cleaning_transform_spark.py | 14 ++++++++++++-- 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 8e17b757f..1a349ae85 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -17,6 +17,7 @@ import numpy as np import polars as pl import pyarrow as pa +from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, 
TransformConfiguration from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger @@ -53,6 +54,9 @@ operation_mode_default = "filter_duplicates" """ Default value for operation mode, will filter out all the duplicate documents""" +dataclean_data_factory_key = "dc_data_factory" +dataclean_data_access_key = "dc_data_access" + class DataCleaningTransform(AbstractTableTransform): """ @@ -129,7 +133,9 @@ def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningT super().__init__( name=short_name, transform_class=transform_class, + remove_from_metadata=[dataclean_data_factory_key], ) + self.daf = DataAccessFactory(cli_arg_prefix="dcdata_") self.logger = get_logger(__name__, level="INFO") def add_input_params(self, parser: ArgumentParser) -> None: @@ -157,6 +163,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=operation_mode_default, help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", ) + self.daf.add_input_params(parser=parser) def apply_input_params(self, args: Namespace) -> bool: """ @@ -167,4 +174,5 @@ def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) self.params = self.params | captured self.logger.info(f"{short_name} parameters are : {self.params}") - return True + self.params[dataclean_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index 9c60ecbba..edef8b9c5 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -15,6 +15,8 @@ from data_cleaning_transform import ( DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, 
duplicate_list_location_default, duplicate_list_location_key, ) @@ -57,15 +59,23 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) if not duplicate_list_location.startswith("/"): - out_paths = data_access.output_folder.rstrip("/").split("/") + out_paths = dc_data_access.output_folder.rstrip("/").split("/") dupl_list_paths = duplicate_list_location.split("/") paths = out_paths[:-1] + dupl_list_paths duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") - self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) return self.params | {"df": self.duplicate_list} diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index 5ed2cecbe..88171e260 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -17,6 +17,8 @@ from data_cleaning_transform import ( DataCleaningTransform, DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, duplicate_list_location_default, duplicate_list_location_key, ) @@ -90,15 +92,23 @@ def 
get_transform_config( :return: dictionary of filter init params """ data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) if not duplicate_list_location.startswith("/"): - out_paths = data_access.output_folder.rstrip("/").split("/") + out_paths = dc_data_access.output_folder.rstrip("/").split("/") dupl_list_paths = duplicate_list_location.split("/") paths = out_paths[:-1] + dupl_list_paths duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") - duplicate_list, retries = data_access.get_file(duplicate_list_location) + duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) docs_to_remove_list = ray.put(duplicate_list) return {"df": docs_to_remove_list} | self.params diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 56c10d801..2ff0df8bf 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -15,6 +15,8 @@ from data_cleaning_transform import ( DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, duplicate_list_location_default, duplicate_list_location_key, ) @@ -57,15 +59,23 @@ def get_transform_config( :return: dictionary of transform init params """ 
data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) if not duplicate_list_location.startswith("/"): - out_paths = data_access.output_folder.rstrip("/").split("/") + out_paths = dc_data_access.output_folder.rstrip("/").split("/") dupl_list_paths = duplicate_list_location.split("/") paths = out_paths[:-1] + dupl_list_paths duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") - self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) return self.params | {"df": self.duplicate_list} From 3a3050125ef8987f85ba85b59ca13f928f812584 Mon Sep 17 00:00:00 2001 From: David Wood Date: Fri, 1 Nov 2024 10:18:34 -0400 Subject: [PATCH 58/91] get fdedeup/python test-image to pass, and clean up req in ray version Signed-off-by: David Wood --- transforms/universal/fdedup/python/Dockerfile | 2 +- transforms/universal/fdedup/python/requirements.txt | 10 ---------- transforms/universal/fdedup/ray/Dockerfile | 3 +-- transforms/universal/fdedup/ray/pyproject.toml | 1 + 4 files changed, 3 insertions(+), 13 deletions(-) delete mode 100644 transforms/universal/fdedup/python/requirements.txt diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index a0a557060..f8c41791e 100644 --- 
a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -18,7 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md -COPY --chown=dpk:root requirements.txt requirements.txt +#COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt deleted file mode 100644 index 576c028a8..000000000 --- a/transforms/universal/fdedup/python/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -pyarrow==16.1.0 -pyyaml>=6.0.2 -boto3>=1.34.69 -kubernetes>=30.1.0 -polars==1.9.0 -disjoint-set>=0.8.0 -numpy<1.29.0 -sentencepiece>=0.2.0 -mmh3>=4.1.0 -scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index ec2c56f28..e921c4749 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,5 +1,4 @@ -ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - +ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} USER ray diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 9c533231a..6a871abea 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -10,6 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ + "dpk_fdedup_transform_python==0.2.2.dev1", "data-prep-toolkit-ray==0.2.2.dev1", "mmh3>=4.1.0", "xxhash==3.4.1", From 80ae8df747998feb0f4dba49ec4322ace854d01c Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 8 Nov 2024 16:51:39 -0500 Subject: [PATCH 59/91] Added an option to run either word or char shingle Signed-off-by: nelson --- 
.../fdedup/python/src/fuzzy_dedup_python.py | 8 ++++++ .../python/src/signature_calc_transform.py | 26 ++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index 7135054d2..bc5f3fded 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -57,6 +57,7 @@ signature_calc_transform.jaccard_similarity_threshold_key, signature_calc_transform.word_shingle_size_key, signature_calc_transform.num_segments_key, + signature_calc_transform.shingle_option_key, ], "cluster": [ cluster_analysis_transform.jaccard_similarity_threshold_key, @@ -240,6 +241,13 @@ def parse_args() -> argparse.Namespace: default=None, help="ast string of options for s3 credentials", ) + parser.add_argument( + "--shingle_option", + type=str, + required=False, + default="word", + help="Option used for shingling", + ) return parser.parse_args() diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 159697d19..2ed3ed258 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -48,6 +48,8 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" +shingle_option_key = "shingle_option" +""" This key holds the option that is used to do shingles calculation for each document""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -68,6 +70,8 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The 
number of segments across which we divide the hashing space for each band""" +shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}" +""" This key holds the option that is used to do shingles calculation for each document""" captured_arg_keys = [ document_id_column_key, @@ -100,6 +104,8 @@ """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" num_segments_default = 1 """ Default number of segments across which we divide the hashing space for each band""" +shingle_option_default = "word" +""" Default option of doing shingling""" sigcalc_data_factory_key = "sc_data_factory" @@ -162,6 +168,7 @@ def __init__(self, config: dict[str, Any]): self.num_segments = config.get(num_segments_key, num_segments_default) self.num_bands = config.get(num_bands_key, num_bands_default) self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) + self.shingle_option = config.get(shingle_option_key, shingle_option_default) # use this dataframe to store the minhashes and size for each document self.all_minhashes: pl.DataFrame = None # use this dataframe to store the band hashes for each document @@ -202,7 +209,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab # generate minhash values minhashes = df.map_rows( lambda row: mm_min_hash.minhash2_nosalt( - *self._generate_word_shingles(row, window_size=self.word_shingle_size) + *self._generate_word_shingles(row, self.shingle_option, window_size=self.word_shingle_size) ) ) # rename columns, cast minhashes to list(uint32) @@ -353,7 +360,9 @@ def write_band_signatures(self): return [], metadata # define shingles generation function - def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]: + def _generate_word_shingles( + self, row: tuple, shingling_option: str, window_size: int = 5, delimiter: str = " " + ) -> tuple[list, int, int]: text = row[0] # lower case text = text.lower() 
@@ -366,7 +375,12 @@ def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: s # diacritics/unicode normalization text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn") text = text.strip() - words = text.split() + print(shingling_option) + print("=============") + if shingling_option == "char": + words = list(text) + else: + words = text.split() document_id = row[1] doc_len = len(row[0]) word_count = len(words) @@ -484,6 +498,12 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=num_segments_default, help="the number of segments across which we divide the hashing space for each band", ) + parser.add_argument( + f"--{shingle_option_cli_param}", + type=str, + default=shingle_option_default, + help="Shingling option", + ) self.daf.add_input_params(parser=parser) def apply_input_params(self, args: Namespace) -> bool: From c531809647c29de300052c1d9a698905bc904733 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:26:29 -0500 Subject: [PATCH 60/91] Use captured_arg_keys to list the arguments of each transform Signed-off-by: Constantin M Adam --- .../fdedup/python/src/fuzzy_dedup_python.py | 29 +++---------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index 7135054d2..f3d0b0fdc 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -47,31 +47,10 @@ } ARGS_MAP = { - "minhash": [ - signature_calc_transform.contents_column_key, - signature_calc_transform.document_id_column_key, - signature_calc_transform.seed_key, - signature_calc_transform.num_permutations_key, - signature_calc_transform.num_bands_key, - signature_calc_transform.num_minhashes_per_band_key, - signature_calc_transform.jaccard_similarity_threshold_key, - 
signature_calc_transform.word_shingle_size_key, - signature_calc_transform.num_segments_key, - ], - "cluster": [ - cluster_analysis_transform.jaccard_similarity_threshold_key, - cluster_analysis_transform.num_bands_key, - cluster_analysis_transform.num_segments_key, - ], - "fdlist": [ - get_duplicate_list_transform.subfolder_key, - get_duplicate_list_transform.consolidated_filename_key, - ], - "fdclean": [ - data_cleaning_transform.document_id_column_key, - data_cleaning_transform.duplicate_list_location_key, - data_cleaning_transform.operation_mode_key, - ], + "minhash": signature_calc_transform.captured_arg_keys, + "cluster": cluster_analysis_transform.captured_arg_keys, + "fdlist": get_duplicate_list_transform.captured_arg_keys, + "fdclean": data_cleaning_transform.captured_arg_keys, } From fe431104ca2d171b451be76a0cd7716f268f9d52 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:28:06 -0500 Subject: [PATCH 61/91] Ray implementation for get_duplicate_list_transform Signed-off-by: Constantin M Adam --- .../fdedup/ray/src/fuzzy_dedup_ray.py | 6 +- .../src/get_duplicate_list_transform_ray.py | 69 +++++++++++++++++++ .../test_get_duplicate_list_transform_ray.py | 9 ++- 3 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py index 0d4c2954f..987369714 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py @@ -23,6 +23,10 @@ from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) +from get_duplicate_list_transform_ray import ( + GetDuplicateListRayRuntime, + GetDuplicateListRayTransformConfiguration, +) from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration @@ -56,7 +60,7 @@ def 
execute_service(self, service_short_name: str, params: list) -> int: elif service_short_name == "cluster": launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration()) elif service_short_name == "fdlist": - launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + launcher = RayTransformLauncher(runtime_config=GetDuplicateListRayTransformConfiguration()) elif service_short_name == "fdclean": launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) status = launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..40081e658 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py @@ -0,0 +1,69 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListRayRuntime(DefaultRayTransformRuntime): + """ + Get duplicate list runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Get Duplicate List + as required by the RayTransformLauncher. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py index 4b59e3a7a..55869598c 100644 --- a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py +++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py @@ -12,14 +12,12 @@ import os -from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) +from data_processing_ray.runtime.ray import RayTransformLauncher from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_python import ( - GetDuplicateListPythonTransformConfiguration, -) +from get_duplicate_list_transform_ray import GetDuplicateListRayTransformConfiguration class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): @@ -31,9 +29,10 @@ class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) config = { + "run_locally": True, sort_output_cli_param: True, } - launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) fixtures = [ ( launcher, From 82a1860524e8ebd4c59ae0598356095d69021e3c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:30:03 -0500 Subject: [PATCH 62/91] 
Bug fix: jaccard threshold type must be float Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/signature_calc_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 159697d19..b492eb3ae 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -456,7 +456,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: ) parser.add_argument( f"--{jaccard_similarity_threshold_cli_param}", - type=int, + type=float, default=jaccard_similarity_threshold_default, help="Jaccard similarity threshold above which two documents are duplicates", ) From 61ed40f347612787d32385df779d0d88fc4e3f88 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:31:18 -0500 Subject: [PATCH 63/91] Get fuzzy dedup ray image ready for kfp Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index e921c4749..d4b3ae484 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,8 +1,6 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} -USER ray - RUN pip install --upgrade --no-cache-dir pip # install pytest @@ -24,13 +22,20 @@ COPY --chown=ray:users README.md README.md RUN pip install --no-cache-dir -e . 
# copy source files needed by test-image -COPY ./src/signature_calc_transform_ray.py fdedup_transform_ray.py -COPY ./src/signature_calc_local_ray.py local/fdedup_local_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py +COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py +COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py +COPY --chown=ray:users ./src/data_cleaning_transform_ray.py data_cleaning_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ COPY test-data/ test-data/ +USER root +RUN chmod a+rwx /home/ray +USER ray # Set environment ENV PYTHONPATH /home/ray From a8ede002fba33a4e01df9421b60f30558b98260e Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 17:37:56 -0500 Subject: [PATCH 64/91] kfp implementation for fuzzy dedup Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 321 +++++++++---- .../src/fdedup_compute_execution_params.py | 437 ++++++++++-------- 2 files changed, 494 insertions(+), 264 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 3156ab6f1..1c3e8e570 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -14,14 +14,24 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from src.fdedup_compute_execution_params import fdedup_compute_execution_params +from src.fdedup_compute_execution_params import ( + cluster_analysis_compute_execution_params, + compute_common_params, + data_cleaning_compute_execution_params, + get_duplicate_list_compute_execution_params, + signature_calc_compute_execution_params, +) from 
workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils -task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest" +task_image = os.getenv("FDEDUP_IMAGE_LOCATION", "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest") +image_pull_secret = os.getenv("FDEDUP_IMAGE_PULL_SECRET", "my_secret") # the name of the job script -EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" +SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "signature_calc_transform_ray.py" +CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "cluster_analysis_transform_ray.py" +GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "get_duplicate_list_transform_ray.py" +DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" @@ -40,8 +50,18 @@ # compilation time. import uuid - compute_exec_params_op = dsl.component_decorator.component( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = dsl.component_decorator.component(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = dsl.component_decorator.component( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = dsl.component_decorator.component( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = dsl.component_decorator.component( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = dsl.component_decorator.component( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) print( "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " @@ -49,61 +69,94 @@ ) run_id = uuid.uuid4().hex else: - compute_exec_params_op = comp.create_component_from_func( - func=fdedup_compute_execution_params, 
base_image=base_kfp_image + compute_common_params_op = comp.create_component_from_func(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = comp.create_component_from_func( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = comp.create_component_from_func( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = comp.create_component_from_func( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = comp.create_component_from_func( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute signature calculation job +execute_signature_calc_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) +# execute cluster analysis job +execute_cluster_analysis_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute get duplicate list job +execute_get_duplicate_list_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute data cleaning job +execute_data_cleaning_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "fdedup" +TASK_NAME: str = "fuzzydedup" @dsl.pipeline( name=TASK_NAME + "-ray-pipeline", - description="Pipeline for fdedup", + description="Pipeline for fuzzy dedup", ) -def fdedup( +def fuzzydedup( + # folders used # Ray cluster - ray_name: str = "fdedup-kfp-ray", # name of Ray cluster + ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. 
checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}", - data_s3_access_secret: str = "s3-secret", + data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", + data_s3_access_secret: str = "s3-south-secret", + scdata_s3_access_secret: str = "s3-south-secret", + dcdata_s3_access_secret: str = "s3-south-secret", data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # columns used - fdedup_doc_column: str = "contents", - fdedup_id_column: str = "int_id_column", - fdedup_cluster_column: str = "cluster", - # infrastructure - fdedup_bucket_cpu: float = 0.5, - fdedup_doc_cpu: float = 0.5, - fdedup_mhash_cpu: float = 0.5, + fdedup_contents_column: str = "contents", + fdedup_document_id_column: str = "int_id_column", # fuzzy parameters - fdedup_num_permutations: int = 64, - fdedup_threshold: float = 0.8, - fdedup_shingles_size: int = 5, - fdedup_delimiters: str = " ", - # Random delay between reads - fdedup_random_delay_limit: int = 5, - # snapshotting - fdedup_snapshot_delay: int = 1, - fdedup_use_doc_snapshot: bool = False, - fdedup_use_bucket_snapshot: bool = False, + fdedup_num_permutations: int = 112, + fdedup_num_bands: int = 14, + fdedup_num_minhashes_per_band: int = 8, + fdedup_word_shingle_size: int = 5, + fdedup_jaccard_similarity_threshold: float = 0.75, + fdedup_seed: int = 42, + fdedup_docs_to_remove_folder: str = "docs_to_remove", + fdedup_duplicate_list_location: str = os.path.join( + "docs_to_remove_consolidated", 
"docs_to_remove_consolidated.parquet" + ), + fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, # additional parameters @@ -136,89 +189,189 @@ def fdedup( wait_print_tmout - time between prints, sec http_retries - http retries for API server calls :param data_s3_access_secret - s3 access secret + :param scdata_s3_access_secret - signature calculation s3 access secret + :param dcdata_s3_access_secret - data cleaning s3 access secret :param data_s3_config - s3 configuration :param data_max_files - max files to process :param data_num_samples - num samples to process - :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location - :param fdedup_doc_column - document column name - :param fdedup_id_column - integer document id column name - :param fdedup_cluster_column - cluster column name - :param fdedup_bucket_cpu - number of CPUs per bucket hash - :param fdedup_doc_cpu - number of CPUs per doc hash - :param fdedup_mhash_cpu - number of CPUs per minhash hash + :param fdedup_contents_column - document column name + :param fdedup_document_id_column - integer document id column name :param fdedup_num_permutations - number of permutations - :param fdedup_threshold - threshold - :param fdedup_shingles_size - number of words in shingle - :param fdedup_delimiters - delimiter for splitting document - :param fdedup_random_delay_limit - delay between reads to reduce S3 load. 
- A random number between 0 and random_delay_limit is used - :param fdedup_snapshot_delay - delay between restoring individual actors - :param fdedup_use_bucket_snapshot - flag to skip buckets building and start from existing snapshots - :param fdedup_use_doc_snapshot - flag to skip documents building and start from existing snapshots + :param fdedup_num_bands - number of bands + :param fdedup_num_minhashes_per_band - length of a band + :param fdedup_word_shingle_size - length of word shingles + :param fdedup_jaccard_similarity_threshold - similarity threshold + :param fdedup_seed - seed for the random number generator + :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids + :param fdedup_duplicate_list_location - name of the file holding the consolidated list of duplicates + :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, + compute_common_exec_params = compute_common_params_op( + ray_worker_options, + data_s3_config, + fdedup_num_permutations, + fdedup_n_samples, + ) + ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) + fdedup_num_segments = 
compute_common_exec_params.outputs["num_segments"] + runtime_actor_cpus = compute_common_exec_params.outputs["cpus_per_actor"] + runtime_num_actors = compute_common_exec_params.outputs["num_actors"] + + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=run_id, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_common_exec_params) + + # Get the parameters for the signature calculation job + compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - doc_column=fdedup_doc_column, - id_column=fdedup_id_column, - cluster_column=fdedup_cluster_column, - bucket_cpu=fdedup_bucket_cpu, - doc_cpu=fdedup_doc_cpu, - mhash_cpu=fdedup_mhash_cpu, + doc_column=fdedup_contents_column, + id_column=fdedup_document_id_column, num_permutations=fdedup_num_permutations, - threshold=fdedup_threshold, - shingles_size=fdedup_shingles_size, - delimiters=fdedup_delimiters, - random_delay_limit=fdedup_random_delay_limit, - snapshot_delay=fdedup_snapshot_delay, - use_doc_snapshot=fdedup_use_doc_snapshot, - use_bucket_snapshot=fdedup_use_bucket_snapshot, - n_samples=fdedup_n_samples, + num_bands=fdedup_num_bands, + num_minhashes_per_band=fdedup_num_minhashes_per_band, + word_shingle_size=fdedup_word_shingle_size, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + seed=fdedup_seed, ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, 
data_s3_access_secret) + ComponentUtils.add_settings_to_component(compute_signature_calc_exec_params, ONE_HOUR_SEC * 2) + compute_signature_calc_exec_params.after(ray_cluster) - # start Ray cluster - ray_cluster = create_ray_op( + # Execute signature calculation job + execute_signature_calc_job = execute_signature_calc_job_op( ray_name=ray_name, run_id=run_id, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, + additional_params=additional_params, + exec_params=compute_signature_calc_exec_params.output, + exec_script_name=SIGNATURE_CALC_EXEC_SCRIPT_NAME, server_url=server_url, + prefix="scdata", + ) + ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) + execute_signature_calc_job.after(compute_signature_calc_exec_params) + + # Get the parameters for the cluster analysis job + compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + num_bands=fdedup_num_bands, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + ) + ComponentUtils.add_settings_to_component(compute_cluster_analysis_exec_params, ONE_HOUR_SEC * 2) + compute_cluster_analysis_exec_params.after(execute_signature_calc_job) + # Execute job + execute_cluster_analysis_job = execute_cluster_analysis_job_op( + ray_name=ray_name, + run_id=run_id, additional_params=additional_params, + exec_params=compute_cluster_analysis_exec_params.output, + 
exec_script_name=CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME, + server_url=server_url, ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) + ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) + + compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + duplicate_docids_folder=fdedup_docs_to_remove_folder, + duplicate_list_location=fdedup_duplicate_list_location, + ) + ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) + compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) # Execute job - execute_job = execute_ray_jobs_op( + execute_get_duplicate_list_job = execute_get_duplicate_list_job_op( ray_name=ray_name, run_id=run_id, additional_params=additional_params, - exec_params=compute_exec_params.output, - exec_script_name=EXEC_SCRIPT_NAME, + exec_params=compute_get_duplicate_list_exec_params.output, + exec_script_name=GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME, server_url=server_url, ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) + ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + 
execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) + + compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + id_column=fdedup_document_id_column, + duplicate_list_location=fdedup_duplicate_list_location, + operation_mode=fdedup_operation_mode, + ) + ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) + compute_data_cleaning_exec_params.after(execute_get_duplicate_list_job) + + # Execute job + execute_data_cleaning_job = execute_data_cleaning_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_data_cleaning_exec_params.output, + exec_script_name=DATA_CLEANING_EXEC_SCRIPT_NAME, + server_url=server_url, + prefix="dcdata", + ) + ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" + ) + execute_data_cleaning_job.after(compute_data_cleaning_exec_params) if __name__ == "__main__": # Compiling the pipeline - compiler.Compiler().compile(fdedup, __file__.replace(".py", ".yaml")) + compiler.Compiler().compile(fuzzydedup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 726200339..c5ff4d52b 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ 
b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -10,10 +10,77 @@ # limitations under the License. ################################################################################ +from typing import Any, Dict, NamedTuple -def fdedup_compute_execution_params( + +def compute_common_params( worker_options: dict, # ray worker configuration - actor_options: dict, # actor's resource requirements + data_s3_config: str, # S3 configuration + num_permutations: int, # number of permutations (minhashes) per document + n_samples: int, # files to sample for number of documents estimation +) -> NamedTuple("fdedup_params", [("num_segments", int), ("num_actors", int), ("cpus_per_actor", float)]): + + import sys + + from data_processing.data_access import DataAccessS3 + from data_processing.utils import GB + from runtime_utils import KFPUtils + + # get credentials + s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() + s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} + s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) + # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly + data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) + # sample input data + sampling: dict[str, Any] + sampling, _ = data_access.sample_input_data(n_samples=n_samples) + number_of_docs = int(sampling.get("estimated number of docs")) + if number_of_docs == 0: + print(f"Estimated number of documents and documents size is zero. 
Please verify the input path.") + sys.exit(1) + print(f"Estimated number of docs: {number_of_docs}") + # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where: + # 8 bytes are taken by the band hash + # (num_permutations * 4) bytes are taken by the min hashes + # 20 bytes to provide some extra space for storage in a table + # The total amount of space needed by a band is number_of_docs * doc_bytes. + # To scale the handling of this data, divide each band into segments, where each segment size is below 3GB + doc_bytes = 8 + num_permutations * 4 + 20 + band_bytes = number_of_docs * doc_bytes + num_segments = 1 + (band_bytes // (3 * GB)) + print(f"Number of segments: {num_segments}") + + # To process data efficiently, each actor needs 16GB of memory. + # The actor config controls CPU allocation, not memory; + # use CPU allocation s.t. the number of actors on a worker provides access to 16GB of memory for each actor. + # Also, to keep S3 utilization in check, limit the number of actors to 2000 + num_nodes = worker_options["replicas"] + cpu_per_node = worker_options["cpu"] - 1 + memory_per_node = 0.85 * worker_options["memory"] + + memory_per_actor = 16 # GB + max_num_actors = 2000 + num_actors_per_node: int = int(memory_per_node / memory_per_actor) + if num_actors_per_node == 0: + num_actors_per_node = 1 + num_actors = num_nodes * num_actors_per_node + while num_actors > max_num_actors: + num_actors -= num_nodes + num_actors_per_node -= 1 + print(f"Number of actors per node = {num_actors_per_node}") + cpus_per_actor = cpu_per_node / num_actors_per_node + print(f"CPUs per actor = {cpus_per_actor}") + + from collections import namedtuple + + fdedup_params = namedtuple("fdedup_params", ["num_segments", "num_actors", "cpus_per_actor"]) + return fdedup_params(num_segments, num_actors, cpus_per_actor) + + +def signature_calc_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of 
actors needed to run this step data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -22,27 +89,19 @@ def fdedup_compute_execution_params( runtime_code_location: dict, # code location doc_column: str, # document column name id_column: str, # integer document id column name - cluster_column: str, # cluster column name - bucket_cpu: float, # number of CPUs per bucket hash - doc_cpu: float, # number of CPUs per doc hash - mhash_cpu: float, # number of CPUs per minhash hash num_permutations: int, # number of permutations + num_bands: int, # number of bands + num_minhashes_per_band: int, # band length + word_shingle_size: int, # number of words in shingle threshold: float, # threshold, - shingles_size: int, # number of words in shingle - delimiters: str, # delimiter for splitting document - random_delay_limit: int, # delay between reads to reduce S3 load. - # A random number between 0 and random_delay_limit is used - snapshot_delay: int, # delay between restoring individual actors - use_doc_snapshot: bool, # flag to skip documents building and start from existing snapshots - use_bucket_snapshot: bool, # flag to skip buckets building and start from existing snapshots - n_samples: int, # number of samples to use -) -> dict: # NamedTuple( - # "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] + num_segments: int, # number of segments + seed: int, # seed for the random number generator +) -> dict: """ - Compute fuzzy dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements + Compute fuzzy dedup execution parameters for signature calculation + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples 
to process @@ -51,182 +110,200 @@ def fdedup_compute_execution_params( :param runtime_code_location: code location :param doc_column: document column name :param id_column: integer document id column name - :param cluster_column: cluster column name - :param bucket_cpu: number of CPUs per bucket hash - :param doc_cpu: number of CPUs per doc hash - :param mhash_cpu: number of CPUs per minhash hash :param num_permutations: number of permutations + :param num_bands: number of bands + :param num_minhashes_per_band: band length + :param word_shingle_size: number of words in shingle :param threshold: threshold, - :param shingles_size: number of words in shingle - :param delimiters: delimiter for splitting document - :param random_delay_limit: # delay between reads to reduce S3 load. A random number between 0 and random_delay_limit is used - :param snapshot_delay: delay between restoring individual actors - :param use_doc_snapshot: flag to skip documents building and start from existing snapshots - :param use_bucket_snapshot: flag to skip buckets building and start from existing snapshots - :param n_samples: number of samples to use + :param num_segments: number of segments + :param seed: seed for the random number generator :return: a dictionary with a Ray Job execution parameters """ - import math - import sys - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from runtime_utils import KFPUtils - from scipy.integrate import quad as integrate - - EXECUTION_OF_KB_DOC = 0.003 - - def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, - ) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def 
_false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt + # fuzzy parameters for signature calculation + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + print(f"runtime_actor_options = {runtime_actor_options}") + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "minhash_contents_column": doc_column, + "minhash_document_id_column": id_column, + "minhash_num_permutations": num_permutations, + "minhash_num_bands": num_bands, + "minhash_num_minhashes_per_band": num_minhashes_per_band, + "minhash_word_shingle_size": word_shingle_size, + "minhash_jaccard_similarity_threshold": threshold, + "minhash_num_segments": 
num_segments, + "minhash_seed": seed, + "scdata_s3_config": data_s3_config, + } + + +def cluster_analysis_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + num_bands: int, # number of bands + threshold: float, # threshold, + num_segments: int, # number of segments +) -> dict: + + """ + Compute fuzzy dedup execution parameters for cluster analysis + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param num_bands: number of bands + :param threshold: threshold, + :param num_segments: number of segments + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os # fuzzy parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=threshold, - num_perm=num_permutations, - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - print(f"Fuzzy parameters: num buckets {num_buckets}, bucket length {length_bucket}") # Get cluster parameters - cluster_cpu = worker_options["replicas"] * worker_options["cpu"] - cluster_memory = worker_options["replicas"] * worker_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu -= 1 - cluster_memory *= 0.85 - # get actor requirements - actor_cpu = actor_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, 
s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) - if type(s3_config) is list: - # S3 config is list. take the first element - s3_config = s3_config[0] - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling, _ = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - avg_table_size = sampling.get("average table size MB") / KB - if number_of_docs == 0: - print(f"Estimated number of documents and documents size is zero. Please verify the input path.") - sys.exit(1) - # we are creating more buckets actors, so that we get better parallelization for bucket processing - b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB) - d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) - m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) - # compute cpu requirements - # Define number of preprocessors. 
We are assuming that preprocessors and workers are using the same amount - # of CPUs - n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu - ) - if n_preprocessors <= 0: - print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") - print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") - print("Try to increase the size of the cluster") - sys.exit(1) - # compute the amount of workers - n_workers = int((0.85 * cluster_cpu - d_actors * doc_cpu) / actor_cpu) - # Ensure that we do not overwhelm S3 - if n_workers > 2000: - n_workers = 2000 - print( - f"Number of preprocessors: {n_preprocessors}, Number of workers: {n_workers}, bucket actors {b_actors}, " - f"minhash actors {m_actors}, document actors {d_actors}" - ) - - # Make sure that we have enough memory - r_mem = avg_table_size * 4 * n_preprocessors + 2 * (b_actors + m_actors + d_actors) - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker (current {actor_cpu})") - sys.exit(1) + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") + data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + 
"runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "cluster_num_bands": num_bands, + "cluster_jaccard_similarity_threshold": threshold, + "cluster_num_segments": num_segments, + } - print( - f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * mhash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" - ) - projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 - print(f"Projected execution time {projected_execution} min") +def get_duplicate_list_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + duplicate_docids_folder: str, # folder with the docs IDs to remove + duplicate_list_location: str, # location of the list of duplicate doc ids +) -> dict: + """ + Compute fuzzy dedup execution parameters for get duplicate list step + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param duplicate_docids_folder: folder with the docs IDs to remove + :param duplicate_list_location: location of the list of duplicate doc ids + :return: a dictionary with a Ray Job execution parameters + """ + import json + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = 
base_folder + data_s3_config_dict["output_folder"] = base_folder + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "fdlist_docs_to_remove": duplicate_docids_folder, + "fdlist_consolidated_filename": duplicate_list_location, + } + + +def data_cleaning_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + id_column: str, # integer document id column name + duplicate_list_location: str, # location of the list of duplicate doc ids + operation_mode: str, # filter (non-)duplicates or annotate +) -> dict: + """ + Compute fuzzy dedup execution parameters + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param id_column: integer document id column name + :param duplicate_list_location: location of the list of duplicate doc ids + :param operation_mode: filter (non-)duplicates or annotate + :return: a dictionary with a Ray Job execution parameters + """ + import 
json + import os + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, - "runtime_num_workers": n_workers, - "runtime_worker_options": str(actor_options), + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), - "fdedup_doc_column": doc_column, - "fdedup_id_column": id_column, - "fdedup_cluster_column": cluster_column, - "fdedup_bucket_cpu": bucket_cpu, - "fdedup_doc_cpu": doc_cpu, - "fdedup_mhash_cpu": mhash_cpu, - "fdedup_num_doc_actors": d_actors, - "fdedup_num_bucket_actors": b_actors, - "fdedup_num_minhash_actors": m_actors, - "fdedup_num_preprocessors": n_preprocessors, - "fdedup_num_permutations": num_permutations, - "fdedup_threshold": threshold, - "fdedup_shingles_size": shingles_size, - "fdedup_delimiters": delimiters, - "fdedup_random_delay_limit": random_delay_limit, - "fdedup_snapshot_delay": snapshot_delay, - "fdedup_use_doc_snapshot": use_doc_snapshot, - "fdedup_use_bucket_snapshot": use_bucket_snapshot, + "fdclean_document_id_column": id_column, + "fdclean_duplicate_list_location": duplicate_list_location, + "fdclean_operation_mode": operation_mode, } From 96edea4fe2cb976e0e20a7b0299a022ecd378ef0 Mon 
Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 22:08:05 -0500 Subject: [PATCH 65/91] Added params to captured_arg_keys Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/data_cleaning_transform.py | 1 + .../universal/fdedup/python/src/signature_calc_transform.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 1a349ae85..74597068c 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -44,6 +44,7 @@ captured_arg_keys = [ document_id_column_key, duplicate_list_location_key, + operation_mode_key, ] # defaults diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index c63fa3576..6b14e1ba0 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -71,7 +71,7 @@ num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}" -""" This key holds the option that is used to do shingles calculation for each document""" +""" The option (word/char) used to do shingles calculation for each document""" captured_arg_keys = [ document_id_column_key, @@ -83,6 +83,7 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, + shingle_option_key, ] # defaults @@ -375,8 +376,7 @@ def _generate_word_shingles( # diacritics/unicode normalization text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn") text = text.strip() - print(shingling_option) - print("=============") + 
self.logger.debug(shingling_option) if shingling_option == "char": words = list(text) else: From 24163af9d00f7603b9ec17091c785c0fead8eaae Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 11 Nov 2024 09:36:19 -0500 Subject: [PATCH 66/91] Add shingle type option (word or char) to kfp Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 3 +++ .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 1c3e8e570..139a0f919 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -150,6 +150,7 @@ def fuzzydedup( fdedup_num_bands: int = 14, fdedup_num_minhashes_per_band: int = 8, fdedup_word_shingle_size: int = 5, + fdedup_shingle_option: str = "word", fdedup_jaccard_similarity_threshold: float = 0.75, fdedup_seed: int = 42, fdedup_docs_to_remove_folder: str = "docs_to_remove", @@ -202,6 +203,7 @@ def fuzzydedup( :param fdedup_num_bands - number of bands :param fdedup_num_minhashes_per_band - length of a band :param fdedup_word_shingle_size - length of word shingles + :param fdedup_shingle_option - type of shingle, one of 'word', or 'char' :param fdedup_jaccard_similarity_threshold - similarity threshold :param fdedup_seed - seed for the random number generator :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids @@ -258,6 +260,7 @@ def fuzzydedup( num_bands=fdedup_num_bands, num_minhashes_per_band=fdedup_num_minhashes_per_band, word_shingle_size=fdedup_word_shingle_size, + shingle_option=fdedup_shingle_option, threshold=fdedup_jaccard_similarity_threshold, num_segments=fdedup_num_segments, seed=fdedup_seed, diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py 
index c5ff4d52b..65b7ac2f6 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -93,6 +93,7 @@ def signature_calc_compute_execution_params( num_bands: int, # number of bands num_minhashes_per_band: int, # band length word_shingle_size: int, # number of words in shingle + shingle_option: str, # type of shingle, one of 'word' or 'char' threshold: float, # threshold, num_segments: int, # number of segments seed: int, # seed for the random number generator @@ -114,6 +115,7 @@ def signature_calc_compute_execution_params( :param num_bands: number of bands :param num_minhashes_per_band: band length :param word_shingle_size: number of words in shingle + :param shingle_option: str: type of shingle, one of 'word' or 'char' :param threshold: threshold, :param num_segments: number of segments :param seed: seed for the random number generator @@ -138,6 +140,7 @@ def signature_calc_compute_execution_params( "minhash_num_bands": num_bands, "minhash_num_minhashes_per_band": num_minhashes_per_band, "minhash_word_shingle_size": word_shingle_size, + "minhash_shingle_option": shingle_option, "minhash_jaccard_similarity_threshold": threshold, "minhash_num_segments": num_segments, "minhash_seed": seed, From 3a43c3d4370cdb31949a11190804552716a3adce Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 10:53:09 -0500 Subject: [PATCH 67/91] Utility to calculate number of bands and length of a band Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/utils/Makefile | 16 ++++ .../universal/fdedup/utils/calc_r_and_b.ipynb | 74 +++++++++++++++++++ .../universal/fdedup/utils/requirements.txt | 3 + 3 files changed, 93 insertions(+) create mode 100644 transforms/universal/fdedup/utils/Makefile create mode 100644 transforms/universal/fdedup/utils/calc_r_and_b.ipynb create mode 100644 transforms/universal/fdedup/utils/requirements.txt diff --git 
a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile new file mode 100644 index 000000000..dae3f30ea --- /dev/null +++ b/transforms/universal/fdedup/utils/Makefile @@ -0,0 +1,16 @@ +PYTHON=python +PIP=pip + +venv: requirements.txt + $(PYTHON) -m venv venv + if [ -e venv/Scripts/activate ]; then \ + echo "For Windows please try the following AS Administrator - no guarantees"; \ + echo " venv\\Scripts\\activate"; \ + echo " pip install --upgrade pip"; \ + echo " pip install -r requirements.txt"; \ + echo " pip install pytest"; \ + else \ + . venv/bin/activate; \ + $(PIP) install --upgrade pip; \ + $(PIP) install -r requirements.txt; \ + fi diff --git a/transforms/universal/fdedup/utils/calc_r_and_b.ipynb b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb new file mode 100644 index 000000000..8398f9efa --- /dev/null +++ b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb @@ -0,0 +1,74 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf5dba9a-d530-4a0a-ae71-2d741f7e705f", + "metadata": {}, + "source": [ + "This notebook allows calculating the values for `b` (the number of bands) and `r` (the number of minhashes in a band) used in the fuzzy dedup algorithm. The default values are `b=14` and `r=8`, as defined in the [FineWeb datasets paper](https://arxiv.org/pdf/2406.17557). The x-axis of the graph represents the Jaccard similarity between a pair of documents, while the y-axis represents the probability that they become duplication candidates. Please refer to http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf for more details on this methodology." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "800bc113-8b5e-4cec-8717-98fa05753bd0", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Define the parameterized function\n", + "def f(s, r, b):\n", + " return 1 - (1 - s**r)**b\n", + "\n", + "# Set the parameters r and b\n", + "r = 8\n", + "b = 14\n", + "\n", + "# Generate values for s in a range, e.g., from 0 to 1\n", + "s_values = np.linspace(0, 1, 500) # 500 points between 0 and 1\n", + "f_values = f(s_values, r, b)\n", + "\n", + "# Plot the function\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(s_values, f_values, label=fr\"$f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\", color='blue')\n", + "plt.xlabel(\"s\")\n", + "plt.ylabel(\"f(s)\")\n", + "plt.title(f\"Plot of the function $f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\")\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98016b04-b6a0-465d-b65b-6d402978c9f0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/utils/requirements.txt b/transforms/universal/fdedup/utils/requirements.txt new file mode 100644 index 000000000..ce2acfefb --- /dev/null +++ b/transforms/universal/fdedup/utils/requirements.txt @@ -0,0 +1,3 @@ +jupyter +numpy +matplotlib From 2f61be7938d7540a0a1831e85b8a961bef24d35c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 15:37:32 -0500 Subject: [PATCH 68/91] Set correct version for 
pyproject Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/pyproject.toml | 6 +++--- transforms/universal/fdedup/ray/Makefile | 2 +- transforms/universal/fdedup/ray/pyproject.toml | 2 +- transforms/universal/fdedup/spark/Makefile | 2 +- transforms/universal/fdedup/spark/pyproject.toml | 8 ++++---- transforms/universal/fdedup/utils/Makefile | 2 ++ 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index f46c8e8c4..dd58d41d4 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.2.2.dev1" -requires-python = ">=3.10" +version = "0.2.2.dev2" +requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -10,7 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev1", + "data-prep-toolkit==0.2.2.dev2", "pyarrow==16.1.0", "pyyaml>=6.0.2", "boto3>=1.34.69", diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index f5f06c3c3..ec193b6c3 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ b/transforms/universal/fdedup/ray/Makefile @@ -43,7 +43,7 @@ setup:: .transforms.setup # TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 
b24886ad9..037525126 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] dependencies = [ "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_fdedup_transform_python==0.2.2.dev1", + "dpk_fdedup_transform_python==0.2.2.dev2", "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile index 7eb132fbd..ac2735e7d 100644 --- a/transforms/universal/fdedup/spark/Makefile +++ b/transforms/universal/fdedup/spark/Makefile @@ -36,7 +36,7 @@ publish: publish-image publish-image:: .transforms.publish-image-spark set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index 548f350c0..cc66fc044 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.2.2.dev1" -requires-python = ">=3.10" +version = "0.2.2.dev2" +requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -10,8 +10,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk_fdedup_transform_python==0.2.2.dev1", - "data-prep-toolkit-spark==0.2.2.dev1", + "dpk_fdedup_transform_python==0.2.2.dev2", + "data-prep-toolkit-spark==0.2.2.dev2", ] [project.optional-dependencies] diff --git a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile index dae3f30ea..d9dae01d7 100644 --- 
a/transforms/universal/fdedup/utils/Makefile +++ b/transforms/universal/fdedup/utils/Makefile @@ -14,3 +14,5 @@ venv: requirements.txt $(PIP) install --upgrade pip; \ $(PIP) install -r requirements.txt; \ fi +set-versions: + @: \ No newline at end of file From cd5eb05f82d1145a620a03d0094aac96846d5d55 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 15:45:37 -0500 Subject: [PATCH 69/91] Change the name of the utils Makefile Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/utils/{Makefile => Makefile.local} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename transforms/universal/fdedup/utils/{Makefile => Makefile.local} (100%) diff --git a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile.local similarity index 100% rename from transforms/universal/fdedup/utils/Makefile rename to transforms/universal/fdedup/utils/Makefile.local From 6cc18cd8eaba2fb12a31f49af52aba188a9f6ac4 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 08:36:45 -0500 Subject: [PATCH 70/91] Copy whl file to the context folder Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 5 +++-- transforms/universal/fdedup/spark/Dockerfile | 19 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index f8c41791e..a6724e6e7 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -4,6 +4,7 @@ RUN pip install --upgrade --no-cache-dir pip # install pytest RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME # Create a user and use it to run the transform RUN useradd -ms /bin/bash dpk @@ -12,8 +13,8 @@ WORKDIR /home/dpk # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/ -RUN cd data-processing-lib-python && pip install --no-cache-dir -e . +COPY --chown=dpk:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile index a36a7cef7..772dfef79 100644 --- a/transforms/universal/fdedup/spark/Dockerfile +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -1,35 +1,34 @@ ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 - FROM ${BASE_IMAGE} -# USER root # install pytest RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME WORKDIR ${SPARK_HOME}/work-dir # Copy in the data processing framework source/project and install it # This is expected to be placed in the docker context before this is run (see the make image). -COPY --chown=spark:root data-processing-lib-python/ data-processing-lib-python/ -RUN cd data-processing-lib-python && pip install --no-cache-dir -e . -COPY --chown=spark:root data-processing-lib-spark/ data-processing-lib-spark/ -RUN cd data-processing-lib-spark && pip install --no-cache-dir -e . +COPY --chown=spark:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark] + +## Copy the python version of the tansform COPY --chown=spark:root python-transform/ python-transform/ RUN cd python-transform && pip install --no-cache-dir -e . 
-# Install project source +# Install spark project source COPY --chown=spark:root src/ src/ COPY --chown=spark:root pyproject.toml pyproject.toml +COPY --chown=spark:root README.md README.md RUN mkdir -p /opt/spark/work-dir/src/templates && \ mkdir -p /opt/spark/work-dir/config +COPY --chown=spark:root deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY --chown=spark:root deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ # install requirements from requirements.txt COPY requirements.txt . RUN pip3 install -r requirements.txt -COPY deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ -COPY deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ - RUN pip install --no-cache-dir -e . # copy the main() entry point to the image From 9f336203571b07e8486292793599406b87abf830 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 08:38:49 -0500 Subject: [PATCH 71/91] Use keyword args in compute_common_params Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 139a0f919..0a0a4d9bf 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -221,10 +221,10 @@ def fuzzydedup( with dsl.ExitHandler(clean_up_task): # compute execution params compute_common_exec_params = compute_common_params_op( - ray_worker_options, - data_s3_config, - fdedup_num_permutations, - fdedup_n_samples, + worker_options=ray_worker_options, + data_s3_config=data_s3_config, + num_permutations=fdedup_num_permutations, + n_samples=fdedup_n_samples, ) ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, 
data_s3_access_secret) From 528457c5cc91dad1439c72258be92e8030f45015 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 10:42:20 -0500 Subject: [PATCH 72/91] Use dynamic dependencies Signed-off-by: Constantin M Adam --- data-processing-lib/spark/pyproject.toml | 55 ------------------- transforms/universal/fdedup/python/Dockerfile | 2 +- .../universal/fdedup/python/pyproject.toml | 16 +----- .../universal/fdedup/python/requirements.txt | 10 ++++ transforms/universal/fdedup/ray/Dockerfile | 1 + .../universal/fdedup/ray/pyproject.toml | 11 +--- .../universal/fdedup/ray/requirements.txt | 6 ++ .../universal/fdedup/spark/pyproject.toml | 11 ++-- .../universal/fdedup/spark/requirements.txt | 3 +- 9 files changed, 33 insertions(+), 82 deletions(-) delete mode 100644 data-processing-lib/spark/pyproject.toml create mode 100644 transforms/universal/fdedup/python/requirements.txt create mode 100644 transforms/universal/fdedup/ray/requirements.txt diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml deleted file mode 100644 index 89b4d9bf8..000000000 --- a/data-processing-lib/spark/pyproject.toml +++ /dev/null @@ -1,55 +0,0 @@ -[project] -name = "data_prep_toolkit_spark" -version = "0.2.2.dev2" -keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -requires-python = ">=3.10,<3.13" -description = "Data Preparation Toolkit Library for Spark" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyspark>=3.5.2", - "psutil>=6.0.0", - "PyYAML>=6.0.2" -] - -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = 
"https://ibm.github.io/data-prep-kit/" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/data_processing_spark"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index a6724e6e7..280063863 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -19,7 +19,7 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md -#COPY --chown=dpk:root requirements.txt requirements.txt +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index dd58d41d4..97be33d54 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -9,23 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyarrow==16.1.0", - "pyyaml>=6.0.2", - "boto3>=1.34.69", - "kubernetes>=30.1.0", - "polars==1.9.0", - "disjoint-set>=0.8.0", - "scipy>=1.14.1, <2.0.0", - "numpy<1.29.0", - "sentencepiece>=0.2.0", - "mmh3>=4.1.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..4e69a72e4 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +data-prep-toolkit==0.2.2.dev2 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +scipy>=1.14.1, <2.0.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index af32f0fb3..71287ced7 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -20,6 +20,7 @@ RUN cd python-transform && pip install --no-cache-dir -e . COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md +COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source files needed by test-image diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 037525126..cb8c6306a 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -9,18 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_fdedup_transform_python==0.2.2.dev2", - "mmh3>=4.1.0", - "xxhash==3.4.1", - "tqdm==4.66.3", - "scipy>=1.12.0, <2.0.0" -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt new file mode 100644 index 000000000..6ee40ef7f --- /dev/null +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit[ray]==0.2.2.dev2 +dpk_fdedup_transform_python==0.2.2.dev2 +mmh3>=4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index cc66fc044..f77df2010 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -9,10 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "dpk_fdedup_transform_python==0.2.2.dev2", - "data-prep-toolkit-spark==0.2.2.dev2", -] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] 
+dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index 576c028a8..c373ffbb7 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,4 +1,5 @@ -pyarrow==16.1.0 +dpk_fdedup_transform_python==0.2.2.dev2 +data-prep-toolkit[spark]==0.2.2.dev2 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 From fffb6305e7dbd018c343fde736b396db18a3d3d3 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 12:38:00 -0500 Subject: [PATCH 73/91] Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 0a0a4d9bf..fabc4e084 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -279,10 +279,12 @@ def fuzzydedup( prefix="scdata", ) ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component( - execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" - ) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) execute_signature_calc_job.after(compute_signature_calc_exec_params) # Get the parameters for the cluster analysis job @@ -311,7 +313,9 @@ def fuzzydedup( 
server_url=server_url, ) ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( @@ -338,7 +342,9 @@ def fuzzydedup( server_url=server_url, ) ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( @@ -368,10 +374,12 @@ def fuzzydedup( prefix="dcdata", ) ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component( - execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" - ) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" + ) execute_data_cleaning_job.after(compute_data_cleaning_exec_params) From 5547d7fb574b8ebe2f8a98d6656f16faf9537808 Mon Sep 17 00:00:00 2001 From: 
Constantin M Adam Date: Thu, 14 Nov 2024 13:03:02 -0500 Subject: [PATCH 74/91] Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index fabc4e084..683f93210 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -280,7 +280,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) ComponentUtils.set_s3_env_vars_to_component( execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" @@ -314,7 +314,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) @@ -343,7 +343,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) @@ -375,7 +375,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_data_cleaning_job, 
ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) ComponentUtils.set_s3_env_vars_to_component( execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" From 09e56e05dea66de01a023c53978a23497723b698 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 13:06:24 -0500 Subject: [PATCH 75/91] Remove pyproject.toml dependencies Signed-off-by: Constantin M Adam --- data-processing-lib/spark/Makefile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/data-processing-lib/spark/Makefile b/data-processing-lib/spark/Makefile index d4769187b..5fde2bb07 100644 --- a/data-processing-lib/spark/Makefile +++ b/data-processing-lib/spark/Makefile @@ -11,9 +11,14 @@ setup:: set-versions: .check-env $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml - sed -e 's/"pyspark...*",/"pyspark>=${SPARK_VERSION}",/' \ - pyproject.toml > tt.toml - mv tt.toml pyproject.toml + if [ -e pyproject.toml ]; then \ + cat pyproject.toml | sed -e 's/"spark[default]==.*",/"spark[default]==$(SPARK_VERSION)",/' > tt.toml; \ + mv tt.toml pyproject.toml; \ + fi + if [ -e requirements.txt ]; then \ + cat requirements.txt | sed -e 's/ray[default]==.*/ray[default]==$(SPARK_VERSION)/' > tt.txt; \ + mv tt.txt requirements.txt; \ + fi build:: build-dist @@ -26,7 +31,7 @@ publish-dist :: .check-env .defaults.publish-dist publish-image:: .defaults.publish-image -venv:: pyproject.toml +venv:: $(MAKE) .defaults.spark-lib-src-venv pip install pytest pytest-cov From d3eac50704aa8bf032f212a0604430a3f0764cc2 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:24:30 -0500 Subject: [PATCH 76/91] Fix bug in number of actors calculation Signed-off-by: Constantin M Adam --- .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 8 +++++--- 1 
file changed, 5 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 65b7ac2f6..cd3a58b99 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -57,16 +57,18 @@ def compute_common_params( # Also, to keep S3 utilization in check, limit the number of actors to 2000 num_nodes = worker_options["replicas"] cpu_per_node = worker_options["cpu"] - 1 - memory_per_node = 0.85 * worker_options["memory"] + memory_per_node = worker_options["memory"] memory_per_actor = 16 # GB max_num_actors = 2000 num_actors_per_node: int = int(memory_per_node / memory_per_actor) if num_actors_per_node == 0: num_actors_per_node = 1 - num_actors = num_nodes * num_actors_per_node + # never run actors on the head node, so (n - 1) nodes to run actors + num_actors = (num_nodes - 1) * num_actors_per_node + while num_actors > max_num_actors: - num_actors -= num_nodes + num_actors -= num_nodes - 1 num_actors_per_node -= 1 print(f"Number of actors per node = {num_actors_per_node}") cpus_per_actor = cpu_per_node / num_actors_per_node From fa5959b5f90ce90e97a52288be6aee18c06b9068 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:28:39 -0500 Subject: [PATCH 77/91] Cleanup main entry point and local implementation of python transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 4 ++-- .../python/src/cluster_analysis_local_python.py | 5 +++-- .../python/src/cluster_analysis_transform.py | 10 +++++----- .../python/src/data_cleaning_local_python.py | 12 ++++++++---- ...dup_python.py => fdedup_transform_python.py} | 0 ...get_duplicate_list_transform_local_python.py | 6 ++++-- .../python/src/signature_calc_local_python.py | 17 +---------------- 7 files changed, 23 insertions(+), 31 
deletions(-) rename transforms/universal/fdedup/python/src/{fuzzy_dedup_python.py => fdedup_transform_python.py} (100%) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index 280063863..071478870 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -27,8 +27,8 @@ RUN pip install --no-cache-dir -e . COPY src/ src/ # copy source data -COPY ./src/signature_calc_transform_python.py fdedup_transform_python.py -COPY ./src/signature_calc_local_python.py local/ +COPY ./src/fdedup_transform_python.py fdedup_transform_python.py +COPY ./src/fdedup_transform_python.py local/ # copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py index 915cdcd1e..bb785021c 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -21,7 +21,9 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, @@ -42,7 +44,6 @@ if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) # Launch python to process the input diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 412fc1fa8..a9822babe 100644 --- 
a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -140,7 +140,7 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str # Generate the docs_to_remove dataframe docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) - self.logger.info(f"{len(docs_to_remove_dataframe)} documents marked to remove") + self.logger.debug(f"{len(docs_to_remove_dataframe)} documents marked to remove") metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} return [(output_data, output_path)], metadata @@ -187,8 +187,8 @@ def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFra max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") - self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + self.logger.debug(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") cluster_stats = { "groupby_clusters": num_clusters, "cluster_duplicate_docs": sum_cdocs, @@ -226,8 +226,8 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") - self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + self.logger.debug(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") jaccard_stats = { "jaccard_clusters": num_clusters, "jaccard_duplicate_docs": sum_cdocs, diff --git 
a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py index 4295e4e82..aa4aabb90 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -23,15 +23,20 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } duplicate_location = os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", ) ) code_location = {"github": "github", "commit_hash": "12345", "path": "path"} @@ -49,7 +54,6 @@ if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) # Launch the ray actor(s) to process the input diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py similarity index 100% rename from transforms/universal/fdedup/python/src/fuzzy_dedup_python.py rename to transforms/universal/fdedup/python/src/fdedup_transform_python.py diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py 
b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py index be90b3073..34b18ab04 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -21,8 +21,10 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected/cluster_analysis")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "cluster_analysis") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py index 2800c70cd..be395ed4d 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -23,18 +23,9 @@ # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "test_scdata")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = {"input_folder": input_folder, "output_folder": output_folder} code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), -} -s3_config = { - "input_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_data/", - "output_folder": 
"s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_output_data/s3_test_3/", -} params = { # Data access. Only required parameters are specified @@ -47,18 +38,12 @@ "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, - # "scdata_s3_cred": ParamsUtils.convert_to_ast(s3_creds), - # "scdata_s3_config": ParamsUtils.convert_to_ast(s3_config), } if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) - - sys.argv.append("--data_s3_cred") - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) # create launcher launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) From c4f889b37e165e9c0f6243e7cf47d19b1185c521 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:30:40 -0500 Subject: [PATCH 78/91] Cleanup main entry point and local implementation of ray transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 2 +- .../fdedup/ray/src/cluster_analysis_local_ray.py | 4 +++- .../fdedup/ray/src/data_cleaning_local_ray.py | 11 ++++++++--- .../{fuzzy_dedup_ray.py => fdedup_transform_ray.py} | 2 +- .../fdedup/ray/src/signature_calc_local_ray.py | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) rename transforms/universal/fdedup/ray/src/{fuzzy_dedup_ray.py => fdedup_transform_ray.py} (97%) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 71287ced7..4bfe32a9e 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -24,7 +24,7 @@ COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source files needed by test-image -COPY --chown=ray:users ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/fdedup_transform_ray.py fdedup_transform_ray.py COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py index c078746ce..c54ba85c2 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -19,7 +19,9 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index 54fa2ccac..b951e2fc8 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -23,15 +23,20 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) 
local_conf = { "input_folder": input_folder, "output_folder": output_folder, } duplicate_location = os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", ) ) worker_options = {"num_cpus": 0.8} diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py similarity index 97% rename from transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py rename to transforms/universal/fdedup/ray/src/fdedup_transform_ray.py index 987369714..be1bf5fcb 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from fdedup_transform_python import ServiceOrchestrator, parse_args from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py index 64f492584..cb87b56af 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py +++ b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -19,7 +19,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, 
From f3c5be0c276c228710d753b377d539aba634f95c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:32:18 -0500 Subject: [PATCH 79/91] Cleanup main entry point and local implementation of spark transforms Signed-off-by: Constantin M Adam --- ...ark.py => cluster_analysis_local_spark.py} | 30 +++++++++++---- ..._spark.py => data_cleaning_local_spark.py} | 38 +++++++++++++++---- ...dup_spark.py => fdedup_transform_spark.py} | 2 +- ...spark.py => signature_calc_local_spark.py} | 29 ++++++++++---- 4 files changed, 77 insertions(+), 22 deletions(-) rename transforms/universal/fdedup/spark/src/{cluster_analysis_spark.py => cluster_analysis_local_spark.py} (54%) rename transforms/universal/fdedup/spark/src/{data_cleaning_spark.py => data_cleaning_local_spark.py} (50%) rename transforms/universal/fdedup/spark/src/{fuzzy_dedup_spark.py => fdedup_transform_spark.py} (97%) rename transforms/universal/fdedup/spark/src/{signature_calc_spark.py => signature_calc_local_spark.py} (56%) diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py similarity index 54% rename from transforms/universal/fdedup/spark/src/cluster_analysis_spark.py rename to transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py index 83498f59e..c9950657c 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py @@ -19,14 +19,30 @@ from data_processing_spark.runtime.spark import SparkTransformLauncher +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", 
"commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) # Launch the spark worker(s) to process the input diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py similarity index 50% rename from transforms/universal/fdedup/spark/src/data_cleaning_spark.py rename to transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py index 7b6bd626d..9c14c67d8 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -19,14 +19,38 @@ from data_processing_spark.runtime.spark import SparkTransformLauncher +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + 
"docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) # Launch the spark worker(s) to process the input diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py similarity index 97% rename from transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py rename to transforms/universal/fdedup/spark/src/fdedup_transform_spark.py index 58688de42..82767f849 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py @@ -18,7 +18,7 @@ from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing_spark.runtime.spark import SparkTransformLauncher -from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from fdedup_transform_python import ServiceOrchestrator, parse_args from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) 
diff --git a/transforms/universal/fdedup/spark/src/signature_calc_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py similarity index 56% rename from transforms/universal/fdedup/spark/src/signature_calc_spark.py rename to transforms/universal/fdedup/spark/src/signature_calc_local_spark.py index 0e7046549..2db884346 100644 --- a/transforms/universal/fdedup/spark/src/signature_calc_spark.py +++ b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py @@ -21,14 +21,29 @@ ) +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) # Launch the spark worker(s) to process the input From 4941d5bab37a0bdc1e5873ce8e7288483703751f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:46:43 
-0500 Subject: [PATCH 80/91] Cleanup main entry point and local implementation of spark transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Dockerfile | 6 +----- .../universal/fdedup/spark/src/data_cleaning_local_spark.py | 4 ++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile index 772dfef79..b04994d46 100644 --- a/transforms/universal/fdedup/spark/Dockerfile +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -32,11 +32,7 @@ RUN pip3 install -r requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/signature_calc_spark.py . - -# copy some of the samples in -COPY src/signature_calc_transform_spark.py fdedup_transform_spark.py -COPY src/signature_calc_spark.py local/fdedup_local_spark.py +COPY ./src/fdedup_transform_spark.py . # copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py index 9c14c67d8..eb1e61845 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -14,6 +14,10 @@ import sys import polars as pl +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher From 9c82fe0fb9734fb317ad8f18bfd940fe8fe361cb Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 18 Nov 2024 13:58:38 -0500 Subject: [PATCH 81/91] Added documentation for python, ray, spark and kfp Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/README.md | 19 +- transforms/universal/fdedup/kfp_ray/README.md | 14 +- 
transforms/universal/fdedup/python/README.md | 239 +++++++++++++++++- transforms/universal/fdedup/ray/README.md | 211 ++++------------ transforms/universal/fdedup/spark/README.md | 150 ++++------- 5 files changed, 348 insertions(+), 285 deletions(-) diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index e128566d2..fed3c1370 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -1,10 +1,11 @@ -# Fuzzy Deduplification Transform -The fdedup transforms removes documents that are very similar to each other within a set of parquet files, -per the set of -[transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: +# Fuzzy Deduplication Transform +The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a +set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. +Per the set of [transform project conventions](../../README.md#transform-project-conventions) the following runtimes are available: -* [ray](ray/README.md) - enables the running of the base python transformation -in a Ray runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image -in a kubernetes cluster using a generated `yaml` file. +* [python](python/README.md) - enables running the base transform in a pure python environment +* [ray](ray/README.md) - enables running the base python transform in a Ray runtime +* [spark](spark/README.md) - enables running the base python transform in a spark runtime +* [kfp](kfp_ray/README.md) - enables running the ray docker image in a kubernetes cluster using a generated `yaml` file. + +Please check [here](python/README.md) for a more detailed description of this transform. 
diff --git a/transforms/universal/fdedup/kfp_ray/README.md b/transforms/universal/fdedup/kfp_ray/README.md index 97fd45a69..75eb77a08 100644 --- a/transforms/universal/fdedup/kfp_ray/README.md +++ b/transforms/universal/fdedup/kfp_ray/README.md @@ -1,8 +1,8 @@ -# Fuzzy Deduplication Ray-base KubeFlow Pipeline Transformation +# Fuzzy Deduplication Ray-based KubeFlow Pipeline Transformation ## Summary -This project allows execution of the [noop Ray transform](../ray) as a +This project allows execution of the [fuzzy dedup Ray transform](../ray) as a [KubeFlow Pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/) The detail pipeline is presented in the [Simplest Transform pipeline tutorial](../../../../kfp/doc/simple_transform_pipeline.md) @@ -16,13 +16,9 @@ make workflow-build from the directory. It creates a virtual environment (make workflow-venv) and after that compiles the pipeline definitions in the folder. The virtual environment is created once for all transformers. -Note: the pipelines definitions can be compiled and executed on KFPv1 and KFPv2. Meantime, KFPv1 is our default. If you -prefer KFPv2, please do the following: -```shell -make clean -export KFPv2=1 -make workflow-build -``` +## Considerations +Currently, fuzzy dedup KFP pipeline definitions can be compiled and executed on KFPv1. 
KFPv2 is not +supported currently, because of this issue: https://github.com/kubeflow/pipelines/issues/10914 The next steps are described in [Deploying a pipeline](../../../../kfp/doc/simple_transform_pipeline.md#deploying-a-pipeline-) and [Executing pipeline and watching execution results](../../../../kfp/doc/simple_transform_pipeline.md#executing-pipeline-and-watching-execution-results-) \ No newline at end of file diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md index 34f18c73b..d2d940344 100644 --- a/transforms/universal/fdedup/python/README.md +++ b/transforms/universal/fdedup/python/README.md @@ -5,7 +5,240 @@ Please see the set of for details on general project conventions, transform configuration, testing and IDE set up. -## Summary +## Contributors +- Nelson Bore (kibnelson@gmail.com) +- Constantin Adam (cmadam@us.ibm.com) -The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see -[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details. \ No newline at end of file +## Description +The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a +set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. + +Fuzzy dedup is a complex process made up of a pipeline that performs four main steps: + +1. **Signature Calculation**: creates a set of minhashes for each document, and uses them to create band signatures for +the document. +2. **Cluster Analysis**: groups documents into clusters based on matching band signatures. Within each cluster, it +retains only the documents that have a Jaccard similarity above a specified threshold, and it identifies which documents +to keep as unique and which ones to mark as duplicates. +3. 
**Duplicate List Generation**: combines the similarity clusters identified in each band to create a single, unified +list of duplicate documents. +4. **Data Cleaning**: processes the documents by either filtering out duplicates or adding annotations to distinguish +duplicates from non-duplicates. + +Each one of these steps is described in more detail below. + +### Signature Calculation + +This transform computes `num_permutations` minhashes and `num_bands` signatures for each document in the dataset, by +following these processing steps: +1. **Shingle Generation**: create a set of character or word shingles, using a specified window length. Character +shingles are more effective at detecting similar documents, but require more computational resources compared to word +shingles. +2. **Minhash Calculation**: using the shingles as input, compute `num_permutations` minhashes for each document. +3. **Band Signature Calculation**: divide the minhashes into `num_bands`, where each band contains +`num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. + +The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard +similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils) folder generates a graph of this +probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact +the deduplication process. + +To help distribute the workload and speed up processing of the next steps, the hash space of each band is divided into +`num_segments` segments. The band signatures, the minhashes, the document ids, and lengths are stored in an organized +output folder structure `bands/band=b/segment=s`, where `b` is the band number and `s` is the segment number. 
+ +### Cluster Analysis + +This transform leverages segmented processing to analyze the data generated by the **Signature Calculation** step +efficiently and in parallel. Each worker processes a specific segment `s` of a band `b` by loading and analyzing all +Parquet files from the folder `bands/band=b/segment=s`. Each row in the Parquet files contains, for a document: +* `band_hash`, the document's band signature, and +* `data`, a structure with three fields: the unique `document_id`, document's `minhashes`, and `document_size`. + +The transform runs the following processing steps: +1. **Data Loading**: combine into a single dataframe all Parquet files in `bands/band=b/segment=s`. +2. **Clustering**: run a `group_by` operation on the `band_hash` column that will group documents with the same band +signature into clusters. +3. **Similarity Analysis**: for each cluster, calculate Jaccard similarity between pairs of documents using their +minhashes, and move documents below the specified Jaccard similarity threshold into new clusters. +4. **Duplicate Identification**: in clusters with more than one document remaining, retain the largest document with the +smallest document id, and mark as duplicates all other documents in the cluster. +5. **Persist Results**: save the duplicate clusters in a file. + +### Duplicate List Generation + +The **Cluster Analysis** step identifies duplicates across multiple bands, meaning a document can be marked as a +duplicate in one or more bands (e.g., if two documents are identical, one will be marked as a duplicate in all bands). +This transform consolidates all duplicate information from each band segment into a single file, providing a unified +record of duplicates detected across the dataset. + +### Data Cleaning + +This transform processes the original dataset using the list of duplicate documents generated by the **Duplicate List +Generation** step. 
It imports each file in the original dataset into a table and produces a new dataset. The directory +structure of the input dataset is preserved, but the contents of the output files depend on the selected operating mode: +1. **Annotate** - add a new `duplicate` column to the dataset, that contains a `d` for documents marked as duplicates, +and is empty for non-duplicates +2. **Filter duplicates** - removes all documents identified as duplicates from the dataset. +3. **Filter non-duplicates** - removes from the dataset all documents that were not marked as duplicates, leaving only +the duplicates. + +The output dataset reflects the selected mode, providing flexibility for downstream processing. + +## Input Columns Used by This Transform + +| Input Column Name | Data Type | Description | +|---------------------------------------------------------------------|-----------|----------------------------------| +| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | +| Column specified by the _document_id_column_ configuration argument | int64 | Column that stores document ID | + +## Output Columns Annotated by This Transform +| Output Column Name | Data Type | Description | +|------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| duplicate | str | Column added if fuzzy dedup runs in 'annotate' mode. 
Value is 'd' for duplicate documents, empty for non-duplicates | + +## Configuration and Usage +### Fuzzy Deduplication Transform +The set of dictionary keys holding [Fuzzy Dedup](src/fdedup_transform_python.py) configuration for values are as +follows: +```text +--input_folder INPUT_FOLDER + Input folder path +--output_folder OUTPUT_FOLDER + Output folder path +--operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents +--contents_column CONTENTS_COLUMN + name of the column that stores document text +--document_id_column DOCUMENT_ID_COLUMN + name of the column that stores document ID +--seed SEED seed of the random number generator +--num_permutations NUM_PERMUTATIONS + number of permutations to use for minhash calculation +--num_bands NUM_BANDS + number of bands to use for band hash calculation +--num_minhashes_per_band NUM_MINHASHES_PER_BAND + number of minhashes to use in each band +--word_shingle_size WORD_SHINGLE_SIZE + number of words included in one shingle +--jaccard_similarity_threshold JACCARD_SIMILARITY_THRESHOLD + jaccard similarity threshold above which two documents are similar +--num_segments NUM_SEGMENTS + the number of segments dividing the hashing space for each band (for scalability) +--duplicate_list_location DUPLICATE_LIST_LOCATION + path to the file with all the duplicate document ids +--services SERVICES Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning) +--use_s3 USE_S3 use s3 +--s3_cred S3_CRED ast string of options for s3 credentials +--shingle_option SHINGLE_OPTION + Option used for shingling + +``` + +### Signature Calculation Transform +The set of dictionary keys holding [SignatureCalcTransform](src/signature_calc_transform.py) configuration for values +are as follows: +```text +--minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN + name of the column 
storing the unique ID assigned to each document +--minhash_contents_column MINHASH_CONTENTS_COLUMN + name of the column storing the contents of each document +--minhash_seed MINHASH_SEED + the seed used to instantiate the random number generator +--minhash_num_permutations MINHASH_NUM_PERMUTATIONS + number of permutations (minhashes) calculated for each document +--minhash_word_shingle_size MINHASH_WORD_SHINGLE_SIZE + the size of the word shingles calculated for each document +--minhash_num_bands MINHASH_NUM_BANDS + the number of bands to use in the banding technique +--minhash_num_minhashes_per_band MINHASH_NUM_MINHASHES_PER_BAND + the number of minhashes to use in each band +--minhash_num_segments MINHASH_NUM_SEGMENTS + the number of segments across which we divide the hashing space for each band +--minhash_shingle_option MINHASH_SHINGLE_OPTION + Shingling option ('word' or 'char') +``` + +### Cluster Analysis Transform +The set of dictionary keys holding [ClusterAnalysisTransform](src/cluster_analysis_transform.py) configuration for values +are as follows: +```text +--cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD + Jaccard similarity threshold above which two documents are duplicates +--cluster_num_bands CLUSTER_NUM_BANDS + The number of bands used in the banding technique +--cluster_num_segments CLUSTER_NUM_SEGMENTS + The number of segments dividing the hashing space for each band +``` + +### Get Duplicates List Transform +This transform currently has no configuration parameters. 
+ +### Data Cleaning Transform +The set of dictionary keys holding [DataCleaningTransform](src/data_cleaning_transform.py) configuration for values +are as follows: +```text + --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document + --fdclean_operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents +``` + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_python.py +``` +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_python.py +python cluster_analysis_local_python.py +python get_duplicate_list_local_python.py +python data_cleaning_local_python.py +``` +After running the transforms, execute: +```shell +ls output +``` +To see results of the transform. + +### Code example + +TBD (link to the notebook will be provided) + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a pure python runtime, use the following `make` targets. 
To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + +## Further Resources +The following is a list of references to research articles and github repositories that inspired the module's design: + +1. [Jure Leskovec, Anand Rajaraman, Jeff Ullman, Mining of Massive Datasets, Chapter 3: Finding Similar Items](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) +2. [G Penedo et al., The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/pdf/2406.17557) +3. [Datatrove github repo](https://github.com/huggingface/datatrove) diff --git a/transforms/universal/fdedup/ray/README.md b/transforms/universal/fdedup/ray/README.md index 41be44301..d93be3a4a 100644 --- a/transforms/universal/fdedup/ray/README.md +++ b/transforms/universal/fdedup/ray/README.md @@ -1,185 +1,45 @@ # Fuzzy Dedup -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. +Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +configuration, testing and IDE set up. ## Summary -The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see -[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details. The architecture of the implementation is presented here: +This project wraps the [Fuzzy Dedup transform](../python) with a Ray runtime. 
-![](images/fuzzy.png) +## Configuration and command line Options -The main components of implementation are driver, processors (implemented as actor pools) - table processor, table -filter and bucket hash processor, and hash actors - minhash, buckets and docs. - -The complication of mapping this model to transform model is the fact that in this model assumes a two pass processing, -while a transform model is a single pass. The solution to this mismatch is to use transform runtime to implement the -first path and use the native transform pipeline to implement filtering. - -## Transform runtime -The [transform runtime](src/fdedup_transform_ray.py) is implementing complete first path of the fuzzy deduping: -* creates bucket and minhash collectors -* implements initial file processing to populate bucket and minhash caches -* creates doc collectors -* implement bucket processing -* Clean up everything except for doc collectors in preparation to filter, that is implemented by the framework proper -The main components of runtime are described below - -### TableProcessor Actor - -[Table processing actor](src/fdedup_transform_ray.py) is implemented following framework itself is implemented as a pair - -`FdedupTransform` implementing the actual transformation and and -[transform table processor](../../../../data-processing-lib/src/data_processing/runtime/ray/transform_table_processor.py) -(from the framework itself). - -### DocsMinHash Actor - -This [actor](src/fdedup_support.py) stores MInHashes - -### BucketsHash Actor - -This actor [actor](src/fdedup_support.py) - -### BucketHashProcessor - -BucketHash [actor](src/fdedup_support.py) implement the actual buckets processing, removing duplicates. -Implementation of this actor allows to better manage this "expensive" process, by using Actor pool load balancing -thus minimizing overall time for this operation. Instead of pre partitioning buckets, it is using dynamic load -partitioning. 
We also are processing "longest" buckets first thus further improving performance. To further improve -the overall performance we can in future implement bucket splitting - its faster to process more smaller buckets -then the long ones - -### BucketHashProcessor - -This [actor](src/fdedup_support.py) is queueing up requests to the `BucketHashProcessor` actor pool, which load -balances their execution - -### DocCollector Actor - -This [actor](src/fdedup_support.py) is a collector for unique documents - -## Transformer - -In the fuzzy dedup implementation, the [transformer](src/fdedup_transform_ray.py) only implements filtering. For every -table, it checks document ids with the `DocumentsCollector` cache and removes all of the rows which do not have ids in -the hash - -## Snapshotting - -Fuzzy dedup often runs on very large data sets and implements three very distinct phases: -* Building buckets -* Processing buckets -* Filtering data -To improve recoverability of fuzzy dedup, current implementation includes snapshotting - at the end of the first two -phases we snapshot the current state of execution - bucket and minhash actors after the first phase and document actors -after the second. This snapshotting provide code with the ability to restart from the existing snapshot. You can use one -of two configuration flags (assuming snapshots exist): -* `use_bucket_snapshot` to start from the second phase -* `use_doc_snapshot` to start from the third phase - -## Building - -A [docker file](Dockerfile) that can be used for building docker image. 
You can use - -```shell -make build to build it -``` - -### Configuration and command line Options - -The set of dictionary keys holding [BlockListTransform](src/blocklist_transform.py) -configuration for values are as follows: - -* _bucket_cpu_ - specifies number of CPUs for bucket actor -* _doc_cpu_ - specifies number of CPUs for doc actor -* _mhash_cpu_ - specifies number of CPUs for minhash actor -* _num_doc_actors_ - specifies number of doc actors -* _num_bucket_actors_ - specifies number of bucket actors -* _num_minhash_actors_ - specifies number of minhash actors -* _num_preprocessors_ - specifies number of preprocessors -* _num_permutations_ - specifies number of permutations -* _threshold_ - specifies threshold -* _shingles_size_ - specifies shingles size -* _japanese_data_ - specifies whether to use japanese specific document splitting -* _delimiters_ - specifies delimiter for non japanese document splitting -* _snapshot_delay_ - delay between different actors reading/writing snapshot not to overwhelm storage -* -use_bucket_snapshot_ - run from the existing buckets snapshot (bypass building buckets) -* -use_doc_snapshot_ - run from the existing docs snapshot (bypass building and processing buckets) - -Above you see both parameters and their values for small runs (tens of files). We also provide an -[estimate](src/cluster_estimator.py) to roughly determine cluster size for running transformer. +Fuzzy Dedup configuration and command line options are the same as for the base python transform. ## Running - - -### Launched Command Line Options +### Launched Command Line Options When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). 
- -```shell - --fdedup_doc_column FDEDUP_DOC_COLUMN - document column name - --fdedup_id_column FDEDUP_ID_COLUMN - integer document id column name - --fdedup_cluster_column FDEDUP_CLUSTER_COLUMN - cluster column name - --fdedup_bucket_cpu FDEDUP_BUCKET_CPU - number of CPUs per bucket hash - --fdedup_mhash_cpu FDEDUP_MHASH_CPU - number of CPUs per minhash hash - --fdedup_doc_cpu FDEDUP_DOC_CPU - number of CPUs per doc hash - --fdedup_num_doc_actors FDEDUP_NUM_DOC_ACTORS - number of doc actors to use - --fdedup_num_minhash_actors FDEDUP_NUM_MINHASH_ACTORS - number of minhash actors to use - --fdedup_num_bucket_actors FDEDUP_NUM_BUCKET_ACTORS - number of bucket actors to use - --fdedup_num_preprocessors FDEDUP_NUM_PREPROCESSORS - number of preprocessors to use - --fdedup_num_permutations FDEDUP_NUM_PERMUTATIONS - number of permutations - --fdedup_threshold FDEDUP_THRESHOLD - threshold - --fdedup_shingles_size FDEDUP_SHINGLES_SIZE - number of words in shingle - --fdedup_delimiters FDEDUP_DELIMITERS - delimiter for splitting document - --fdedup_snapshot_delay FDEDUP_SNAPSHOT_DELAY - snapshot delay time - --fdedup_use_bucket_snapshot FDEDUP_USE_BUCKET_SNAPSHOT - flag to continue with bucket snapshot - --fdedup_use_doc_snapshot FDEDUP_USE_DOC_SNAPSHOT - flag to continue with doc snapshot - --fdedup_random_delay_limit FDEDUP_RANDOM_DELAY_LIMIT - maximum delay between read -``` - -These correspond to the configuration keys described above. +in addition to the options available to the transform as defined [here](../python/README.md), +the set of +[ray launcher options](../../../../data-processing-lib/doc/ray-launcher-options.md) is also available. 
### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/fdedup_transform_ray.py using command line args -* `run-local-sample` - runs src/fdedup_local_ray.py -* `run-s3-sample` - runs src/fdedup_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. +To run the samples, use the following `make` target to create a virtual environment: -For example, -```shell -make run-cli-sample -... +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_ray.py ``` -Then +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_ray.py +python cluster_analysis_local_ray.py +python get_duplicate_list_local_ray.py +python data_cleaning_local_ray.py +``` +After running the transforms, execute: ```shell ls output ``` @@ -190,3 +50,18 @@ To see results of the transform. To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a ray runtime, use the following `make` targets. 
To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` \ No newline at end of file diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md index 3bf9b3245..dd0294aed 100644 --- a/transforms/universal/fdedup/spark/README.md +++ b/transforms/universal/fdedup/spark/README.md @@ -1,109 +1,67 @@ -# Spark-GUF +# Fuzzy Dedup -This is an implementation of Spark data processing modules. At a high level, every Spark application consists of a driver program that runs the user’s main function and executes various parallel operations on a cluster. +Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +configuration, testing and IDE set up. -The modules can run locally or remotely in a Kubernetes cluster. +## Summary -## Running Transforms locally +This project wraps the [Fuzzy Dedup transform](../python) with a Spark runtime. -Start in the `spark-guf` directory. To run the modules locally, follow these steps: -1. Create a virtual environment using this command - ``` - make venv - ``` -2. Activate the virtual environment: - ``` - source venv/bin/activate - ``` +## Configuration and command line Options -3. Set the `PYTHONPATH` environment variable to include the `src` directory: - ``` - export PYTHONPATH=${PYTHONPATH}:${PWD}/src - ``` -4. Invoke one of the transforms: - ``` - python src/transforms/spark_pi/spark_transformer_pi.py - ``` -5. 
To find out which arguments a transform takes, run that transform with a `--help` flag: - ``` - python src/transforms/spark_filter/spark_filter_transform.py --help - usage: spark_filter_transform.py [-h] --input_folder INPUT_FOLDER --output_folder OUTPUT_FOLDER [--data_type DATA_TYPE] - --filter_criteria_list FILTER_CRITERIA_LIST [--filter_columns_to_drop FILTER_COLUMNS_TO_DROP] - [--filter_logical_operator {AND,OR}] +Fuzzy Dedup configuration and command line options are the same as for the base python transform. - optional arguments: - -h, --help show this help message and exit - --input_folder INPUT_FOLDER - path to read the input files (local fs or s3) - --output_folder OUTPUT_FOLDER - path to write the output files (local fs or s3) - --data_type DATA_TYPE - Type of files to filter (parquet, orc, csv, json, txt) - --filter_criteria_list FILTER_CRITERIA_LIST - list of filter criteria (in SQL WHERE clause format), for example: [ "docq_total_words > 100 AND docq_total_words < 200", "docq_perplex_score < 230", "date_acquired BETWEEN '2023-07-04' - AND '2023-07-08'", "title LIKE 'https://%'", "document_id IN ('doc-id-1', 'doc-id-2', 'doc-id-3')" ] - --filter_columns_to_drop FILTER_COLUMNS_TO_DROP - list of columns to drop after filtering, for example: ["column1", "column2"] - --filter_logical_operator {AND,OR} - logical operator (AND or OR) that joins filter criteria - ``` +## Running +### Launched Command Line Options +When running the transform with the Spark launcher (i.e. SparkTransformLauncher), +in addition to the options available to the transform as defined [here](../python/README.md), +the set of +[spark launcher options](../../../../data-processing-lib/doc/spark-launcher-options.md) is also available. -## Running Transforms in Kubernetes/OpenShift +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: -Start in the `spark-guf` directory. 
To run the transforms in a Kubernetes or OpenShift cluster, follow these steps: +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_spark.py +``` +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_spark.py +python cluster_analysis_local_spark.py +python get_duplicate_list_local_spark.py +python data_cleaning_local_spark.py +``` +After running the transforms, execute: +```shell +ls output +``` +To see results of the transform. -1. Build and push a pyspark base docker image (this example assumes that images are pushed to the Docker hub, but same approach can be used to push images to icr.io, or quai.io: - ``` - docker build -t my-docker-username/my-pyspark:3.5.1 . - docker push my-docker-username/my-pyspark:3.5.1 - ``` -2. Build and push a specific transform image (this will use the pyspark built in the previous point as the base image): - ``` - docker build -t my-docker-username/my-pyspark-filter:3.5.1 -f src/transforms/spark_filter/Dockerfile --build-arg BASE_IMAGE=my-docker-username/my-pyspark:3.5.1 . - docker push my-docker-username/my-pyspark-filter:3.5.1 - ``` +### Transforming data using the transform image -3. Configure the `spark` service account (note that you can use any other service account name, but you will need then to replace `spark` with `your-service-account-name` in all the yaml files listed below). 
This is a one-time process to perform for each namespace where you want to run spark apps: - ``` - # create 'spark' service account - kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-serviceaccount.yaml --namespace=my-namespace +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. - # create 'spark' role - kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role.yaml --namespace=my-namespace +## Testing - # bind the 'spark' service account to the 'spark' role - kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role-binding.yaml --namespace=my-namespace +For testing fuzzy deduplication in a spark runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` - # bind the 'spark' service account to the cluster roles - kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-edit-role-binding.yaml --namespace=my-namespace - kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-cluster-role-binding.yaml --namespace=my-namespace - ``` - - 4. Create any secrets that are needed to access S3 folders used for input or output of the transforms. Follow [this link](https://github.com/aws-samples/machine-learning-using-k8s/blob/master/docs/aws-creds-secret.md) for more information on how to build the S3 secrets. - - 5. Edit a pod yaml file from the `deployment/kubernetes/pods` directory. The steps below refer to the [yaml file used to build the filter pod] (deployment/kubernetes/pods/spark-driver-pod-filter.yaml): - 1. Give a name to the pod (`metadata/name`), the container launched inside the pod (`spec/containers/name`), and the Spark application (the `APP_NAME` variable in `spec/containers/env`). - 2. 
Specify the namespace where the pod will be created (`metadata/namespace`). Use the same namespace for the `EXECUTOR_NAMESPACE` variable in `spec/containers/env`) - 3. Specify the command to launch the Spark application (in `spec/containers/args`) - 4. Specify the image used by the driver (`spec/containers/image` - usually this is the transform image built under point 2). - 5. Specify the image used by the executors (`EXECUTOR_DOCKER_IMAGE` variable in `spec/containers/env`) - 6. Specify the service account to use by the driver (`spec/containers/serviceAccount`) and by the executors(the `SERVICE_ACCOUNT` variable in `spec/containers/env`) - 7. Configure S3: - 1. Specify the input (`AWS_ENDPOINT_URL_IN`) and output (`AWS_ENDPOINT_URL_OUT`) endpoint URLs. - 2. Specify the input and out access key ids and secret access keys. - -6. Launch the Spark application by creating the driver pod: - ``` - kubectl apply -f deployment/kubernetes/pod/spark-driver-pod-filter.yaml - ``` - -7. Monitor the creation of the executor pods: - ``` - kubectl get pods -w - ``` - -8. Monitor the driver logs: - ``` - kubectl logs spark-driver-pod-filter -f - ``` - ``` +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` \ No newline at end of file From ed4e9c1f8cfb77084d095d99200b68355cc059f4 Mon Sep 17 00:00:00 2001 From: Shahrokh Daijavad Date: Mon, 18 Nov 2024 16:40:37 -0800 Subject: [PATCH 82/91] Update README.md utils folder is one level up from the python folder --- transforms/universal/fdedup/python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md index d2d940344..295862221 100644 --- a/transforms/universal/fdedup/python/README.md +++ b/transforms/universal/fdedup/python/README.md @@ -39,7 +39,7 @@ shingles. `num_minhashes_per_band` minhashes. 
For each document, generate a unique signature for every band. The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard -similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils) folder generates a graph of this +similarity will be marked as duplicates. A Jupyter notebook in the [utils](../utils) folder generates a graph of this probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact the deduplication process. From fb5601a7eefa66236b9d2b42edbebc476b509606 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Tue, 19 Nov 2024 14:28:42 -0500 Subject: [PATCH 83/91] Code cleanup and bug fixes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 3 -- .../python/src/cluster_analysis_transform.py | 43 ++++++++--------- .../python/src/data_cleaning_transform.py | 12 ++--- .../python/src/fdedup_transform_python.py | 29 +++++++++-- .../src/get_duplicate_list_transform.py | 23 +++------ .../python/src/signature_calc_transform.py | 48 +++++++++---------- .../src/signature_calc_transform_python.py | 2 +- 7 files changed, 82 insertions(+), 78 deletions(-) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index 071478870..79c85e4ac 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -23,9 +23,6 @@ COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
-# copy source data -COPY src/ src/ - # copy source data COPY ./src/fdedup_transform_python.py fdedup_transform_python.py COPY ./src/fdedup_transform_python.py local/ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index a9822babe..16febc0dc 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -13,13 +13,17 @@ import os import re from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple +from typing import Any, List import numpy as np import polars as pl -import pyarrow as pa from data_processing.transform import AbstractFolderTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger +from data_processing.utils import ( + CLIArgumentProvider, + TransformUtils, + UnrecoverableException, + get_logger, +) from Murmur_MH import Murmur_MH @@ -86,7 +90,7 @@ class ClusterAnalysisTransform(AbstractFolderTransform): to keep (the largest size document), and mark the other documents as duplicates. The resulting clusters are saved in a file for further analysis. 
- Args: + The following internal variables are initialized from the config parameter: num_bands: number of bands used in the banding technique jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates num_segments: the number of segments dividing the hashing space for each band @@ -106,12 +110,14 @@ def __init__(self, config: dict[str, Any]): ) self.sort_output = config.get(sort_output_key, sort_output_default) self.data_access = config.get("data_access") + if self.data_access is None: + raise UnrecoverableException("Could not get a pointer to the data access object inside the transform.") self.logger = get_logger(__name__) def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: self.logger.info(f"Cluster analysis for folder {folder_name}") metadata = {} - input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( path=input_folder, extensions=[".parquet"], @@ -125,17 +131,17 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str segment = int(match.group(2)) else: raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") - output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_folder = TransformUtils.clean_path(self.data_access.output_folder) output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") # consolidate into a single data frame band hashes computed by workers - band_segment_dataframe, consolidation_stats = self.consolidate_band_segment_files(files) + band_segment_dataframe, consolidation_stats = self._consolidate_band_segment_files(files) metadata |= consolidation_stats # cluster grouping by band hashes - cluster_dataframe, cluster_stats = self.get_clusters(band_segment_dataframe) + 
cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe) metadata |= cluster_stats # cluster analysis using jaccard similarity - jaccard_cluster_dataframe, jaccard_stats = self.analyze_clusters(cluster_dataframe) + jaccard_cluster_dataframe, jaccard_stats = self._analyze_clusters(cluster_dataframe) metadata |= jaccard_stats # Generate the docs_to_remove dataframe docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") @@ -144,14 +150,7 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} return [(output_data, output_path)], metadata - def sanitize_folder_name(self, folder_name: str) -> str: - if "://" in folder_name: - _, folder_name = folder_name.split("://") - if folder_name[-1] != "/": - folder_name = f"{folder_name}/" - return folder_name - - def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + def _consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: band_segment_dataframe = pl.DataFrame() total_input_rows = 0 for fname, contents in files.items(): @@ -170,7 +169,7 @@ def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.Da } return band_segment_dataframe, consolidation_stats - def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + def _get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( pl.col("cluster_length") > 1 @@ -195,14 +194,14 @@ def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFra } return cluster_dataframe, cluster_stats - def analyze_clusters(self, df: pl.DataFrame) -> 
tuple[pl.DataFrame, dict[str, Any]]: + def _analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: # Define the schema with specific data types schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} doc_ids_lists = [] docs_to_remove_lists = [] len_of_docs2remove_lists = [] for row in df.iter_rows(named=True): - doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self.jaccard_distance_calculation(row) + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self._jaccard_distance_calculation(row) doc_ids_lists += doc_ids_list docs_to_remove_lists += docs_to_remove_list len_of_docs2remove_lists += len_of_docs2remove_list @@ -236,7 +235,7 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc") return filtered_jaccard_dataframe, jaccard_stats - def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + def _jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: # Process row and return a new list of Series or a new row threshold = self.jaccard_similarity_threshold doc_ids_list = [] @@ -321,7 +320,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: f"--{sort_output_cli_param}", type=bool, default=sort_output_default, - help="Sort", + help="Sort the similarity clusters by the document ID of the kept doc (used primarily for testing)", ) def apply_input_params(self, args: Namespace) -> bool: diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 74597068c..3403bfc42 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -12,14 +12,13 @@ import io import os from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple 
+from typing import Any -import numpy as np import polars as pl import pyarrow as pa from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger +from data_processing.utils import CLIArgumentProvider, get_logger short_name = "fdclean" @@ -69,8 +68,9 @@ class DataCleaningTransform(AbstractTableTransform): keeps the directory structure of the input dataset, but has all the fuzzy duplicates removed. - Args: - duplicate_location: location (local or s3) of the duplicate document list + The following internal variables are initialized from the config dictionary: + duplicate_list_location: location (local or s3) of the duplicate document list + operation_mode: one of annotate, filter_duplicates, or filter_non_duplicates """ def __init__(self, config: dict[str, Any]): @@ -90,7 +90,7 @@ def __init__(self, config: dict[str, Any]): self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug(f"Transforming table with {table.num_rows} rows from file {file_name}") input_df = pl.from_arrow(table) # handle the case when the doc_id columns in the input dataframe and the # docs_to_remove_df have different types, i.e. 
one is int32 and the diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index b77f44401..166e48e26 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -115,17 +115,38 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) sys_argv.append("--data_s3_cred") sys_argv.append(s3_cred_ast) + if service_name == "minhash": + sys_argv.append("--scdata_s3_cred") + sys_argv.append(s3_cred_ast) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_cred") + sys_argv.append(s3_cred_ast) elif ( s3_creds.get("access_key") is not None and s3_creds.get("secret_key") is not None and s3_creds.get("url") is not None ): + ast_s3_cred = ParamsUtils.convert_to_ast(s3_creds) sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + sys_argv.append(ast_s3_cred) + if service_name == "minhash": + sys_argv.append("--scdata_s3_cred") + sys_argv.append(ast_s3_cred) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_cred") + sys_argv.append(ast_s3_cred) sys_argv.append("--data_s3_config") else: sys_argv.append("--data_local_config") - sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + ast_data_io = ParamsUtils.convert_to_ast(data_io) + sys_argv.append(ast_data_io) + if in_args.use_s3: + if service_name == "minhash": + sys_argv.append("--scdata_s3_config") + sys_argv.append(ast_data_io) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_config") + sys_argv.append(ast_data_io) return sys_argv def execute_service(self, service_short_name: str, params: list) -> int: @@ -163,9 +184,9 @@ def parse_args() -> argparse.Namespace: "--contents_column", type=str, required=False, help="name of the column that stores document text" ) parser.add_argument( - 
"--document_id_column", type=str, required=False, help="name of the column that stores document text" + "--document_id_column", type=str, required=False, help="name of the column that stores document ID" ) - parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument("--seed", type=int, required=False, help="seed of the random number generator") parser.add_argument( "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation" ) diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py index c49124cf1..c14c4bdce 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py @@ -11,16 +11,12 @@ ################################################################################ import io import os -import re from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple +from typing import Any -import numpy as np import polars as pl -import pyarrow as pa from data_processing.transform import AbstractFolderTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger -from Murmur_MH import Murmur_MH short_name = "fdlist" @@ -61,7 +57,7 @@ class GetDuplicateListTransform(AbstractFolderTransform): This is an intermediate step of the fuzzy dedup pipeline. It runs in a single location and consolidates in a single file all the duplicates found for each band segment. 
- Args: + These internal variables are initialized from the config dictionary: subfolder: name of the subfolder with the duplicate records consolidated_filename: name of the file with the consolidated list of duplicates """ @@ -82,7 +78,7 @@ def __init__(self, config: dict[str, Any]): def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: self.logger.info(f"Get Duplicate List for folder {folder_name}") metadata = {} - input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( path=input_folder, extensions=[".parquet"], @@ -90,24 +86,17 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str ) if retries > 0: metadata |= {"data_access_retries": retries} - output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_folder = TransformUtils.clean_path(self.data_access.output_folder) output_path = os.path.join(output_folder, self.consolidated_filename) # consolidate into a single data frame band hashes computed by workers - consolidated_dataframe, consolidation_stats = self.consolidate_docs_to_remove_files(files) + consolidated_dataframe, consolidation_stats = self._consolidate_docs_to_remove_files(files) self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") metadata |= consolidation_stats output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) return [(output_data, output_path)], metadata - def sanitize_folder_name(self, folder_name: str) -> str: - if "://" in folder_name: - _, folder_name = folder_name.split("://") - if folder_name[-1] != "/": - folder_name = f"{folder_name}/" - return folder_name - - def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + def 
_consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: consolidated_dataframe = pl.DataFrame() total_input_rows = 0 for fname, contents in files.items(): diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 6b14e1ba0..4e64bcb5a 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -14,7 +14,7 @@ import unicodedata from argparse import ArgumentParser, Namespace from pathlib import Path -from typing import Any, List +from typing import Any import mmh3 import numpy as np @@ -22,7 +22,7 @@ import pyarrow as pa from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider +from data_processing.utils import CLIArgumentProvider, UnrecoverableException from Murmur_MH import Murmur_MH @@ -129,16 +129,13 @@ class SignatureCalculationTransform(AbstractTableTransform): """ This is the first transform of the fuzzy dedup pipeline. First, it calculates, for each document in a dataset, `num_permutations` minhashes. It accepts as - input the number of bands and the length of each band. If those two parameters - are not specified, then, based on the values of `jaccard_similarity_threshold` - and `num_permutations`, it determines the optimal number of bands, and the - length of each band (how many minhashes will be used to get the signature for - each band). The band signatures, the minhashes and the document lengths are + input the number of bands and the length of each band (the number of minhashes + per band). The band signatures, the minhashes and the document lengths are then saved in the output folder, under a folder structure `bands/band=b/segment=s`.
To improve scalability of the next step of fuzzy dedup, the hash space of each band is divided into `num_segments` segments. - Args: + The following internal variables are retrieved from the config parameter: document_id_column: name of the column storing the unique ID assigned to each document contents_column_cli_param: name of the column storing the contents of each document seed: the seed used to instantiate the random number generator @@ -171,21 +168,22 @@ def __init__(self, config: dict[str, Any]): self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) self.shingle_option = config.get(shingle_option_key, shingle_option_default) # use this dataframe to store the minhashes and size for each document - self.all_minhashes: pl.DataFrame = None + self.all_minhashes = None # use this dataframe to store the band hashes for each document - self.all_band_hashes: pl.DataFrame = None + self.all_band_hashes = None # this variable keeps track of how many files were processed since last # data write to properly update metadata self.files_processed = 0 self.bytes_processed = 0 self.data_access = config.get("data_access") + if self.data_access is None: + raise UnrecoverableException("Could not get a pointer to the data access object inside the transform.") self.last_file_name = None + self.sc_data_access = config.get(sigcalc_data_access_key, None) - if self.sc_data_access is None: - self.sc_daf = config.get(sigcalc_data_factory_key, None) - if self.sc_daf is None: - raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") - self.sc_data_access = self.sc_daf.create_data_access() + self.sc_daf = config.get(sigcalc_data_factory_key, None) + if self.sc_daf is None: + raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -194,7 +192,7 @@ def transform(self, table: pa.Table, 
file_name: str = None) -> tuple[list[pa.Tab This implementation makes no modifications so effectively implements a copy of the input parquet to the output folder, without modification. """ - self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug(f"Transforming table with {table.num_rows} rows from file {file_name}") self.logger.debug("----minhash---") self.last_file_name = file_name self.files_processed += 1 @@ -226,7 +224,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab self.all_minhashes = self.all_minhashes.vstack(minhashes) # Calculate band hashes - band_hashes_list = self.process_rows_into_bands( + band_hashes_list = self._process_rows_into_bands( minhashes, self.num_bands, self.num_rows, @@ -247,7 +245,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab self.all_band_hashes = self.all_band_hashes.vstack(band_hashes) if len(self.all_minhashes) > 750000: - tables, metadata = self.write_band_signatures() + tables, metadata = self._write_band_signatures() else: tables = [] metadata = {} @@ -266,14 +264,16 @@ def flush(self) -> tuple[list[pa.Table], dict[str, Any]]: """ self.logger.info(f"Starting flush()") if self.all_band_hashes is not None and self.all_minhashes is not None: - tables, metadata = self.write_band_signatures() + tables, metadata = self._write_band_signatures() else: tables = [] metadata = {} return tables, metadata - def write_band_signatures(self): + def _write_band_signatures(self): # define the upper and lower bounds of each band segment + if self.sc_data_access is None: + self.sc_data_access = self.sc_daf.create_data_access() segment_bounds_list = [] upper_bound = np.uint64(np.iinfo(np.uint64).max) segment_len = np.uint64(upper_bound // self.num_segments) @@ -325,7 +325,6 @@ def write_band_signatures(self): self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure") # append the 
table to the result list, and the path to metadata - common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) last_file_name_path = Path(self.last_file_name) suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) if self.sc_data_access.output_folder is None: @@ -389,7 +388,7 @@ def _generate_word_shingles( k_shingles.append(delimiter.join(words[i : i + window_size])) return k_shingles, doc_len, document_id - def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b: int, r: int, seed: int = 42): + def _emit_bands(self, int_id_column: str, minhashes: np.array, b: int, r: int, seed: int = 42): num_minhashes = len(minhashes) assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" results = [] @@ -403,13 +402,12 @@ def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b return results # Apply the function - def process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + def _process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): result = [] for row in df.iter_rows(): - bands = self.emit_bands( + bands = self._emit_bands( row[0], # document id np.array(row[1], dtype=np.uint32), # minhashes - row[2], # document length minhashlsh_num_bands, minhashlsh_length_band, ) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py index 5ddc102eb..40e0e97e3 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py @@ -40,5 +40,5 @@ def __init__(self): if __name__ == "__main__": launcher = PythonTransformLauncher(SignatureCalculationTransformConfiguration()) - logger.info("Launching noop transform") + logger.info("Launching fuzzy dedup signature calculation transform") 
launcher.launch() From 0636d5f998c61d9169fbd5afb3d124aa6b1bad4f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 21 Nov 2024 00:44:28 -0500 Subject: [PATCH 84/91] Reduce the amount of logging Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/cluster_analysis_transform.py | 2 +- .../universal/fdedup/python/src/data_cleaning_transform.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 16febc0dc..fa3ce6d28 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -115,7 +115,7 @@ def __init__(self, config: dict[str, Any]): self.logger = get_logger(__name__) def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: - self.logger.info(f"Cluster analysis for folder {folder_name}") + self.logger.debug(f"Cluster analysis for folder {folder_name}") metadata = {} input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 3403bfc42..cb07923ae 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -86,7 +86,7 @@ def __init__(self, config: dict[str, Any]): self.operation_mode = config.get(operation_mode_key, operation_mode_default) contents = config.get("df") self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) - self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.logger.debug(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") self.docs_to_remove_df = 
self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: From d58518bfe9d52eacd0063909267cabafb1f546dc Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 21 Nov 2024 00:45:39 -0500 Subject: [PATCH 85/91] Cleanup KFP pipeline code Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 40 +++--- .../src/fdedup_compute_execution_params.py | 134 ++++++++++-------- 2 files changed, 92 insertions(+), 82 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 683f93210..ffc6f79bc 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -115,22 +115,23 @@ def fuzzydedup( ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = { - "cpu": 1, - "memory": 4, + "cpu": 8, + "memory": 64, "image": task_image, "image_pull_secret": image_pull_secret, "imagePullPolicy": "Always", }, ray_worker_options: dict = { - "replicas": 2, - "max_replicas": 2, - "min_replicas": 2, - "cpu": 2, - "memory": 4, + "replicas": 10, + "max_replicas": 10, + "min_replicas": 10, + "cpu": 16, + "memory": 128, "image": task_image, "image_pull_secret": image_pull_secret, "imagePullPolicy": "Always", }, + runtime_actor_options: dict = {"num_cpus": 0.8, "memory": 16}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. 
checkpointing is not supported by dedup data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", @@ -153,10 +154,6 @@ def fuzzydedup( fdedup_shingle_option: str = "word", fdedup_jaccard_similarity_threshold: float = 0.75, fdedup_seed: int = 42, - fdedup_docs_to_remove_folder: str = "docs_to_remove", - fdedup_duplicate_list_location: str = os.path.join( - "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" - ), fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, @@ -206,8 +203,6 @@ def fuzzydedup( :param fdedup_shingle_option - type of shingle, one of 'word', or 'char' :param fdedup_jaccard_similarity_threshold - similarity threshold :param fdedup_seed - seed for the random number generator - :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids - :param fdedup_duplicate_list_location - name of the file holding the consolidated list of duplicates :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None @@ -222,6 +217,7 @@ def fuzzydedup( # compute execution params compute_common_exec_params = compute_common_params_op( worker_options=ray_worker_options, + actor_options=runtime_actor_options, data_s3_config=data_s3_config, num_permutations=fdedup_num_permutations, n_samples=fdedup_n_samples, @@ -229,8 +225,9 @@ def fuzzydedup( ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) fdedup_num_segments = compute_common_exec_params.outputs["num_segments"] - runtime_actor_cpus = compute_common_exec_params.outputs["cpus_per_actor"] runtime_num_actors = 
compute_common_exec_params.outputs["num_actors"] + runtime_actor_cpus = compute_common_exec_params.outputs["actor_cpu"] + runtime_actor_memory = compute_common_exec_params.outputs["actor_memory"] # start Ray cluster ray_cluster = create_ray_op( @@ -246,8 +243,9 @@ def fuzzydedup( # Get the parameters for the signature calculation job compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, @@ -289,8 +287,9 @@ def fuzzydedup( # Get the parameters for the cluster analysis job compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, @@ -319,16 +318,15 @@ def fuzzydedup( execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - duplicate_docids_folder=fdedup_docs_to_remove_folder, - duplicate_list_location=fdedup_duplicate_list_location, ) ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) @@ -348,8 +346,9 @@ def fuzzydedup( 
execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, @@ -357,7 +356,6 @@ def fuzzydedup( runtime_job_id=run_id, runtime_code_location=runtime_code_location, id_column=fdedup_document_id_column, - duplicate_list_location=fdedup_duplicate_list_location, operation_mode=fdedup_operation_mode, ) ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index cd3a58b99..15722c164 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -10,15 +10,27 @@ # limitations under the License. 
################################################################################ -from typing import Any, Dict, NamedTuple +from typing import Any, NamedTuple def compute_common_params( worker_options: dict, # ray worker configuration + actor_options: dict, # actor desired configuration data_s3_config: str, # S3 configuration num_permutations: int, # number of permutations (minhashes) per document n_samples: int, # files to sample for number of documents estimation -) -> NamedTuple("fdedup_params", [("num_segments", int), ("num_actors", int), ("cpus_per_actor", float)]): +) -> NamedTuple( + "fdedup_params", [("num_segments", int), ("num_actors", str), ("actor_cpu", float), ("actor_memory", int)] +): + """ + Compute fuzzy dedup execution parameters common to all the transforms + :param worker_options: worker group configuration + :param actor_options: desired actor configuration + :param data_s3_config: s3 configuration + :param num_permutations: number of permutations + :param n_samples: number of samples used to estimate the total number of documents in the dataset + :return: fdedup_params NamedTuple: num_segments - int, num_actors - str, cpus (float) and memory (int) per actor + """ import sys @@ -40,49 +52,45 @@ def compute_common_params( print(f"Estimated number of documents and documents size is zero. 
Please verify the input path.") sys.exit(1) print(f"Estimated number of docs: {number_of_docs}") + actor_cpu: float = actor_options.get("num_cpus", 1) # if num_cpus not specified, request 1 CPU per actor + actor_memory: int = int(actor_options.get("memory", 16)) * GB # if memory not specified, request 16 GB per actor + # Calculate the number of segments # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where: # 8 bytes are taken by the band hash # (num_permutations * 4) bytes are taken by the min hashes # 20 bytes to provide some extra space for storage in a table # The total amount of space needed by a band is number_of_docs * doc_bytes. - # To scale the handling of this data, divide each band into segments, where each segment size is below 3GB + # To scale band handling, divide each band into segments, each smaller than 1/6 of an actor's allocated memory doc_bytes = 8 + num_permutations * 4 + 20 band_bytes = number_of_docs * doc_bytes - num_segments = 1 + (band_bytes // (3 * GB)) + num_segments = 1 + (band_bytes // (actor_memory // 6)) print(f"Number of segments: {num_segments}") - # To process data efficiently, each actor needs 16GB of memory. - # The actor config controls CPU allocation, not memory; - # use CPU allocation s.t. the number of actors on a worker provides access to 16GB of memory for each actor. 
- # Also, to keep S3 utilization in check, limit the number of actors to 2000 - num_nodes = worker_options["replicas"] - cpu_per_node = worker_options["cpu"] - 1 - memory_per_node = worker_options["memory"] - - memory_per_actor = 16 # GB - max_num_actors = 2000 - num_actors_per_node: int = int(memory_per_node / memory_per_actor) - if num_actors_per_node == 0: - num_actors_per_node = 1 - # never run actors on the head node, so (n - 1) nodes to run actors - num_actors = (num_nodes - 1) * num_actors_per_node - - while num_actors > max_num_actors: - num_actors -= num_nodes - 1 - num_actors_per_node -= 1 - print(f"Number of actors per node = {num_actors_per_node}") - cpus_per_actor = cpu_per_node / num_actors_per_node - print(f"CPUs per actor = {cpus_per_actor}") + # Calculate number of actors, using KFPUtils.default_compute_execution_params() + # Create new dict with memory expressed in bytes, as expected by KFPUtils.default_compute_execution_params() + actor_config = { + "num_cpus": actor_cpu, + "memory": actor_memory, + } + num_actors = KFPUtils.default_compute_execution_params(str(worker_options), str(actor_config)) + print(f"num_actors = {num_actors}") from collections import namedtuple - fdedup_params = namedtuple("fdedup_params", ["num_segments", "num_actors", "cpus_per_actor"]) - return fdedup_params(num_segments, num_actors, cpus_per_actor) + fdedup_params = namedtuple( + typename="fdedup_params", + field_names=["num_segments", "num_actors", "actor_cpu", "actor_memory"], + ) + print( + f"num_segments = {num_segments}, num_actors = {num_actors}, actor_cpu = {actor_cpu}, actor_memory = {actor_memory}" + ) + return fdedup_params(num_segments, num_actors, actor_cpu, actor_memory) def signature_calc_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + 
runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -103,8 +111,9 @@ def signature_calc_compute_execution_params( """ Compute fuzzy dedup execution parameters for signature calculation - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -116,23 +125,22 @@ def signature_calc_compute_execution_params( :param num_permutations: number of permutations :param num_bands: number of bands :param num_minhashes_per_band: band length - :param word_shingle_size: number of words in shingle + :param word_shingle_size: number of words/chars in shingle :param shingle_option: str: type of shingle, one of 'word' or 'char' :param threshold: threshold, :param num_segments: number of segments :param seed: seed for the random number generator - :return: a dictionary with a Ray Job execution parameters + :return: dictionary with Ray Job execution parameters """ # fuzzy parameters for signature calculation - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} - print(f"runtime_actor_options = {runtime_actor_options}") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + 
"runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -151,8 +159,9 @@ def signature_calc_compute_execution_params( def cluster_analysis_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -166,8 +175,9 @@ def cluster_analysis_compute_execution_params( """ Compute fuzzy dedup execution parameters for cluster analysis - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -189,13 +199,13 @@ def cluster_analysis_compute_execution_params( data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": 
runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -206,47 +216,48 @@ def cluster_analysis_compute_execution_params( def get_duplicate_list_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process runtime_pipeline_id: str, # pipeline id runtime_job_id: str, # job id runtime_code_location: dict, # code location - duplicate_docids_folder: str, # folder with the docs IDs to remove - duplicate_list_location: str, # location of the list of duplicate doc ids ) -> dict: """ Compute fuzzy dedup execution parameters for get duplicate list step - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process :param runtime_pipeline_id: pipeline id :param runtime_job_id: job id :param runtime_code_location: code location - :param duplicate_docids_folder: folder with the docs IDs to remove - :param duplicate_list_location: location of the list of duplicate doc ids :return: a dictionary with a Ray Job 
execution parameters """ import json + import os # fuzzy parameters + duplicate_docids_folder: str = "docs_to_remove" + duplicate_list_location: str = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") # Get cluster parameters data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) base_folder = data_s3_config_dict.get("output_folder") data_s3_config_dict["input_folder"] = base_folder data_s3_config_dict["output_folder"] = base_folder data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -256,8 +267,9 @@ def get_duplicate_list_compute_execution_params( def data_cleaning_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -265,13 +277,13 @@ def data_cleaning_compute_execution_params( runtime_job_id: str, # job id runtime_code_location: dict, # code location id_column: str, # integer document id column name - duplicate_list_location: str, # location of the list of duplicate doc ids operation_mode: str, # filter 
(non-)duplicates or annotate ) -> dict: """ Compute fuzzy dedup execution parameters - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -279,7 +291,6 @@ def data_cleaning_compute_execution_params( :param runtime_job_id: job id :param runtime_code_location: code location :param id_column: integer document id column name - :param duplicate_list_location: location of the list of duplicate doc ids :param operation_mode: filter (non-)duplicates or annotate :return: a dictionary with a Ray Job execution parameters """ @@ -298,13 +309,14 @@ def data_cleaning_compute_execution_params( output_subfolder = "annotated" data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + duplicate_list_location: str = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), From 280d105a1b5ced45ae4fc7d5bdf4123e86669022 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 22 Nov 2024 20:21:02 -0500 Subject: 
[PATCH 86/91] added fdedup to build package for all transforms Signed-off-by: Maroun Touma --- transforms/pyproject.toml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 2357553e4..badb8bbd9 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2.dev3" +version = "0.2.2.dev4" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -44,6 +44,7 @@ all = { file = [ "universal/hap/python/requirements.txt", "universal/tokenization/python/requirements.txt", "universal/ededup/python/requirements.txt", +"universal/fdedup/python/requirements.txt", "universal/profiler/python/requirements.txt", "universal/doc_id/python/requirements.txt", "universal/filter/python/requirements.txt", @@ -71,6 +72,7 @@ pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]} hap = { file = ["universal/hap/python/requirements.txt"]} tokenization = { file = ["universal/tokenization/python/requirements.txt"]} ededup = { file = ["universal/ededup/python/requirements.txt"]} +fdedup = { file = ["universal/fdedup/python/requirements.txt"]} profiler = { file = ["universal/profiler/python/requirements.txt"]} doc_id = { file = ["universal/doc_id/python/requirements.txt"]} filter = { file = ["universal/filter/python/requirements.txt"]} @@ -80,11 +82,8 @@ web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them -[tool.setuptools.package-data] -"*" = ["*.txt"] - -[tool.setuptools.packages.find] -where = ["src"] +#[tool.setuptools.package-data] +#"*" = ["*.txt"] #[tool.setuptools.package-dir] #dpk_web2parquet = 
"universal/web2parquet/dpk_web2parquet" From 1a762e01d6cdd3af08c5983c6bcf81e175ab3627 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 11:53:18 -0500 Subject: [PATCH 87/91] First draft of fdedup notebook Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 152 +++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 transforms/universal/fdedup/fdedup.ipynb diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb new file mode 100644 index 000000000..ee1d9b561 --- /dev/null +++ b/transforms/universal/fdedup/fdedup.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from 
data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_ray import RayServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(\"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"ray\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"num_segments\": 1,\n", + " \"operation_mode\": \"annotate\",\n", + " # ray configuration parameters\n", + " \"run_locally\": True,\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = RayServiceOrchestrator(global_params=args)\n", + "# Launch ray fuzzy dedup execution\n", + "orchestrator.orchestrate()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ffebdc1c388440a1b03e4efe88178405b4c569dc Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 13:52:51 -0500 Subject: [PATCH 88/91] Added sample ray fuzzy dedup jupyter notebook Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 71 ++++++++++++++++--- .../python/src/fdedup_transform_python.py | 10 +++ 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb index ee1d9b561..88bcd87aa 100644 --- a/transforms/universal/fdedup/fdedup.ipynb +++ b/transforms/universal/fdedup/fdedup.ipynb @@ -67,8 +67,8 @@ "outputs": [], "source": [ "# create parameters\n", - "input_folder = os.path.join(\"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join( \"ray\", \"output\")\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -79,7 +79,7 @@ " \"num_bands\": 14,\n", " \"num_minhashes_per_band\": 8,\n", " \"num_segments\": 1,\n", - " \"operation_mode\": \"annotate\",\n", + " \"operation_mode\": \"filter_duplicates\",\n", " # ray configuration parameters\n", " 
\"run_locally\": True,\n", "}\n" @@ -90,7 +90,7 @@ "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", "metadata": {}, "source": [ - "##### ***** Use ray runtime to invoke the transform" + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" ] }, { @@ -100,12 +100,13 @@ "metadata": {}, "outputs": [], "source": [ - "%%capture\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "args = parse_args()\n", "# Initialize the orchestrator\n", "orchestrator = RayServiceOrchestrator(global_params=args)\n", "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()\n" + "orchestrator.orchestrate()" ] }, { @@ -124,15 +125,65 @@ "outputs": [], "source": [ "import glob\n", - "glob.glob(\"python/output/*\")" + "glob.glob(\"ray/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + 
"metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "fdedup_ray", "language": "python", - "name": "python3" + "name": "fdedup_ray" }, "language_info": { "codemirror_mode": { @@ -144,7 +195,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index 166e48e26..b200676da 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -147,6 +147,8 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: if service_name == "fdclean": sys_argv.append("--dcdata_s3_config") sys_argv.append(ast_data_io) + if in_args.run_locally: + sys_argv.append(f"--run_locally={in_args.run_locally}") return sys_argv def execute_service(self, service_short_name: str, params: list) -> int: @@ -240,6 +242,7 @@ def parse_args() -> argparse.Namespace: default=None, help="ast string of options for s3 credentials", ) + parser.add_argument( "--shingle_option", type=str, @@ -248,6 +251,13 @@ def parse_args() -> argparse.Namespace: help="Option used for shingling", ) + parser.add_argument( + "--run_locally", + type=lambda x: bool(str2bool(x)), + default=True, + help="run locally or connect to a remote machine", + ) + return parser.parse_args() From 75fc4d1464d4d8c83dc0a087528c46c873a46d2f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 16:59:26 -0500 Subject: [PATCH 89/91] Add jupyter notebooks for python, ray and spark fuzzy dedup Signed-off-by: Constantin M Adam --- .../universal/fdedup/fdedup_python.ipynb | 215 ++++++++++++++++++ transforms/universal/fdedup/fdedup_ray.ipynb | 214 +++++++++++++++++ 
.../universal/fdedup/fdedup_spark.ipynb | 212 +++++++++++++++++ .../python/src/fdedup_transform_python.py | 2 +- 4 files changed, 642 insertions(+), 1 deletion(-) create mode 100644 transforms/universal/fdedup/fdedup_python.ipynb create mode 100644 transforms/universal/fdedup/fdedup_ray.ipynb create mode 100644 transforms/universal/fdedup/fdedup_spark.ipynb diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb new file mode 100644 index 000000000..83f9bd600 --- /dev/null +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from 
fdedup_transform_python import parse_args, ServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/ray/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/ray/output/ | folder that contains the all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " 
\"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = ServiceOrchestrator(global_params=args)\n", + "# Launch python fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"python/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", + "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", + "input_df = input_df_1.vstack(input_df_2)\n", + "\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", + "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", + "output_df = output_df_1.vstack(output_df_2)\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_ray", + "language": "python", + 
"name": "fdedup_ray" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb new file mode 100644 index 000000000..533ca019f --- /dev/null +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import 
parse_args\n", + "from fdedup_transform_ray import RayServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/ray/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/ray/output/ | folder that contains all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |\n", + "| run_locally:bool | true | if true, launch a ray cluster locally, otherwise connect to an already existing cluster | \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " 
\"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + " # ray configuration parameters\n", + " \"run_locally\": True,\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = RayServiceOrchestrator(global_params=args)\n", + "# Launch ray fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"ray/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_ray", + "language": "python", + "name": "fdedup_ray" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/fdedup_spark.ipynb 
b/transforms/universal/fdedup/fdedup_spark.ipynb new file mode 100644 index 000000000..9f4bf1772 --- /dev/null +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -0,0 +1,212 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_spark import SparkServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. 
For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/spark/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/spark/output/ | folder that contains all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use spark 
runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = SparkServiceOrchestrator(global_params=args)\n", + "# Launch spark fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"spark/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " 
print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_spark", + "language": "python", + "name": "fdedup_spark" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index b200676da..def3590e4 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -254,7 +254,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--run_locally", type=lambda x: bool(str2bool(x)), - default=True, + default=False, help="run locally or connect to a remote machine", ) From edd5841bb199c974489a8f612968c587bdeebab3 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 17:08:43 -0500 Subject: [PATCH 90/91] Add jupyter notebooks for python, ray and spark fuzzy dedup Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 203 ----------------------- 1 file changed, 203 deletions(-) delete mode 100644 transforms/universal/fdedup/fdedup.ipynb diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb deleted file mode 100644 index 88bcd87aa..000000000 --- a/transforms/universal/fdedup/fdedup.ipynb +++ /dev/null @@ -1,203 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "afd55886-5f5b-4794-838e-ef8179fb0394", - "metadata": {}, - "source": [ - "##### **** These pip installs need to be adapted to use the 
appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", - "```\n", - "make venv\n", - "source venv/bin/activate && pip install jupyterlab\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "## This is here as a reference only\n", - "# Users and application developers must use the right tag for the latest from pypi\n", - "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" - ] - }, - { - "cell_type": "markdown", - "id": "ebf1f782-0e61-485c-8670-81066beb734c", - "metadata": {}, - "source": [ - "##### ***** Import required Classes and modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args\n", - "from fdedup_transform_ray import RayServiceOrchestrator" - ] - }, - { - "cell_type": "markdown", - "id": "7234563c-2924-4150-8a31-4aec98c1bf33", - "metadata": {}, - "source": [ - "##### ***** Setup runtime parameters for this transform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", - "params = {\n", - " # transform configuration parameters\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - " 
\"contents_column\": \"contents\",\n", - " \"document_id_column\": \"int_id_column\",\n", - " \"num_permutations\": 112,\n", - " \"num_bands\": 14,\n", - " \"num_minhashes_per_band\": 8,\n", - " \"num_segments\": 1,\n", - " \"operation_mode\": \"filter_duplicates\",\n", - " # ray configuration parameters\n", - " \"run_locally\": True,\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", - "metadata": {}, - "source": [ - "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "args = parse_args()\n", - "# Initialize the orchestrator\n", - "orchestrator = RayServiceOrchestrator(global_params=args)\n", - "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()" - ] - }, - { - "cell_type": "markdown", - "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", - "metadata": {}, - "source": [ - "##### **** The specified folder will include the transformed parquet files." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7276fe84-6512-4605-ab65-747351e13a7c", - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "glob.glob(\"ray/output/cleaned/*\")" - ] - }, - { - "cell_type": "markdown", - "id": "d30489d9-fc98-423e-90a8-e8f372787e88", - "metadata": {}, - "source": [ - "***** print the input data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", - "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", - " print(input_df)" - ] - }, - { - "cell_type": "markdown", - "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", - "metadata": {}, - "source": [ - "***** print the output result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", - "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", - " print(output_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d60e391d-cf58-47ae-9991-04c05d114edc", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fdedup_ray", - "language": "python", - "name": "fdedup_ray" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 7ae1f135ccc3ba7bb4cc4ff500b0e070b7d30b7b Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: 
Tue, 26 Nov 2024 07:01:52 -0500 Subject: [PATCH 91/91] remove reference to noop transform project Signed-off-by: Maroun Touma --- data-processing-lib/pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 40bf6b2a1..a347a14a1 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -16,7 +16,6 @@ dynamic = ["dependencies", "optional-dependencies"] Repository = "https://github.com/IBM/data-prep-kit" Issues = "https://github.com/IBM/data-prep-kit/issues" Documentation = "https://ibm.github.io/data-prep-kit/doc" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]