From 47f4526cd5217dd55a88185fdc99c93fed00953e Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 10 Oct 2024 19:05:39 +0100
Subject: [PATCH 001/105] added folder_transform

---
 .../pure_python/transform_file_processor.py   | 15 ++++--
 .../pure_python/transform_orchestrator.py     | 42 ++++++++++------
 .../runtime/transform_file_processor.py       | 41 ++++++++-------
 .../src/data_processing/transform/__init__.py |  2 +
 .../transform/abstract_transform.py           | 16 ++++++
 .../transform/binary_transform.py             |  5 +-
 .../transform/folder_transform.py             | 50 +++++++++++++++++++
 .../runtime/ray/transform_file_processor.py   |  1 +
 .../runtime/ray/transform_orchestrator.py     | 19 ++++---
 .../runtime/spark/transform_file_processor.py |  5 +-
 .../runtime/spark/transform_orchestrator.py   | 25 +++++++---
 11 files changed, 168 insertions(+), 53 deletions(-)
 create mode 100644 data-processing-lib/python/src/data_processing/transform/abstract_transform.py
 create mode 100644 data-processing-lib/python/src/data_processing/transform/folder_transform.py

diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
index 143835dd0..fa3e69e4a 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
@@ -14,7 +14,7 @@
 
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.runtime import AbstractTransformFileProcessor
-from data_processing.transform import AbstractBinaryTransform, TransformStatistics
+from data_processing.transform import AbstractTransform, TransformStatistics
 from data_processing.utils import UnrecoverableException
 
 
@@ -28,7 +28,8 @@ def __init__(
         self,
         data_access_factory: DataAccessFactoryBase,
         statistics: TransformStatistics,
         transform_params: dict[str, Any],
-        transform_class: type[AbstractBinaryTransform],
+        transform_class: type[AbstractTransform],
+        is_folder: bool,
     ):
         """
         Init method
         :param data_access_factory - data access factory
         :param statistics - reference to statistics class
         :param transform_params - transform parameters
         :param transform_class: transform class
+        :param is_folder: folder transform flag
         """
         # invoke superclass
         super().__init__(
             data_access_factory=data_access_factory,
             transform_parameters=dict(transform_params),
+            is_folder=is_folder,
         )
         self.transform_params["statistics"] = statistics
         # Create local processor
@@ -52,7 +55,8 @@ def __init__(
         # Create statistics
         self.stats = statistics
 
-    def _publish_stats(self, stats: dict[str, Any]) -> None:
+
+def _publish_stats(self, stats: dict[str, Any]) -> None:
         self.stats.add_stats(stats)
 
 
@@ -65,17 +69,20 @@ def __init__(
         self,
         data_access_factory: DataAccessFactoryBase,
         transform_params: dict[str, Any],
-        transform_class: type[AbstractBinaryTransform],
+        transform_class: type[AbstractTransform],
+        is_folder: bool
     ):
         """
         Init method
         :param data_access_factory - data access factory
         :param transform_params - transform parameters
         :param transform_class: transform class
+        :param is_folder: folder transform flag
         """
         super().__init__(
             data_access_factory=data_access_factory,
             transform_parameters=dict(transform_params),
+            is_folder=is_folder,
         )
         # Add data access and statistics to the processor parameters
         self.transform_params["data_access"] = self.data_access
diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py 
b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 8692da29e..153eaaf0a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,7 +24,7 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger @@ -48,8 +48,6 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } - - def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -74,15 +72,21 @@ def orchestrate( return 1 # create additional execution parameters runtime = runtime_config.create_transform_runtime() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - if retries > 0: - statistics.add_stats({"data access retries": retries}) - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + if retries > 0: + statistics.add_stats({"data access retries": retries}) + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -99,6 +103,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) else: # using sequential execution @@ -111,6 +116,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) status = "success" return_code = 0 @@ -157,7 +163,8 @@ def _process_transforms( data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ) -> None: """ Process transforms sequentially @@ -167,9 +174,8 @@ def _process_transforms( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag :return: metadata for the execution - - :return: None """ # create executor executor = PythonTransformFileProcessor( @@ -177,6 +183,7 @@ def _process_transforms( statistics=statistics, transform_params=transform_params, transform_class=transform_class, + is_folder=is_folder, ) # process data t_start = time.time() @@ -203,6 +210,7 @@ def _process_transforms_multiprocessor( data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractBinaryTransform], + is_folder: bool ) -> TransformStatistics: """ Process 
transforms using multiprocessing pool
@@ -212,13 +220,17 @@ def _process_transforms_multiprocessor(
     :param data_access_factory: data access factory
     :param transform_params - transform parameters
     :param transform_class: transform class
+    :param is_folder: folder transform flag
     :return: metadata for the execution
     """
     # result statistics
     statistics = TransformStatistics()
     # create processor
     processor = PythonPoolTransformFileProcessor(
-        data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class
+        data_access_factory=data_access_factory,
+        transform_params=transform_params,
+        transform_class=transform_class,
+        is_folder=is_folder,
     )
     completed = 0
     t_start = time.time()
diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py
index d4ec548d8..1d268875f 100644
--- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py
+++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py
@@ -26,11 +26,13 @@ def __init__(
         self,
         data_access_factory: DataAccessFactoryBase,
         transform_parameters: dict[str, Any],
+        is_folder: bool = False,
     ):
         """
         Init method
         :param data_access_factory: Data Access Factory
         :param transform_parameters: Transform parameters
         :param is_folder: folder transform flag
         """
         self.logger = get_logger(__name__)
         # validate parameters
@@ -46,6 +48,7 @@ def __init__(
         # Add data access and statistics to the processor parameters
         self.transform_params = transform_parameters
         self.transform_params["data_access"] = self.data_access
+        self.is_folder = is_folder
 
     def process_file(self, f_name: str) -> None:
         """
@@ -58,25 +61,29 @@ def process_file(self, f_name: str) -> None:
             self.logger.warning("No data_access found. Returning.")
             return
         t_start = time.time()
-        # Read source file
-        filedata, retries = self.data_access.get_file(path=f_name)
-        if retries > 0:
-            self._publish_stats({"data access retries": retries})
-        if filedata is None:
-            self.logger.warning(f"File read resulted in None for {f_name}. Returning.")
-            self._publish_stats({"failed_reads": 1})
-            return
-        self._publish_stats({"source_files": 1, "source_size": len(filedata)})
+        if not self.is_folder:
+            # Read source file only if we are processing a file
+            filedata, retries = self.data_access.get_file(path=f_name)
+            if retries > 0:
+                self._publish_stats({"data access retries": retries})
+            if filedata is None:
+                self.logger.warning(f"File read resulted in None for {f_name}. 
Returning.") + self._publish_stats({"failed_reads": 1}) + return + self._publish_stats({"source_files": 1, "source_size": len(filedata)}) # Process input file try: - # execute local processing - name_extension = TransformUtils.get_file_extension(f_name) self.logger.debug(f"Begin transforming file {f_name}") - out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + if not self.is_folder: + # execute local processing + out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + name_extension = TransformUtils.get_file_extension(f_name) + self.last_file_name = name_extension[0] + self.last_file_name_next_index = None + self.last_extension = name_extension[1] + else: + out_files, stats = self.transform.transform(folder_name=f_name) self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") - self.last_file_name = name_extension[0] - self.last_file_name_next_index = None - self.last_extension = name_extension[1] # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) # Process unrecoverable exceptions @@ -95,10 +102,10 @@ def flush(self) -> None: the hook for them to return back locally stored data and their statistics. :return: None """ - if self.last_file_name is None: + if self.last_file_name is None or self.is_folder: # for some reason a given worker never processed anything. Happens in testing # when the amount of workers is greater than the amount of files - self.logger.debug("skipping flush, no name for file is defined") + self.logger.debug("skipping flush, no name for file is defined or this is a folder transform") return try: t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 6af43ad60..20254e47b 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -1,3 +1,5 @@ +from data_processing.transform.abstract_transform import AbstractTransform +from data_processing.transform.folder_transform import AbstractFolderTransform from data_processing.transform.binary_transform import AbstractBinaryTransform from data_processing.transform.table_transform import AbstractTableTransform from data_processing.transform.transform_statistics import TransformStatistics diff --git a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py new file mode 100644 index 000000000..89db70f42 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py @@ -0,0 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+class AbstractTransform:
+    """
+    Base class for all transform types
+    """
\ No newline at end of file
diff --git a/data-processing-lib/python/src/data_processing/transform/binary_transform.py b/data-processing-lib/python/src/data_processing/transform/binary_transform.py
index 80dff61ea..b313aff2f 100644
--- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py
+++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py
@@ -10,10 +10,11 @@
 # limitations under the License.
 ################################################################################
 
-from typing import Any, TypeVar
+from typing import Any
 
+from data_processing.transform import AbstractTransform
 
-class AbstractBinaryTransform:
+class AbstractBinaryTransform(AbstractTransform):
     """
     Converts input binary file to output file(s) (binary)
     Sub-classes must provide the transform() method to provide the conversion of one binary file to 0 or
diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py
new file mode 100644
index 000000000..866e3286f
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py
@@ -0,0 +1,50 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import Any
+from data_processing.data_access import data_access
+from data_processing.transform import AbstractTransform
+
+
+class AbstractFolderTransform(AbstractTransform):
+    """
+    Converts input folder to output file(s) (binary)
+    Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or
+    more new binary files and metadata.
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This simply stores the given configuration in this instance for later use.
+        """
+        self.config = config
+
+    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        Converts input folder into 0 or more output files.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed.
+        :param folder_name: the name of the folder containing an arbitrary amount of files.
+        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
+                 to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
+                 holding the extension to be used when writing out the new bytes.
+        """
+        raise NotImplemented()
+
+    @staticmethod
+    def get_folders(data_access:data_access) -> list(str):
+        """
+        Compute the list of folders to use.
+ :param data_access - data access class + :return: + """ + raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py index e1fabb144..cdad1309f 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py @@ -35,6 +35,7 @@ def __init__(self, params: dict[str, Any]): super().__init__( data_access_factory=params.get("data_access_factory", None), transform_parameters=dict(params.get("transform_params", {})), + is_folder=params.get("is_folder", False) ) # Create statistics self.stats = params.get("statistics", None) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 42eba47a6..8276eb56c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -16,6 +16,7 @@ import ray from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import AbstractFolderTransform from data_processing_ray.runtime.ray import ( RayTransformExecutionConfiguration, RayTransformFileProcessor, @@ -56,13 +57,18 @@ def orchestrate( # create transformer runtime runtime = runtime_config.create_transform_runtime() resources = RayUtils.get_cluster_resources() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -84,6 +90,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), "statistics": statistics, + "is_folder": is_folder, } logger.debug("Creating actors") processors = RayUtils.create_actors( diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index d63664ac4..a0968ab1d 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -29,12 +29,15 @@ def __init__( data_access_factory: DataAccessFactoryBase, runtime_configuration: SparkTransformRuntimeConfiguration, statistics: TransformStatistics, + is_folder: bool, ): """ Init method """ super().__init__( - data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params() + data_access_factory=data_access_factory, + 
transform_parameters=runtime_configuration.get_transform_params(),
+            is_folder=is_folder,
         )
         # Add data access and statistics to the processor parameters
         self.runtime_configuration = runtime_configuration
diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py
index 57a6c58fc..11589dbaf 100644
--- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py
+++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py
@@ -15,7 +15,7 @@
 from datetime import datetime
 
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.transform import TransformStatistics
+from data_processing.transform import TransformStatistics, AbstractFolderTransform
 from data_processing.utils import GB, get_logger
 from data_processing_spark.runtime.spark import (
     SparkTransformFileProcessor,
@@ -68,7 +68,10 @@ def process_partition(iterator):
         runtime = runtime_conf.create_transform_runtime()
         # create file processor
         file_processor = SparkTransformFileProcessor(
-            data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics
+            data_access_factory=d_access_factory,
+            runtime_configuration=runtime_conf,
+            statistics=statistics,
+            is_folder=is_folder,
         )
         first = True
         for f in iterator:
@@ -92,13 +95,19 @@ def process_partition(iterator):
         return list(statistics.get_execution_stats().items())
 
     num_partitions = 0
+    is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
     try:
-        # Get files to process
-        files, profile, retries = data_access.get_files_to_process()
-        if len(files) == 0:
-            logger.error("No input files to process - exiting")
-            return 0
-        logger.info(f"Number of files is {len(files)}, source profile {profile}")
+        if is_folder:
+            # folder transform
+            files = AbstractFolderTransform.get_folders(data_access=data_access)
+            logger.info(f"Number of folders is {len(files)}")  # Get files to process
+        else:
+            # Get files to process
+            files, profile, retries = data_access.get_files_to_process()
+            if len(files) == 0:
+                logger.error("No input files to process - exiting")
+                return 0
+            logger.info(f"Number of files is {len(files)}, source profile {profile}")
         # process data
         logger.debug("Begin processing files")
         # process files split by partitions

From 5fd20a125a71a40d6db7dc958dce50321369f3c0 Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 10 Oct 2024 19:13:01 +0100
Subject: [PATCH 002/105] added folder_transform

---
 .../runtime/pure_python/transform_orchestrator.py             | 2 +-
 .../python/src/data_processing/transform/folder_transform.py  | 4 ++--
 .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +-
 .../runtime/spark/transform_orchestrator.py                   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py
index 153eaaf0a..d51f80a8a 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py
@@ -76,7 +76,7 @@ def orchestrate(
     try:
         if is_folder:
             # folder transform
-            files = AbstractFolderTransform.get_folders(data_access=data_access)
+            files = 
AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. - :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 11589dbaf..a4c0c5835 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -99,7 +99,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From 38b47259977fbe64ead50231a52660e375625add Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 003/105] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- .../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def 
_publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 
eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. - :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From a3abf21cda7e280f7089555bc974058d193b502f Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 08:48:00 +0100 Subject: [PATCH 004/105] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index a4c0c5835..c404559d8 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -99,7 +99,8 @@ def process_partition(iterator): try: if 
is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index f16b09520..3c9fca76f 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From af8475df9648a76cb268b284f60de3597fa579c8 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 11 Oct 2024 10:20:48 -0400 Subject: [PATCH 005/105] Fuzzy dedup pure python implementation Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/README.md | 11 + .../universal/fdedup/python/pyproject.toml | 55 ++ .../universal/fdedup/python/src/Murmur_MH.py | 99 ++++ .../src/cluster_analysis_local_python.py | 46 ++ .../python/src/cluster_analysis_transform.py | 229 ++++++++ .../src/cluster_analysis_transform_python.py | 44 ++ .../python/src/data_cleaning_local_python.py | 56 ++ .../python/src/data_cleaning_transform.py | 150 ++++++ .../src/data_cleaning_transform_python.py | 83 +++ .../fdedup/python/src/file_copy_util.py | 158 ++++++ .../fdedup/python/src/service_orchestrator.py | 265 +++++++++ .../python/src/signature_calc_local_python.py | 60 +++ .../python/src/signature_calc_transform.py | 504 ++++++++++++++++++ .../src/signature_calc_transform_python.py | 44 ++ 14 files changed, 1804 insertions(+) create mode 100644 transforms/universal/fdedup/python/README.md create mode 100644 transforms/universal/fdedup/python/pyproject.toml create mode 100644 transforms/universal/fdedup/python/src/Murmur_MH.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_local_python.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_transform.py create mode 100644 transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_local_python.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_transform.py create mode 100644 transforms/universal/fdedup/python/src/data_cleaning_transform_python.py create mode 100644 transforms/universal/fdedup/python/src/file_copy_util.py create mode 100644 transforms/universal/fdedup/python/src/service_orchestrator.py create mode 100644 transforms/universal/fdedup/python/src/signature_calc_local_python.py create mode 100644 transforms/universal/fdedup/python/src/signature_calc_transform.py create mode 100644 
transforms/universal/fdedup/python/src/signature_calc_transform_python.py

diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md
new file mode 100644
index 000000000..34f18c73b
--- /dev/null
+++ b/transforms/universal/fdedup/python/README.md
@@ -0,0 +1,11 @@
+# Fuzzy Dedup
+
+Please see the set of
+[transform project conventions](../../../README.md)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+
+The basic implementation of fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see
+[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details.
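+
+As a minimal, illustrative sketch (not part of the transform API), the `Murmur_MH`
+helper added in this package can be used directly to estimate the similarity of two
+documents; the word-level shingling below is an assumption made only for this example:
+
+```python
+from Murmur_MH import Murmur_MH
+
+mh = Murmur_MH(num_perm=64, seed=42)
+# shingles here are simply words; real pipelines may use k-word shingles
+sig1 = mh.minhash("the quick brown fox jumps over the lazy dog".split())
+sig2 = mh.minhash("the quick brown fox jumped over the lazy dog".split())
+print(Murmur_MH.jaccard(sig1, sig2))  # estimated Jaccard similarity in [0, 1]
+```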
\ No newline at end of file
diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml
new file mode 100644
index 000000000..f2b9d8268
--- /dev/null
+++ b/transforms/universal/fdedup/python/pyproject.toml
@@ -0,0 +1,55 @@
+[project]
+name = "dpk_fdedup_transform_python"
+version = "0.3.0.dev0"
+requires-python = ">=3.10"
+description = "Fuzzy Dedup Transform for Python"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" },
+    { name = "Constantin Adam", email = "cmadam@us.ibm.com" },
+]
+dependencies = [
+    "data-prep-toolkit==0.2.2.dev0",
+    "pyarrow==16.1.0",
+    "pyyaml>=6.0.2",
+    "boto3>=1.34.69",
+    "kubernetes>=30.1.0",
+    "polars>=1.6.0",
+    "disjoint-set>=0.8.0",
+    "scipy>=1.14.1",
+    "numpy<1.29.0",
+    "sentencepiece>=0.2.0",
+    "mmh3>=4.1.0",
+]
+
+[build-system]
+requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
+build-backend = "setuptools.build_meta"
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.0.1",
+]
+
+[options]
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]
diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py
new file mode 100644
index 000000000..e3442ba02
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/Murmur_MH.py
@@ -0,0 +1,99 @@
+import logging
+import os
+from typing import List, Set
+
+import mmh3
+import numpy as np
+
+
+class Murmur_MH:
+    def __init__(self, num_perm=64, seed=42, hashfunc=None):
+        self.seed = seed
+        self.num_perm = num_perm  # the number of buckets, i.e. the vector length after self.minhash() call
+        self.permutations = self._init_permutations(seed, num_perm)
+
+    def _init_permutations(self, seed, num_perm):
+        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
+        max_int = np.uint64((1 << 64) - 1)
+        # initialize pseudo random number generator with given seed value
+        gen = np.random.RandomState(seed)
+        # get self.num_perm pseudo random numbers between 2 and max_int (excl)
+        permutations = np.array(
+            [gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)],
+            dtype=np.uint64,
+        ).T
+        # make all even pseudo random numbers odd by adding 1
+        permutations[permutations % 2 == 0] += 1
+        return permutations
+
+    def minhash(self, shingles: List[str]):
+        """return np.array of minhash"""
+        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
+        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
+        return (
+            np.right_shift(
+                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
+                32,
+            )
+            .astype(np.uint32)
+            .min(axis=0)
+        )
+
+    def minhash2(self, shingles: List[str], doc_len: int):
+        """
+        For each shingle (i.e., a group of k words) this generates a digest value based on the
+        mmh3 hash function (32-bit).
+
+        return tuple (A, B)
+        A = an array of values = np.array of minhash
+        B = document_length = number of characters"""
+        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
+        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
+        return (
+            np.right_shift(
+                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
+                32,
+            )
+            .astype(np.uint32)
+            .min(axis=0),
+            doc_len,
+        )
+
+    def minhash2_nosalt(self, shingles: List[str], doc_len: int, doc_id: int):
+        """
+        For each shingle (i.e., a group of k words) this generates a digest value based on the
+        mmh3 hash function (32-bit).
+
+        return tuple (A, B)
+        A = an array of values = np.array of minhash
+        B = document_length = number of characters"""
+        # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic
+        hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64)
+        return (
+            np.right_shift(
+                (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T,
+                32,
+            )
+            .astype(np.uint32)
+            .min(axis=0)
+            .tolist(),
+            doc_len,
+            doc_id,
+        )
+
+    @staticmethod
+    def jaccard(mh1: np.array, mh2: np.array) -> float:
+        """
+        The Jaccard similarity measures the similarity between two sets of data
+        to see which members are shared and distinct.
+
+        The Jaccard similarity is calculated by dividing the number of observations
+        in both sets by the number of observations in either set.
+
+        Developed by Paul Jaccard, the index ranges from 0 to 1.
+        The closer to 1, the more similar the two sets of data.
+
+        As a document is represented by a set, we use the Jaccard similarity to see how similar two documents are.
+        """
+        assert len(mh1) == len(mh2)
+        return np.count_nonzero(mh1 == mh2) / len(mh1)
diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
new file mode 100644
index 000000000..dcfc9a7e4
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
@@ -0,0 +1,46 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from cluster_analysis_transform_python import (
+    ClusterAnalysisPythonTransformConfiguration,
+)
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove"))
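+# this local example reads the consolidated band files and writes the list of documents to remove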
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # execution info
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+}
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    print(sys.argv)
+    # create launcher
+    launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration())
+    # Launch python to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
new file mode 100644
index 000000000..5ad18362a
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
@@ -0,0 +1,229 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import os
+from argparse import ArgumentParser, Namespace
+from typing import Any, List, Tuple
+
+import numpy as np
+import polars as pl
+import pyarrow as pa
+from data_processing.transform import AbstractTableTransform, TransformConfiguration
+from data_processing.utils import CLIArgumentProvider, get_logger
+from Murmur_MH import Murmur_MH
+
+
+short_name = "cluster"
+cli_prefix = f"{short_name}_"
+
+# configuration keys
+jaccard_similarity_threshold_key = "jaccard_similarity_threshold"
+""" This key holds the Jaccard similarity threshold above which two documents are duplicates"""
+
+# command line arguments
+jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}"
+""" Jaccard similarity threshold above which two documents are duplicates"""
+
+captured_arg_keys = [
+    jaccard_similarity_threshold_key,
+]
+
+# defaults
+jaccard_similarity_threshold_default = 0.8
+""" Default Jaccard similarity threshold above which two documents are duplicates"""
+
+
+class ClusterAnalysisTransform(AbstractTableTransform):
+    """
+    This is the second transform of the fuzzy dedup pipeline. It runs in parallel:
+    for each band, the hashing interval is divided into segments. A cluster analysis
+    uses as input all the parquet files from a segment of a band. The `bands` output
+    of the signature calculation, the first transform in the fuzzy dedup pipeline,
+    contains all the data for a given segment s of a specific band b in the
+    subfolder `bands/band=b/segment=s`.
+    The transform loads all the parquet files in the `bands/band=b/segment=s`
+    subfolder. Each one of these parquet files has two columns: the `band_hash`
+    and a `data` structure, which includes the `document_id`, the `minhashes` and
+    the `document_size` fields. Once all the files have been loaded in a single
+    dataframe, a `group_by` operation on the `band_hash` field is performed on
+    that dataframe. All the documents that have the same band_hash are grouped
+    in a cluster. Subsequently, the documents of each cluster are sorted in
+    descending order according to their size, and a Jaccard similarity is
+    calculated between the cluster documents. The documents for which the Jaccard
+    similarity is above the `jaccard_similarity_threshold` remain in the cluster,
+    the others are removed from the cluster. Finally, from each cluster that has
+    more than one document after running the Jaccard similarity, we select a
+    document to keep (the largest size document), and mark the other documents as
+    duplicates. The resulting clusters are saved in a file for further analysis.
+
+    Args:
+        jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments
+        defined by the companion runtime, ClusterAnalysisTransformRuntime.
+ """ + super().__init__(config) + self.jaccard_similarity_threshold = config.get( + jaccard_similarity_threshold_key, jaccard_similarity_threshold_default + ) + self.logger = get_logger(__name__) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + bands_dataframe = pl.from_arrow(table) + docs2remove_list = [] + # clustering + bands_dataframe_groups = bands_dataframe.group_by("band_hash").agg("document_data") + bands_dataframe_cluster = bands_dataframe_groups.with_columns( + cluster_length=pl.col("document_data").list.len() + ).filter(pl.col("cluster_length") > 1) + self.logger.info(f"file_name = {file_name}") + num_clusters = len(bands_dataframe_cluster) + if num_clusters > 0: + sum_cdocs = bands_dataframe_cluster.select(pl.sum("cluster_length")).item() + max_cdocs = bands_dataframe_cluster.select(pl.max("cluster_length")).item() + min_cdocs = bands_dataframe_cluster.select(pl.min("cluster_length")).item() + avg_cdocs = bands_dataframe_cluster.select(pl.mean("cluster_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + bands_dataframe_response = self.process_bands(bands_dataframe_cluster) + + filtered_doc2remove_dataframe = bands_dataframe_response.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_doc2remove_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_doc2remove_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_doc2remove_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_doc2remove_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_doc2remove_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + + # Explode the 'docs_to_remove' column + doc2remove_exploded_dataframe = filtered_doc2remove_dataframe.explode("docs_to_remove") + table = doc2remove_exploded_dataframe.to_arrow() + self.logger.info(f"{len(doc2remove_exploded_dataframe)} documents marked to remove") + metadata = {"nrows": len(table)} + return [table], metadata + + def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: + # Define the schema with specific data types + schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} + doc_ids_lists = [] + docs_to_remove_lists = [] + len_of_docs2remove_lists = [] + for row in df.iter_rows(named=True): + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self.jaccard_distance_calculation(row) + doc_ids_lists += doc_ids_list + docs_to_remove_lists += docs_to_remove_list + len_of_docs2remove_lists += len_of_docs2remove_list + processed_rows = pl.DataFrame( + { + "first_doc": doc_ids_lists, + "docs_to_remove": docs_to_remove_lists, + "docs_to_remove_length": len_of_docs2remove_lists, + }, + schema=schema, + ) + return processed_rows + + def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + # Process row and return a new list of Series or a new row + threshold = self.jaccard_similarity_threshold + doc_ids_list = [] + docs_to_remove_list = [] + 
len_of_docs2remove_list = []
+        # sort documents
+        document_data = row["document_data"]
+
+        # Sort the list by 'document_length'
+        sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"]))
+
+        # Extracting int_id_column values into a list
+        doc_list = list(set([item["int_id_column"] for item in sorted_document_data]))
+
+        # Creating a dictionary with int_id_column as key and minhashes as value
+        doc_minhashes = {item["int_id_column"]: item["minhashes"] for item in sorted_document_data}
+
+        while len(doc_list) > 1:
+            docs_to_remove = []
+            new_doc_list = []
+            # this is the document we are going to keep
+            first_doc = doc_list[0]
+            first_mh = doc_minhashes[first_doc]
+            for int_id_column in doc_list[1:]:
+                doc_mh = doc_minhashes[int_id_column]
+                distance = Murmur_MH.jaccard(np.array(first_mh), np.array(doc_mh))
+                if distance >= threshold:
+                    docs_to_remove.append(int_id_column)
+                else:
+                    new_doc_list.append(int_id_column)
+            if len(docs_to_remove) > 0:
+                docs_to_remove = list(set(docs_to_remove))
+                doc_ids_list.append(first_doc)
+                docs_to_remove_list.append(docs_to_remove)
+                len_of_docs2remove_list.append(len(docs_to_remove))
+            doc_list = new_doc_list
+
+        return doc_ids_list, docs_to_remove_list, len_of_docs2remove_list
+
+
+class ClusterAnalysisTransformConfiguration(TransformConfiguration):
+
+    """
+    Provides support for configuring and using the associated Transform class,
+    including configuration with CLI args.
+    """
+
+    def __init__(self):
+        super().__init__(
+            name=short_name,
+            transform_class=ClusterAnalysisTransform,
+            remove_from_metadata=[],
+        )
+        self.logger = get_logger(__name__, level="INFO")
+
+    def add_input_params(self, parser: ArgumentParser) -> None:
+        """
+        Add Transform-specific arguments to the given parser.
+        This will be included in a dictionary used to initialize the ClusterAnalysisTransform.
+        By convention a common prefix should be used for all transform-specific CLI args
+        (e.g., noop_, pii_, etc.)
+        """
+        parser.add_argument(
+            f"--{jaccard_similarity_threshold_cli_param}",
+            type=float,
+            default=jaccard_similarity_threshold_default,
+            help="Jaccard similarity threshold above which two documents are duplicates",
+        )
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        """
+        Validate and apply the arguments that have been parsed
+        :param args: user defined arguments.
+        :return: True if validation passes, False otherwise
+        """
+        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+        self.params = self.params | captured
+        self.logger.info(f"{short_name} parameters are : {self.params}")
+        return True
diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py
new file mode 100644
index 000000000..28d96f428
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py
@@ -0,0 +1,44 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################ + +import time + +from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(ClusterAnalysisTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py new file mode 100644 index 000000000..4295e4e82 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -0,0 +1,56 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) + # create launcher + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py new file mode 100644 index 000000000..f03b6c1d0 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -0,0 +1,150 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any, List, Tuple + +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger + + +short_name = "fdclean" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +duplicate_list_location_key = "duplicate_list_location" +""" This key holds the location of the list of duplicate documents marked for removal""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" +""" Location of the list of duplicate documents marked for removal""" + +captured_arg_keys = [ + document_id_column_key, + duplicate_list_location_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +duplicate_list_location_default = None +""" Default location of the list of duplicate documents marked for removal""" + + +class DataCleaningTransform(AbstractTableTransform): + """ + This is the third transform of the fuzzy dedup pipeline. It takes as input + the list of the documents to remove (identified as duplicates during the + cluster analysis phase, and the original dataset. Each dataset file is + imported into a table, and the documents that are in the documents to remove + list are filtered out from that table. 
The output is a new dataset, which + keeps the directory structure of the input dataset, but has all the fuzzy + duplicates removed. + + Args: + duplicate_location: location (local or s3) of the duplicate document list + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. + """ + super().__init__(config) + self.logger = get_logger(__name__) + self.document_id_column = config.get(document_id_column_key, document_id_column_default) + self.duplicate_list_location = config.get(duplicate_list_location_key, duplicate_list_location_default) + contents = config.get("df") + self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) + self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + input_df = pl.from_arrow(table) + # handle the case when the doc_id columns in the input dataframe and the + # docs_to_remove_df have different types, i.e. one is int32 and the + # other is int64 + input_doc_id_type = input_df[self.document_id_column].dtype + if input_doc_id_type != self.docs_to_remove_df[self.document_id_column].dtype: + self.docs_to_remove_df = self.docs_to_remove_df.select( + pl.col(self.document_id_column).cast(input_doc_id_type) + ) + filtered_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti") + filtered_table = filtered_df.to_arrow() + metadata = { + "input_files": 1, + "input_docs": table.num_rows, + "input_bytes": table.nbytes, + "output_files": 1, + "output_docs": filtered_table.num_rows, + "output_bytes": filtered_table.nbytes, + "filtered_docs": (table.num_rows - filtered_table.num_rows), + "filtered_bytes": (table.nbytes - filtered_table.nbytes), + } + return [filtered_table], metadata + + +class DataCleaningTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=DataCleaningTransform, + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) + """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{duplicate_list_location_cli_param}", + type=str, + required=True, + default=duplicate_list_location_default, + help="location of duplicate document list that are marked for removal", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. 
+ :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py new file mode 100644 index 000000000..c0b5fefd6 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -0,0 +1,83 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +from data_cleaning_transform import DataCleaningTransformConfiguration +from data_processing.data_access import DataAccessFactoryBase +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + DefaultPythonTransformRuntime, + PythonTransformRuntimeConfiguration, +) +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class DataCleaningPythonRuntime(DefaultPythonTransformRuntime): + """ + Data cleaning runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :param statistics - reference to statistics actor + :param files - list of files to process + :return: dictionary of transform init params + """ + duplicate_list_location = self.params["duplicate_list_location"] + data_access = data_access_factory.create_data_access() + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for fuzzy dedup data cleaning step + as required by the PythonTransformLauncher. 
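+    It wires in DataCleaningPythonRuntime, which downloads the consolidated duplicate
+    list and passes its contents to the transform as the "df" parameter.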
+ """ + + def __init__(self): + """ + Initialization + :param: transform_configuration - transform configuration class + :param: runtime_class - name of the runtime configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(DataCleaningTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/file_copy_util.py b/transforms/universal/fdedup/python/src/file_copy_util.py new file mode 100644 index 000000000..87867e532 --- /dev/null +++ b/transforms/universal/fdedup/python/src/file_copy_util.py @@ -0,0 +1,158 @@ +import argparse +import io +import os +import re + +import polars as pl +from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase +from data_processing.utils import ParamsUtils, get_logger + + +""" +This class reads all the parquet files inside an `input_folder` of the type +`.../bands/band=b/segment=s`, concatenates those files, and writes them into a +file called `.../consolidated_bands/band_b_segment_s.parquet` +""" + + +class FileCopyUtil: + def __init__( + self, + data_access_factory: DataAccessFactoryBase, + config: dict, + stats: dict, + ): + self.data_access_factory = data_access_factory + self.root_folder = config.get("root_folder") + self.logger = get_logger(__name__, level="INFO") + + def copy_data(self, subfolder_name: str, data_type: str): + self.logger.info(f"copy_data(): subfolder_name = {subfolder_name}, data_type = {data_type}") + if self.data_access_factory.s3_config is not None: + _, root_folder = self.root_folder.split("://") + else: + root_folder = self.root_folder + self.logger.debug(f"copy_data(): root_folder = {root_folder}") + if data_type == "bands": + match = re.match(r"^band=(\d+)/segment=(\d+)$", subfolder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) + else: + raise ValueError(f"Wrong subfolder_name {subfolder_name}, should be band=b/segment=s") + input_folder = os.path.join( + root_folder, + "bands", + f"band={band}", + f"segment={segment}/", + ) + output_path = os.path.join( + root_folder, + "bands_consolidated", + f"band_{band}_segment_{segment}.parquet", + ) + elif data_type == "docs_to_remove": + input_folder = os.path.join( + root_folder, + f"{subfolder_name}/", + ) + output_path = os.path.join( + root_folder, + "docs_to_remove_consolidated", + f"docs_to_remove_consolidated.parquet", + ) + self.logger.debug(f"copy_data(): input_folder = {input_folder}, output_path = {output_path}") + + data_access = self.data_access_factory.create_data_access() + self.logger.debug(f"copy_data(): getting the data from the input_folder {input_folder}") + file_dict, status = data_access.get_folder_files( + input_folder, + extensions=[".parquet"], + return_data=True, + ) + self.logger.info(f"Found {len(file_dict)} files in input folder {input_folder}") + consolidated_df = pl.DataFrame() + for fname, contents in file_dict.items(): + df = pl.read_parquet(io.BytesIO(contents)) + # self.logger.info(f"{fname} has {len(df)} rows") + consolidated_df = consolidated_df.vstack(df) + if "docs_to_remove" in consolidated_df.columns: + consolidated_df = consolidated_df.select("docs_to_remove").unique() + output_table = consolidated_df.to_arrow() + self.logger.info( + f"Writing to {output_path} table with {output_table.num_rows} rows and {output_table.nbytes:,d} bytes" + 
) + stats = { + "input_files": len(file_dict), + "input_bytes": sum(len(v) for v in file_dict.values()), + "input_rows": output_table.num_rows, + "output_files": 1, + "output_bytes": output_table.nbytes, + "output_rows": output_table.num_rows, + } + data_access.save_table(output_path, output_table) + return stats + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--root_folder", + type=str, + default=os.getenv("HOME", os.path.join(os.sep)), + help="root folder", + ) + parser.add_argument( + "--subfolder_name", + type=str, + default=os.path.join("band=0", "segment=0"), + help="subfolder name", + ) + parser.add_argument( + "--data_type", + type=str, + default="docs_to_remove", + help="Processing either bands or docs_to_remove", + ) + parser.add_argument( + "--use_s3", + type=bool, + default=False, + help="use s3", + ) + args = parser.parse_args() + root_folder = args.root_folder + config = {"root_folder": args.root_folder} + input_folder = args.root_folder + output_folder = args.root_folder + data_type = args.data_type + data_access_factory: DataAccessFactoryBase = DataAccessFactory() + daf_args = [] + if args.use_s3: + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + s3_config = { + "input_folder": root_folder, + "output_folder": root_folder, + } + daf_args.append("--data_s3_cred") + daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) + daf_args.append("--data_s3_config") + daf_args.append(ParamsUtils.convert_to_ast(s3_config)), + else: + local_config = { + "input_folder": root_folder, + "output_folder": root_folder, + } + daf_args.append("--data_local_config") + daf_args.append(ParamsUtils.convert_to_ast(local_config)) + daf_parser = argparse.ArgumentParser() + data_access_factory.add_input_params(parser=daf_parser) + data_access_factory_args = daf_parser.parse_args(args=daf_args) + data_access_factory.apply_input_params(args=data_access_factory_args) + stats = {} + fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) + fcu.copy_data(args.subfolder_name, args.data_type) diff --git a/transforms/universal/fdedup/python/src/service_orchestrator.py b/transforms/universal/fdedup/python/src/service_orchestrator.py new file mode 100644 index 000000000..897a3210c --- /dev/null +++ b/transforms/universal/fdedup/python/src/service_orchestrator.py @@ -0,0 +1,265 @@ +import argparse +import os +import sys + +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from file_copy_util import FileCopyUtil +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +class ServiceOrchestrator: + def __init__(self, global_params=None): + self.global_params = global_params or {} + + def execute_service(self, service_logic, service_params): + # Call the generic service logic + service_logic(service_params) + + def orchestrate(self, service_logic): + service_list = self.global_params["services"].split(",") + + for service in service_list: + if service == "SignatureCalculation": + params = create_transform_args_payload(args, service) + 
params["service_type"] = "SignatureCalculation" + self.execute_service(service_logic, params) + elif service == "ClusterAnalysis": + params = create_transform_args_payload(args, service) + params["service_type"] = "ClusterAnalysis" + self.execute_service(service_logic, params) + elif service == "DataCleaning": + params = create_transform_args_payload(args, service) + params["service_type"] = "DataCleaning" + self.execute_service(service_logic, params) + elif service == "BandsFileCopy": + params = args + params["service_type"] = "BandsFileCopy" + self.execute_service(service_logic, params) + elif service == "DocsToRemoveFileCopy": + params = args + params["service_type"] = "DocsToRemoveFileCopy" + self.execute_service(service_logic, params) + else: + print(f"Warning: {service} is not a recognized service.") + + +def generic_service_logic(params): + print("Service executed with parameters:", params) + service_type = params["service_type"] + use_s3 = params["use_s3"] + # Remove the 'service_type' key + params.pop("service_type", None) # Using pop() method + + if service_type == "SignatureCalculation" or service_type == "ClusterAnalysis" or service_type == "DataCleaning": + # Set the simulated command line args + params.pop("num_permutations", None) # Using pop() method + params.pop("num_bands", None) # Using pop() method + params.pop("num_segments", None) # Using pop() method + params.pop("use_s3", None) # Using pop() method + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + if use_s3: + sys.argv.append("--data_s3_cred") + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + + if service_type == "SignatureCalculation": + runtime_config = SignatureCalculationPythonTransformConfiguration() + launch_transform_service(runtime_config) + elif service_type == "ClusterAnalysis": + runtime_config = ClusterAnalysisPythonTransformConfiguration() + launch_transform_service(runtime_config) + elif service_type == "DataCleaning": + runtime_config = DataCleaningPythonTransformConfiguration() + launch_transform_service(runtime_config) + elif service_type == "BandsFileCopy": + launch_file_copy_service(params, service_type) + elif service_type == "DocsToRemoveFileCopy": + launch_file_copy_service(params, service_type) + + +def launch_transform_service(params): + # create launcher + launcher = PythonTransformLauncher(runtime_config=params) + # Launch the ray actor(s) to process the input + launcher.launch() + + +def launch_file_copy_service(args, service_type): + root_folder = os.path.join(args["root_folder"], args["output_folder"]) + data_type = None + if service_type == "BandsFileCopy": + data_type = "bands" + # Get files to process + files = [ + f"band={band}/segment={segment}" + for band in range(args["num_bands"]) + for segment in range(args["num_segments"]) + ] + elif service_type == "DocsToRemoveFileCopy": + files = ["docs_to_remove"] + data_type = "docs_to_remove" + config = {"root_folder": root_folder} + data_access_factory: DataAccessFactoryBase = DataAccessFactory() + daf_args = [] + + if args["use_s3"]: + + s3_config = { + "input_folder": root_folder, + "output_folder": root_folder, + } + daf_args.append("--data_s3_cred") + daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) + daf_args.append("--data_s3_config") + daf_args.append(ParamsUtils.convert_to_ast(s3_config)), + else: + + # Construct folders + local_config = { + "input_folder": root_folder, + "output_folder": os.path.abspath(os.path.join(args["root_folder"], args["output_folder"])), + } + 
daf_args.append("--data_local_config") + daf_args.append(ParamsUtils.convert_to_ast(local_config)) + + daf_parser = argparse.ArgumentParser() + data_access_factory.add_input_params(parser=daf_parser) + data_access_factory_args = daf_parser.parse_args(args=daf_args) + data_access_factory.apply_input_params(args=data_access_factory_args) + stats = {} + fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) + for file in files: + fcu.copy_data(file, data_type) + + +def create_transform_args_payload(args, service): + print(args) + # Construct folders + input_folder = os.path.join(args["root_folder"], args["input_folder"]) + output_folder = os.path.join(args["root_folder"], args["output_folder"]) + if service == "ClusterAnalysis": + input_folder = os.path.join(args["root_folder"], args["output_folder"], "bands_consolidated") + output_folder = os.path.join(args["root_folder"], args["output_folder"], "docs_to_remove") + elif service == "DataCleaning": + output_folder = os.path.join(args["root_folder"], args["output_folder"], "cleaned") + duplicate_location = os.path.join( + args["root_folder"], + args["output_folder"], + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + + # Create a local configuration + local_conf = {"input_folder": input_folder, "output_folder": output_folder} + + # Create parameters + params = { + "num_permutations": args["num_permutations"], + "num_bands": args["num_bands"], + "num_segments": args["num_segments"], + "use_s3": args["use_s3"], + } + + if args["use_s3"]: + params["data_s3_config"] = ParamsUtils.convert_to_ast(local_conf) + else: + params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + + # add extra + if service == "DataCleaning": + short_name = "fdclean" + cli_prefix = f"{short_name}_" + + # configuration keys + document_id_column_key = "document_id_column" + """ This key holds the name of the column storing the unique ID assigned to each document""" + duplicate_list_location_key = "duplicate_list_location" + """ This key holds the location of the list of duplicate documents marked for removal""" + + # command line arguments + document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" + """ Name of the column storing the unique ID assigned to each document""" + duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" + """ Location of the list of duplicate documents marked for removal""" + + params[document_id_column_cli_param] = "int_id_column" + params[duplicate_list_location_cli_param] = duplicate_location + + return params + + +def create_file_copy_args_payload(args): + daf_args = [] + local_config = { + "input_folder": args.root_folder, + "output_folder": args.root_folder, + } + daf_args.append("--data_local_config") + daf_args.append(ParamsUtils.convert_to_ast(local_config)) + data_access_factory: DataAccessFactoryBase = DataAccessFactory() + daf_parser = argparse.ArgumentParser() + data_access_factory.add_input_params(parser=daf_parser) + data_access_factory_args = daf_parser.parse_args(args=daf_args) + data_access_factory.apply_input_params(args=data_access_factory_args) + return data_access_factory + + +def parse_args(): + parser = argparse.ArgumentParser(description="Service Orchestrator") + + # Define command line arguments + parser.add_argument("--root_folder", type=str, required=True, help="Root folder path") + parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") + parser.add_argument("--output_folder", 
type=str, required=True, help="Output folder path") + + parser.add_argument( + "--contents_column", type=str, default="text", help="Name of the column that holds document text" + ) + parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") + parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") + parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") + parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") + + # Single argument for service execution + parser.add_argument( + "--services", + type=str, + required=True, + help="Comma-separated list of services to run (e.g., SignatureCalculation,BandsFileCopy,ClusterAnalysis,DocsToRemoveFileCopy,DataCleaning)", + ) + + parser.add_argument( + "--use_s3", + type=bool, + default=False, + help="use s3", + ) + + args = parser.parse_args() + return vars(args) # Convert Namespace to dictionary + + +if __name__ == "__main__": + + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + + # Parse command line arguments + args = parse_args() + + # Initialize the orchestrator + orchestrator = ServiceOrchestrator(global_params=args) + + # Example service execution (if you had defined services) + orchestrator.orchestrate(generic_service_logic) diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py new file mode 100644 index 000000000..eb958ee3d --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -0,0 +1,60 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +# # create parameters +# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +# output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output_second")) +# local_conf = { +# "input_folder": input_folder, +# "output_folder": output_folder +# } +# code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +# params = { +# # Data access. 
Only required parameters are specified +# "data_local_config": ParamsUtils.convert_to_ast(local_conf), +# # execution info +# "runtime_pipeline_id": "pipeline_id", +# "runtime_job_id": "job_id", +# "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +# "minhash_num_permutations":112, +# "minhash_num_bands":14, +# "minhash_num_segments":2 +# } + + +if __name__ == "__main__": + # Set the simulated command line args + # sys.argv = ParamsUtils.dict_to_req(d=params) + # print(sys.argv) + + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + + # create launcher + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py new file mode 100644 index 000000000..7ac8eb057 --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -0,0 +1,504 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
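Before reading the transform below, it helps to recall why banding works: with `b` bands of `r` minhashes each, two documents whose true Jaccard similarity is `s` collide in at least one band with probability `1 - (1 - s^r)^b`. A quick check of the FineWeb defaults used in this module (14 bands of 8 minhashes):

```python
# S-curve for the default banding parameters: 14 bands x 8 minhashes = 112 permutations
b, r = 14, 8
for s in (0.5, 0.75, 0.9):
    p = 1 - (1 - s**r) ** b
    print(f"similarity {s}: candidate probability {p:.3f}")
# ~0.053 at s=0.5, ~0.772 at s=0.75, ~1.000 at s=0.9: a sharp cutoff near the 0.75 threshold
```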
+################################################################################ +import os +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Any, List + +import mmh3 +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider +from Murmur_MH import Murmur_MH +from scipy.integrate import quad as integrate + + +short_name = "minhash" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +contents_column_key = "contents_column" +""" This key holds the name of the column storing the contents of each document""" +seed_key = "seed" +""" This key holds the seed used to instantiate the random number generator""" +num_permutations_key = "num_permutations" +""" This key holds the number of permutations that determine how many minhashes to calculate for each document""" +num_bands_key = "num_bands" +""" This key holds the number of bands to use in the banding technique""" +num_minhashes_per_band_key = "num_minhashes_per_band" +""" This key holds the number of minhashes to use in each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_key = "word_shingle_size" +""" This key holds the size of the word shingles calculated for each document""" +num_segments_key = "num_segments" +""" This key holds the number of segments across which we divide the hashing space for each band""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +contents_column_cli_param = f"{cli_prefix}{contents_column_key}" +""" Name of the column storing the contents of each document""" +seed_cli_param = f"{cli_prefix}{seed_key}" +""" The seed used to instantiate the random number generator""" +num_permutations_cli_param = f"{cli_prefix}{num_permutations_key}" +""" Number of permutations that determine how many minhashes to calculate for each document""" +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands to use in the banding technique""" +num_minhashes_per_band_cli_param = f"{cli_prefix}{num_minhashes_per_band_key}" +""" The number of minhashes to use in each band""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_cli_param = f"{cli_prefix}{word_shingle_size_key}" +""" The size of the word shingles calculated for each document""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments across which we divide the hashing space for each band""" + +captured_arg_keys = [ + document_id_column_key, + contents_column_key, + seed_key, + num_bands_key, + num_minhashes_per_band_key, + num_permutations_key, + jaccard_similarity_threshold_key, + word_shingle_size_key, + num_segments_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +contents_column_default = "contents" +""" Default name of the column storing the contents of each document""" +seed_default = 42 +""" 
Default seed used to instantiate the random number generator""" +num_permutations_default = 112 +""" Default number of minhashes used for each document (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_bands_default = 14 +""" Default number of bands to use in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_minhashes_per_band_default = 8 +""" Default number of minhashes to use in each band (from FineWeb https://arxiv.org/pdf/2406.17557)""" +word_shingle_size_default = 5 +""" Default size of the word shingles (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments across which we divide the hashing space for each band""" + + +def _optimal_minhashlsh_param( + threshold: float = jaccard_similarity_threshold_default, + num_perm: int = num_permutations_default, + false_positive_weight: float = 0.5, + false_negative_weight: float = 0.5, +): + """ + Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum + of probabilities of false positive and false negative. + :param threshold: desired similarity threshold + :param num_perm: number of permutations + :param false_positive_weight: importance of avoiding false positive results + :param false_negative_weight: importance of avoiding false negative results + :return: a tuple (optimal number of bands, optimal number of rows) + """ + + def _false_positive_probability(threshold, b, r): + _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) + a, err = integrate(_probability, 0.0, threshold) + return a + + def _false_negative_probability(threshold, b, r): + _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) + a, err = integrate(_probability, threshold, 1.0) + return a + + min_error = float("inf") + opt = (0, 0) + for b in range(1, num_perm + 1): + max_r = int(num_perm / b) + for r in range(1, max_r + 1): + fp = _false_positive_probability(threshold, b, r) + fn = _false_negative_probability(threshold, b, r) + error = fp * false_positive_weight + fn * false_negative_weight + if error < min_error: + min_error = error + opt = (b, r) + return opt + + +class SignatureCalculationTransform(AbstractTableTransform): + """ + This is the first transform of the fuzzy dedup pipeline. First, it calculates, + for each document in a dataset, `num_permutations` minhashes. It accepts as + input the number of bands and the length of each band. If those two parameters + are not specified, then, based on the values of `jaccard_similarity_threshold` + and `num_permutations`, it determines the optimal number of bands, and the + length of each band (how many minhashes will be used to get the signature for + each band). The band signatures, the minhashes and the document lengths are + then saved in the output folder, under a folder structure `bands/band=b/segment=s`. + To improve scalability of the next step of fuzzy dedup, the hash space of + each band is divided into `num_segments` segments. 
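+    The per-band, per-segment layout lets the downstream cluster analysis step load
+    one manageable slice of the signature space at a time.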
+
+    Args:
+        document_id_column: name of the column storing the unique ID assigned to each document
+        contents_column: name of the column storing the contents of each document
+        seed: the seed used to instantiate the random number generator
+        num_permutations: number of minhashes to calculate for each document
+        num_bands: number of bands to use for the banding technique
+        num_minhashes_per_band: number of minhashes to use in each band
+        jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates
+        word_shingle_size: the size of the word shingles calculated for each document
+        num_segments: the number of segments across which we divide the hashing space for each band
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments
+        defined by the companion runtime, SignatureCalculationTransformRuntime.
+        """
+        super().__init__(config)
+        self.document_id_column = config.get(document_id_column_key, document_id_column_default)
+        self.contents_column = config.get(contents_column_key, contents_column_default)
+        self.seed = config.get(seed_key, seed_default)
+        self.num_permutations = config.get(num_permutations_key, num_permutations_default)
+        self.jaccard_similarity_threshold = config.get(
+            jaccard_similarity_threshold_key, jaccard_similarity_threshold_default
+        )
+        self.word_shingle_size = config.get(word_shingle_size_key, word_shingle_size_default)
+        self.num_segments = config.get(num_segments_key, num_segments_default)
+        self.num_bands = config.get(num_bands_key, num_bands_default)
+        self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default)
+        # Calculate optimal parameters for bands calculation
+        # self.num_bands, self.num_rows = _optimal_minhashlsh_param(
+        #     threshold=self.jaccard_similarity_threshold,
+        #     num_perm=self.num_permutations,
+        #     false_positive_weight=0.5,
+        #     false_negative_weight=0.5,
+        # )
+        # use this dataframe to store the minhashes and size for each document
+        self.all_minhashes: pl.DataFrame = None
+        # use this dataframe to store the band hashes for each document
+        self.all_band_hashes: pl.DataFrame = None
+        # this variable keeps track of how many files were processed since last
+        # data write to properly update metadata
+        self.files_processed = 0
+        self.bytes_processed = 0
+        self.data_access = config.get("data_access")
+        self.last_file_name = None
+
+    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
+        """
+        Convert one input table into 0 or more output tables plus a dictionary
+        of execution statistics. This transform computes minhashes and band
+        signatures for every document in the table; the signatures are buffered
+        in memory and persisted by write_band_signatures().
+        """
+        self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}")
+        self.logger.debug("----minhash---")
+        self.last_file_name = file_name
+        self.files_processed += 1
+        self.bytes_processed += table.nbytes
+        # instantiate with the same seed so every worker uses the same hash functions
+        mm_min_hash = Murmur_MH(num_perm=self.num_permutations, seed=self.seed)
+
+        # load the data from the pyarrow table
+        df = pl.from_arrow(table)
+        # read the target columns
+        df = df.select(self.contents_column, self.document_id_column)
+
+        # generate minhash values
+        minhashes = df.map_rows(
+            lambda text: mm_min_hash.minhash2_nosalt(
+                *self._generate_word_shingles(text, window_size=self.word_shingle_size)
+            )
+        )
+        # rename columns, cast minhashes to list(uint32)
+        minhashes = minhashes.select(
+            pl.col("column_2").alias(self.document_id_column),
+            pl.col("column_0").cast(pl.List(pl.UInt32)).alias("minhashes"),
+            pl.col("column_1").alias("document_length"),
+        )
+        # store the minhash calculations to send out at the end of execution
+        if self.all_minhashes is None:
+            self.all_minhashes = minhashes
+        else:
+            self.all_minhashes = self.all_minhashes.vstack(minhashes)
+
+        # Calculate band hashes
+        band_hashes_list = self.process_rows_into_bands(
+            minhashes,
+            self.num_bands,
+            self.num_rows,
+        )
+        band_hash_schema = pl.Schema(
+            {
+                "band_hash": pl.UInt64,
+                "band_index": pl.Int32,
+                self.document_id_column: pl.Int64,
+            }
+        )
+        band_hashes = pl.DataFrame(band_hashes_list, schema=band_hash_schema)
+
+        # store the band hash calculations to send out at the end of execution
+        if self.all_band_hashes is None:
+            self.all_band_hashes = band_hashes
+        else:
+            self.all_band_hashes = self.all_band_hashes.vstack(band_hashes)
+
+        if len(self.all_minhashes) > 750000:
+            tables, metadata = self.write_band_signatures()
+        else:
+            tables = []
+            metadata = {}
+        # return the stats (write_band_signatures() persists results itself, so no tables are returned here)
+        return tables, metadata
+
+    def flush(self) -> tuple[list[pa.Table], dict[str, Any]]:
+        """
+        This is a supporting method for transforms that buffer tables (for example, coalesce).
+        Such transforms can hold buffered tables that were not yet written to the output; flush()
+        is the hook for returning those locally stored tables and their statistics. Most
+        transforms should use the default implementation.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
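+        In this transform, flush() writes any still-buffered minhashes and band signatures
+        via write_band_signatures().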
+ :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be + propagated to metadata + """ + self.logger.info(f"Starting flush()") + if self.all_band_hashes is not None and self.all_minhashes is not None: + tables, metadata = self.write_band_signatures() + else: + tables = [] + metadata = {} + return tables, metadata + + def write_band_signatures(self): + # define the upper and lower bounds of each band segment + segment_bounds_list = [] + upper_bound = np.uint64(np.iinfo(np.uint64).max) + segment_len = np.uint64(upper_bound // self.num_segments) + for segment_index in range(self.num_segments): + segment_bounds_list.append(np.uint64(segment_index) * segment_len) + segment_bounds_list.append(upper_bound) + segment_bounds = np.array(segment_bounds_list, dtype=np.uint64) + self.logger.debug(f"Calculated {len(segment_bounds)} segment_bounds") + # output stats for the metadata + num_tables_written = 0 + num_docs_written = 0 + num_bytes_written = 0 + self.logger.debug(f"dataframe self.all_band_hashes has {len(self.all_band_hashes)} rows") + self.logger.debug(f"dataframe self.all_minhashes has {len(self.all_minhashes)} rows") + # iterate through the bands, get the band hashes for each band, divide + # them into segments, join with minhashes, and upload to storage + for band_ix in range(self.num_bands): + # Filtering on, then dropping the `band_index` column + band_df = self.all_band_hashes.filter(pl.col("band_index") == band_ix).drop("band_index") + # assign each band hash to a segment of the hashing space + self.logger.debug(f"band {band_ix} band_df has {len(band_df)} rows") + for segment_index in range(self.num_segments): + segment_band_df = band_df.filter( + (pl.col("band_hash") > segment_bounds[segment_index]) + & (pl.col("band_hash") <= segment_bounds[segment_index + 1]) + ) + self.logger.debug( + f"band {band_ix} segment {segment_index} segment_band_df has {len(segment_band_df)} rows" + ) + # join the band hash dataframe with the minihash and doc length dataframe + segment_band_minhash_df = segment_band_df.join( + self.all_minhashes, + on=self.document_id_column, + how="inner", + ) + self.logger.debug(f"band {band_ix} segment {segment_index} joined segment_band_df and minhashes") + + # encapsulate document info in a structure + segment_band_minhash_df = segment_band_minhash_df.select( + pl.col("band_hash"), + pl.struct( + [ + pl.col(self.document_id_column), + pl.col("minhashes"), + pl.col("document_length"), + ] + ).alias("document_data"), + ) + self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure") + + # append the table to the result list, and the path to metadata + common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) + last_file_name_path = Path(self.last_file_name) + suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) + save_path = os.path.join( + self.data_access.output_folder, + "bands", + f"band={band_ix}", + f"segment={segment_index}", + suffix_path, + ) + segment_band_minhash_table = segment_band_minhash_df.to_arrow() + bytes_written, _, _ = self.data_access.save_table(save_path, segment_band_minhash_table) + if bytes_written > 0: + num_tables_written += 1 + num_docs_written += segment_band_minhash_table.num_rows + num_bytes_written += bytes_written + self.logger.debug(f"Uploaded table for band {band_ix} and segment {segment_index}") + # add the stats to metadata + metadata = { + "input_files": self.files_processed, + "input_docs": 
len(self.all_minhashes), + "input_bytes": self.bytes_processed, + "output_files": num_tables_written, + "output_docs": num_docs_written, + "output_bytes": num_bytes_written, + } + self.logger.info(f"Wrote {num_tables_written} tables with a total size of {num_bytes_written:,d} bytes") + self.files_processed = 0 + self.bytes_processed = 0 + self.all_minhashes = None + self.all_band_hashes = None + return [], metadata + + # define shingles generation function + def _generate_word_shingles(self, text: str, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]: + words = text[0].split() + document_id = text[1] + doc_len = len(text[0]) + word_count = len(words) + k_shingles = [] + for i in range(0, max(1, word_count - window_size + 1)): + k_shingles.append(delimiter.join(words[i : i + window_size])) + return k_shingles, doc_len, document_id + + def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b: int, r: int, seed: int = 42): + num_minhashes = len(minhashes) + assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" + results = [] + for band_index in range(b): + band_hash, _ = mmh3.hash64( + minhashes[band_index * r : (band_index + 1) * r], + seed=seed, + signed=False, + ) + results.append((band_hash, band_index, int_id_column)) + return results + + # Apply the function + def process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + result = [] + for row in df.iter_rows(): + bands = self.emit_bands( + row[0], # document id + np.array(row[1], dtype=np.uint32), # minhashes + row[2], # document length + minhashlsh_num_bands, + minhashlsh_length_band, + ) + for band in bands: + result.append(band) + return result + + +class SignatureCalculationTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=SignatureCalculationTransform, + remove_from_metadata=[], + ) + from data_processing.utils import get_logger + + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
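+        All arguments below carry the "minhash_" prefix defined by this transform's short name.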
+        """
+        parser.add_argument(
+            f"--{document_id_column_cli_param}",
+            type=str,
+            default=document_id_column_default,
+            help="name of the column storing the unique ID assigned to each document",
+        )
+        parser.add_argument(
+            f"--{contents_column_cli_param}",
+            type=str,
+            default=contents_column_default,
+            help="name of the column storing the contents of each document",
+        )
+        parser.add_argument(
+            f"--{seed_cli_param}",
+            type=int,
+            default=seed_default,
+            help="the seed used to instantiate the random number generator",
+        )
+        parser.add_argument(
+            f"--{num_permutations_cli_param}",
+            type=int,
+            default=num_permutations_default,
+            help="number of permutations (minhashes) calculated for each document",
+        )
+        parser.add_argument(
+            f"--{jaccard_similarity_threshold_cli_param}",
+            type=float,
+            default=jaccard_similarity_threshold_default,
+            help="Jaccard similarity threshold above which two documents are duplicates",
+        )
+        parser.add_argument(
+            f"--{word_shingle_size_cli_param}",
+            type=int,
+            default=word_shingle_size_default,
+            help="the size of the word shingles calculated for each document",
+        )
+        parser.add_argument(
+            f"--{num_bands_cli_param}",
+            type=int,
+            default=num_bands_default,
+            help="the number of bands to use in the banding technique",
+        )
+        parser.add_argument(
+            f"--{num_minhashes_per_band_cli_param}",
+            type=int,
+            default=num_minhashes_per_band_default,
+            help="the number of minhashes to use in each band",
+        )
+        parser.add_argument(
+            f"--{num_segments_cli_param}",
+            type=int,
+            default=num_segments_default,
+            help="the number of segments across which we divide the hashing space for each band",
+        )
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        """
+        Validate and apply the arguments that have been parsed
+        :param args: user defined arguments.
+        :return: True if validation passes, False otherwise
+        """
+        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+        self.params = self.params | captured
+        self.logger.info(f"{short_name} parameters are : {self.params}")
+        return True
diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py
new file mode 100644
index 000000000..5ddc102eb
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py
@@ -0,0 +1,44 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
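For reference, the launcher defined below can be driven programmatically in the same way as the other local scripts in this PR. The sketch here uses placeholder folders; the `minhash_`-prefixed keys mirror the commented-out example in signature_calc_local_python.py:

```python
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from signature_calc_transform_python import SignatureCalculationPythonTransformConfiguration

# placeholder folders -- point these at a real dataset layout
local_conf = {"input_folder": "test-data/data_1", "output_folder": "output"}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "minhash_num_permutations": 112,
    "minhash_num_bands": 14,
    "minhash_num_segments": 2,
}

if __name__ == "__main__":
    # simulate the command line and launch the signature calculation step
    sys.argv = ParamsUtils.dict_to_req(d=params)
    launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration())
    launcher.launch()
```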
+################################################################################ + +import time + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(SignatureCalculationTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() From 7f9b503978c4d7daf9cafc2ae7b448577ca5a7d6 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 11 Oct 2024 10:27:16 -0400 Subject: [PATCH 006/105] Fuzzy dedup spark implementation Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Dockerfile | 54 ++++ transforms/universal/fdedup/spark/Makefile | 45 +++ transforms/universal/fdedup/spark/README.md | 109 ++++++++ .../spark-executor-pod-template.yml | 8 + .../deployment/kubernetes/spark_profile.yml | 14 + .../universal/fdedup/spark/pyproject.toml | 42 +++ .../universal/fdedup/spark/requirements.txt | 10 + .../spark/src/cluster_analysis_spark.py | 33 +++ .../src/cluster_analysis_transform_spark.py | 42 +++ .../fdedup/spark/src/data_cleaning_spark.py | 33 +++ .../src/data_cleaning_transform_spark.py | 102 +++++++ .../fdedup/spark/src/file_copy_util_spark.py | 261 ++++++++++++++++++ .../fdedup/spark/src/fuzzy_dedup_spark.py | 205 ++++++++++++++ .../fdedup/spark/src/requirements.txt | 8 + .../fdedup/spark/src/signature_calc_spark.py | 35 +++ .../src/signature_calc_transform_spark.py | 42 +++ 16 files changed, 1043 insertions(+) create mode 100644 transforms/universal/fdedup/spark/Dockerfile create mode 100644 transforms/universal/fdedup/spark/Makefile create mode 100644 transforms/universal/fdedup/spark/README.md create mode 100644 transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml create mode 100644 transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml create mode 100644 transforms/universal/fdedup/spark/pyproject.toml create mode 100644 transforms/universal/fdedup/spark/requirements.txt create mode 100644 transforms/universal/fdedup/spark/src/cluster_analysis_spark.py create mode 100644 transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/src/data_cleaning_spark.py create mode 100644 transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/src/file_copy_util_spark.py create mode 100644 transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py create mode 100644 transforms/universal/fdedup/spark/src/requirements.txt create mode 100644 transforms/universal/fdedup/spark/src/signature_calc_spark.py create mode 100644 transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py diff --git 
a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile new file mode 100644 index 000000000..523b94c06 --- /dev/null +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -0,0 +1,54 @@ +ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 + +FROM ${BASE_IMAGE} + +# USER root +# install pytest +RUN pip install --no-cache-dir pytest + +WORKDIR ${SPARK_HOME}/work-dir + +# Copy in the data processing framework source/project and install it +# This is expected to be placed in the docker context before this is run (see the make image). +COPY --chown=spark:root data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . +COPY --chown=spark:root data-processing-lib-spark/ data-processing-lib-spark/ +RUN cd data-processing-lib-spark && pip install --no-cache-dir -e . +COPY --chown=spark:root python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . + +# Install project source +COPY --chown=spark:root src/ src/ +COPY --chown=spark:root pyproject.toml pyproject.toml +RUN mkdir -p /opt/spark/work-dir/src/templates && \ + mkdir -p /opt/spark/work-dir/config + +# install requirements from requirements.txt +COPY requirements.txt . +RUN pip3 install -r requirements.txt + +COPY deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ + +RUN pip install --no-cache-dir -e . + +# copy the main() entry point to the image +COPY ./src/signature_calc_spark.py . + +# copy some of the samples in +# COPY src/filter_local_spark.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +USER spark + +# Set environment +ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile new file mode 100644 index 000000000..d30013da8 --- /dev/null +++ b/transforms/universal/fdedup/spark/Makefile @@ -0,0 +1,45 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. +include $(REPOROOT)/transforms/.make.transforms + +# This is included in the image name, if defined +TRANSFORM_NAME=fd-sig-calc + +DOCKER_IMAGE_NAME=pyspark-base +DOCKER_IMAGE_VERSION=latest +DOCKER_FILE=Dockerfile +REGISTRY_HOST=docker.io +REGISTRY_PATH= +DOCKER=docker +PYTHON=python + +venv: requirements.txt + @# Help: Create the virtual environment using requirements.txt + $(PYTHON) -m venv venv + @source venv/bin/activate; \ + pip install --upgrade pip; \ + pip install wheel; \ + pip install -r requirements.txt; + +image:: .transforms.spark-image + +image-direct: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. + @# Help: Create the docker image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + $(DOCKER) build -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -f $(DOCKER_FILE) . + +publish-docker: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. 
+	@# Help: Publish image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) to $(REGISTRY_HOST) container registry
+	$(DOCKER) logout $(REGISTRY_HOST)
+	$(DOCKER) login $(REGISTRY_HOST) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)'
+	$(DOCKER) push $(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
+
+publish-ibm:
+	ibmcloud login -q -u "$(IBM_CLOUD_USER)" -apikey "$(IBM_CLOUD_API_KEY)"
+	ibmcloud cr login --client docker
+	$(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
+	$(DOCKER) push $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
+	# ibmcloud cr image-list | grep $(DOCKER_IMAGE_NAME)
diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md
new file mode 100644
index 000000000..3bf9b3245
--- /dev/null
+++ b/transforms/universal/fdedup/spark/README.md
@@ -0,0 +1,109 @@
+# Spark-GUF
+
+This is an implementation of Spark data processing modules. At a high level, every Spark application consists of a driver program that runs the user’s main function and executes various parallel operations on a cluster.
+
+The modules can run locally or remotely in a Kubernetes cluster.
+
+## Running Transforms locally
+
+Start in the `spark-guf` directory. To run the modules locally, follow these steps:
+1. Create a virtual environment using this command:
+   ```
+   make venv
+   ```
+2. Activate the virtual environment:
+   ```
+   source venv/bin/activate
+   ```
+3. Set the `PYTHONPATH` environment variable to include the `src` directory:
+   ```
+   export PYTHONPATH=${PYTHONPATH}:${PWD}/src
+   ```
+4. Invoke one of the transforms:
+   ```
+   python src/transforms/spark_pi/spark_transformer_pi.py
+   ```
+5. To find out which arguments a transform takes, run that transform with a `--help` flag:
+   ```
+   python src/transforms/spark_filter/spark_filter_transform.py --help
+   usage: spark_filter_transform.py [-h] --input_folder INPUT_FOLDER --output_folder OUTPUT_FOLDER [--data_type DATA_TYPE]
+                                    --filter_criteria_list FILTER_CRITERIA_LIST [--filter_columns_to_drop FILTER_COLUMNS_TO_DROP]
+                                    [--filter_logical_operator {AND,OR}]
+
+   optional arguments:
+     -h, --help            show this help message and exit
+     --input_folder INPUT_FOLDER
+                           path to read the input files (local fs or s3)
+     --output_folder OUTPUT_FOLDER
+                           path to write the output files (local fs or s3)
+     --data_type DATA_TYPE
+                           Type of files to filter (parquet, orc, csv, json, txt)
+     --filter_criteria_list FILTER_CRITERIA_LIST
+                           list of filter criteria (in SQL WHERE clause format), for example: [ "docq_total_words > 100 AND docq_total_words < 200", "docq_perplex_score < 230", "date_acquired BETWEEN '2023-07-04' AND '2023-07-08'", "title LIKE 'https://%'", "document_id IN ('doc-id-1', 'doc-id-2', 'doc-id-3')" ]
+     --filter_columns_to_drop FILTER_COLUMNS_TO_DROP
+                           list of columns to drop after filtering, for example: ["column1", "column2"]
+     --filter_logical_operator {AND,OR}
+                           logical operator (AND or OR) that joins filter criteria
+   ```
+
+## Running Transforms in Kubernetes/OpenShift
+
+Start in the `spark-guf` directory. To run the transforms in a Kubernetes or OpenShift cluster, follow these steps:
+
+1. Build and push a pyspark base docker image (this example assumes that images are pushed to Docker Hub, but the same approach can be used to push images to icr.io or quay.io):
+   ```
+   docker build -t my-docker-username/my-pyspark:3.5.1 .
+   docker push my-docker-username/my-pyspark:3.5.1
+   ```
+2. Build and push a specific transform image (this will use the pyspark image built in the previous step as the base image):
+   ```
+   docker build -t my-docker-username/my-pyspark-filter:3.5.1 -f src/transforms/spark_filter/Dockerfile --build-arg BASE_IMAGE=my-docker-username/my-pyspark:3.5.1 .
+   docker push my-docker-username/my-pyspark-filter:3.5.1
+   ```
+
+3. Configure the `spark` service account (note that you can use any other service account name, but you will then need to replace `spark` with `your-service-account-name` in all the yaml files listed below). This is a one-time process to perform for each namespace where you want to run spark apps:
+   ```
+   # create 'spark' service account
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-serviceaccount.yaml --namespace=my-namespace
+
+   # create 'spark' role
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role.yaml --namespace=my-namespace
+
+   # bind the 'spark' service account to the 'spark' role
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role-binding.yaml --namespace=my-namespace
+
+   # bind the 'spark' service account to the cluster roles
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-edit-role-binding.yaml --namespace=my-namespace
+   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-cluster-role-binding.yaml --namespace=my-namespace
+   ```
+
+4. Create any secrets that are needed to access S3 folders used for input or output of the transforms. Follow [this link](https://github.com/aws-samples/machine-learning-using-k8s/blob/master/docs/aws-creds-secret.md) for more information on how to build the S3 secrets.
+
+5. Edit a pod yaml file from the `deployment/kubernetes/pods` directory. The steps below refer to the [yaml file used to build the filter pod](deployment/kubernetes/pods/spark-driver-pod-filter.yaml):
+   1. Give a name to the pod (`metadata/name`), the container launched inside the pod (`spec/containers/name`), and the Spark application (the `APP_NAME` variable in `spec/containers/env`).
+   2. Specify the namespace where the pod will be created (`metadata/namespace`), and use the same namespace for the `EXECUTOR_NAMESPACE` variable in `spec/containers/env`.
+   3. Specify the command to launch the Spark application (in `spec/containers/args`).
+   4. Specify the image used by the driver (`spec/containers/image` - usually this is the transform image built under point 2).
+   5. Specify the image used by the executors (the `EXECUTOR_DOCKER_IMAGE` variable in `spec/containers/env`).
+   6. Specify the service account to use by the driver (`spec/containers/serviceAccount`) and by the executors (the `SERVICE_ACCOUNT` variable in `spec/containers/env`).
+   7. Configure S3:
+      1. Specify the input (`AWS_ENDPOINT_URL_IN`) and output (`AWS_ENDPOINT_URL_OUT`) endpoint URLs.
+      2. Specify the input and output access key IDs and secret access keys.
+
+6. Launch the Spark application by creating the driver pod:
+   ```
+   kubectl apply -f deployment/kubernetes/pods/spark-driver-pod-filter.yaml
+   ```
+
+7. Monitor the creation of the executor pods:
+   ```
+   kubectl get pods -w
+   ```
+
+8. Monitor the driver logs:
+   ```
+   kubectl logs spark-driver-pod-filter -f
+   ```
diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml
new file mode 100644
index 000000000..d9579e0c7
--- /dev/null
+++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Pod
+metadata:
+spec:
+  imagePullSecrets:
+    - name: prod-all-icr-io
+  securityContext:
+    fsGroup: 0
diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml
new file mode 100644
index 000000000..eeddbd694
--- /dev/null
+++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml
@@ -0,0 +1,14 @@
+spark.app.name: ${APP_NAME}
+spark.driver.memory: ${DRIVER_MEMORY}
+spark.executor.instances: ${NUM_EXECUTORS}
+spark.executor.memory: ${EXECUTOR_MEMORY}
+spark.executor.cores: ${EXECUTOR_CORES}
+spark.sql.shuffle.partitions: ${NUM_TASKS}
+spark.task.cpus: ${TASK_CPUS}
+spark.sql.legacy.parquet.nanosAsLong: true
+spark.executor.decommission.forceKillTimeout: "10h"
+# spark.sql.files.ignoreCorruptFiles: true
+# configuration needed when running in kubernetes
+spark.kubernetes.authenticate.driver.serviceAccountName: ${SERVICE_ACCOUNT}
+spark.kubernetes.container.image: ${EXECUTOR_DOCKER_IMAGE}
+spark.kubernetes.namespace: ${EXECUTOR_NAMESPACE}
diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml
new file mode 100644
index 000000000..dcf1f48e2
--- /dev/null
+++ b/transforms/universal/fdedup/spark/pyproject.toml
@@ -0,0 +1,42 @@
+[project]
+name = "dpk_fdedup_transform_spark"
+version = "0.3.0.dev0"
+requires-python = ">=3.10"
+description = "Fuzzy Dedup Spark Transform"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" },
+    { name = "Constantin Adam", email = "cmadam@us.ibm.com" },
+]
+dependencies = [
+    "dpk_fdedup_transform_python==0.3.0.dev0",
+    "data-prep-toolkit-spark==0.2.2.dev0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.0.1",
+]
+
+[options]
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]
diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt
new file mode 100644
index 000000000..10f3e129b
--- /dev/null
+++ b/transforms/universal/fdedup/spark/requirements.txt
@@ -0,0 +1,10 @@
+pyarrow
+pyyaml
+boto3
+kubernetes
+polars
+disjoint-set
+scipy
+numpy
+sentencepiece
+mmh3
diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py
new file mode 100644
index 000000000..83498f59e
--- /dev/null
+++ b/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py
@@ -0,0 +1,33 @@
+# (C) Copyright IBM
Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +if __name__ == "__main__": + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py new file mode 100644 index 000000000..afb8c51b7 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the SparkTransformLauncher. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup signature calculation transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_spark.py new file mode 100644 index 000000000..7b6bd626d --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_spark.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +if __name__ == "__main__": + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py new file mode 100644 index 000000000..03976bac8 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -0,0 +1,102 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+from typing import Any
+
+from data_cleaning_transform import DataCleaningTransformConfiguration
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.transform import TransformStatistics
+from data_processing.utils import get_logger
+from data_processing_spark.runtime.spark import (
+    DefaultSparkTransformRuntime,
+    SparkTransformLauncher,
+    SparkTransformRuntimeConfiguration,
+)
+
+
+logger = get_logger(__name__)
+
+
+class DataCleaningSparkRuntime(DefaultSparkTransformRuntime):
+    """
+    Data cleaning runtime support for Spark
+    """
+
+    def __init__(self, params: dict[str, Any]):
+        super().__init__(params=params)
+        self.logger = get_logger(__name__)
+
+    def get_transform_config(
+        self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
+    ) -> dict[str, Any]:
+        """
+        Download the table of duplicate document ids that will be provided to the
+        filtering/annotation method. This is the opportunity for this runtime to
+        create a new set of configuration based on the config/params provided to
+        this instance's initializer, adding any configuration data that the
+        transform expects in its initializer and/or transform() methods.
+        :param partition: partition number
+        :param data_access_factory - data access factory class being used by the Spark runtime
+        :param statistics - reference to the statistics object
+        :return: dictionary of transform init params
+        """
+        duplicate_list_location = self.params["duplicate_list_location"]
+        data_access = data_access_factory.create_data_access()
+        if duplicate_list_location.startswith("s3://"):
+            _, duplicate_list_location = duplicate_list_location.split("://")
+        self.duplicate_list, retries = data_access.get_file(duplicate_list_location)
+        return self.params | {"df": self.duplicate_list}
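The value stored under the `df` key is the raw parquet content returned by `data_access.get_file()`. A transform receiving this configuration could materialize it along these lines (a sketch only - the actual consumer is the data cleaning transform in `data_cleaning_transform.py`, which is not part of this diff; polars is already listed in `requirements.txt`):

```python
import io

import polars as pl


def load_duplicate_ids(config: dict) -> pl.DataFrame:
    # "df" holds the bytes of docs_to_remove_consolidated.parquet,
    # as downloaded by get_transform_config()/get_bcast_params() above
    return pl.read_parquet(io.BytesIO(config["df"]))
```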
+
+
+class DataCleaningSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+    """
+    Implements the SparkTransformConfiguration for Fuzzy Dedup Data Cleaning
+    as required by the SparkTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(
+            transform_config=DataCleaningTransformConfiguration(),
+            runtime_class=DataCleaningSparkRuntime,
+        )
+
+    def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
+        """
+        Download the table of duplicate document ids that will be provided to the
+        filtering/annotation method, so that it can be broadcast to all the workers.
+        :param data_access_factory - data access factory class being used by the Spark runtime
+        :return: dictionary of parameters to be broadcast
+        """
+        duplicate_list_location = self.transform_config.params["duplicate_list_location"]
+        data_access = data_access_factory.create_data_access()
+        if duplicate_list_location.startswith("s3://"):
+            _, duplicate_list_location = duplicate_list_location.split("://")
+        self.duplicate_list, retries = data_access.get_file(duplicate_list_location)
+        return {"df": self.duplicate_list}
+
+
+if __name__ == "__main__":
+    # create launcher
+    launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration())
+    logger.info("Launching fuzzy dedup data cleaning transform")
+    # Launch the spark worker(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py
new file mode 100644
index 000000000..58a43a736
--- /dev/null
+++ b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py
@@ -0,0 +1,261 @@
+import argparse
+import os
+import socket
+import time
+import traceback
+from datetime import datetime
+
+import polars as pl
+import yaml
+from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
+from data_processing.utils import ParamsUtils, get_logger
+from file_copy_util import FileCopyUtil
+from pyspark.sql import SparkSession
+
+
+logger = get_logger(__name__)
+
+
+class FileCopySpark:
+    def __init__(self, root_folder: str, num_bands: int, num_segments: int, use_s3: bool):
+        self.root_folder = root_folder
+        self.num_bands = num_bands
+        self.num_segments = num_segments
+        self.use_s3 = use_s3
+        self.subdirs = [f"band={b}/segment={s}" for b in range(num_bands) for s in range(num_segments)]
+
+    def _init_spark(self, app_name: str = "copy-app") -> SparkSession:
+        server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1"))
+        if server_port_https == -1:
+            # we are running locally
+            spark_config = {"spark.driver.host": "127.0.0.1"}
+            return SparkSession.builder.appName(app_name).config(map=spark_config).getOrCreate()
+        else:
+            # we are running in Kubernetes, use spark_profile.yml and
+            # environment variables for configuration
+
+            server_port = os.environ["KUBERNETES_SERVICE_PORT"]
+            master_url = f"k8s://https://kubernetes.default:{server_port}"
+
+            # Read Spark configuration profile
+            config_filepath = os.path.abspath(
+                os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml")
+            )
+            with open(config_filepath, "r") as config_fp:
+                spark_config = yaml.safe_load(os.path.expandvars(config_fp.read()))
+            spark_config["spark.submit.deployMode"] = "client"
+
+            # configure the executor pods from template
+            executor_pod_template_file = os.path.join(
+                os.getenv("SPARK_HOME"),
+                "work-dir",
+                "src",
+                "templates",
+                "spark-executor-pod-template.yml",
+            )
+            spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file
+            spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always"
+
+            # Pass the driver IP address to the workers for callback
+            myservice_url = socket.gethostbyname(socket.gethostname())
+            spark_config["spark.driver.host"] = myservice_url
+            spark_config["spark.driver.bindAddress"] = "0.0.0.0"
+
+            spark_config["spark.decommission.enabled"] = True
+            logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}")
+            app_name = spark_config.get("spark.app.name", "my-spark-app")
+            return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate()
+
+    def create_data_access_factory(self, root_folder: str, use_s3: bool) -> DataAccessFactoryBase:
+        input_folder = root_folder
+        output_folder = root_folder
+        data_access_factory: DataAccessFactoryBase = DataAccessFactory()
+        daf_args = []
+        if use_s3:
+            s3_creds = {
+                "access_key": os.getenv("AWS_ACCESS_KEY_ID"),
+                "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+                "url": os.getenv("AWS_ENDPOINT_URL"),
+            }
+            s3_config = {
+                "input_folder": root_folder,
+                "output_folder": root_folder,
+            }
+            daf_args.append("--data_s3_cred")
+            daf_args.append(ParamsUtils.convert_to_ast(s3_creds))
+            daf_args.append("--data_s3_config")
+            daf_args.append(ParamsUtils.convert_to_ast(s3_config))
+        else:
+            local_config = {
+                "input_folder": root_folder,
+                "output_folder": os.path.join(root_folder, "bands_consolidated"),
+            }
+            daf_args.append("--data_local_config")
+            daf_args.append(ParamsUtils.convert_to_ast(local_config))
+        daf_parser = argparse.ArgumentParser()
+        data_access_factory.add_input_params(parser=daf_parser)
+        data_access_factory_args = daf_parser.parse_args(args=daf_args)
+        data_access_factory.apply_input_params(args=data_access_factory_args)
+
+        return data_access_factory
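`orchestrate()` below fans `FileCopyUtil.copy_data()` calls out over the band/segment subfolders. The same utility can also be driven directly, without Spark, mirroring the calls made later in `fuzzy_dedup_spark.py` (a sketch; folder names are illustrative):

```python
from file_copy_util import FileCopyUtil

# build the factory exactly as the driver code does
fcs = FileCopySpark(root_folder="output", num_bands=14, num_segments=2, use_s3=False)
data_access_factory = fcs.create_data_access_factory("output", use_s3=False)

# consolidate the docs_to_remove folder in-process
stats: dict = {}
fcu = FileCopyUtil(data_access_factory=data_access_factory, config={"root_folder": "output"}, stats=stats)
fcu.copy_data(subfolder_name="docs_to_remove", data_type="docs_to_remove")
```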
Using {num_partitions} partitions") + stats_rdd = source_rdd.zipWithIndex().mapPartitions(process_partition) + # build overall statistics + stats = dict(stats_rdd.reduceByKey(lambda a, b: a + b).collect()) + return_code = 0 + status = "success" + except Exception as e: + # process execution exception + logger.error(f"Exception during execution {e}: {traceback.print_exc()}") + return_code = 1 + status = "failure" + stats = {} + try: + # build and save metadata + logger.debug("Building job metadata") + input_params = runtime_config + # input_params = runtime_config.get_transform_metadata() | execution_config.get_input_params() + metadata = { + "job details": { + "start_time": start_ts, + "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "status": status, + }, + "job_input_params": input_params | data_access_factory.get_input_params(), + "execution_stats": { + "num partitions": num_partitions, + "execution time, min": (time.time() - start_time) / 60, + }, + "job_output_stats": stats, + } + logger.debug(f"Saving job metadata: {metadata}.") + + if data_access_factory.s3_config is not None: + _, root_folder = self.root_folder.split("://") + in_path = os.path.join(root_folder, "bands") + out_path = os.path.join(root_folder, "bands_consolidated") + data_access.input_folder = f"{in_path}{os.sep}" + data_access.output_folder = f"{out_path}{os.sep}" + else: + data_access.input_folder = os.path.join(self.root_folder, "bands") + data_access.output_folder = os.path.join(self.root_folder, "bands_consolidated") + data_access.save_job_metadata(metadata) + logger.debug("Saved job metadata.") + return return_code + except Exception as e: + logger.error(f"Exception during execution {e}: {traceback.print_exc()}") + return 1 + finally: + # stop spark context at the end. Required for running multiple tests + spark_session.stop() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--root_folder", + type=str, + default="/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_second/", + help="root folder", + ) + parser.add_argument( + "--num_bands", + type=int, + default=14, + help="number of bands", + ) + parser.add_argument( + "--num_segments", + type=int, + default=2, + help="number of segments", + ) + parser.add_argument( + "--data_type", + type=str, + default="docs_to_remove", + help="bands or doc2remove", + ) + parser.add_argument( + "--parallelization", + type=int, + default=-1, + help="spark parallelization", + ) + parser.add_argument( + "--use_s3", + type=bool, + default=False, + help="use s3", + ) + args = parser.parse_args() + fcs = FileCopySpark(args.root_folder, args.num_bands, args.num_segments, args.use_s3) + data_access_factory = fcs.create_data_access_factory(args.root_folder, args.use_s3) + app_config = {"root_folder": args.root_folder} + execution_config = {"parallelization": args.parallelization} if args.parallelization > 0 else {} + status = fcs.orchestrate(app_config, execution_config, data_access_factory, args.data_type) + print(f"Orchestrate concluded with status {status}") diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py new file mode 100644 index 000000000..6d0e090e4 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -0,0 +1,205 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import argparse +import logging +import os +import sys +from typing import Union + +import polars as pl +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from file_copy_util import FileCopyUtil +from file_copy_util_spark import FileCopySpark +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +args_map = { + "minhash": [ + "document_id_column", + "contents_column", + "seed", + "num_permutations", + "num_bands", + "num_minhashes_per_band", + "jaccard_similarity_threshold", + "word_shingle_size", + "num_segments", + ], + "copyutil": [ + "subfolder_name", + "data_type", + "num_bands", + "num_segments", + "parallelization", + "use_s3", + ], + "cluster": [ + "jaccard_similarity_threshold", + ], + "fdclean": [ + "document_id_column", + "duplicate_list_location", + ], +} + + +def get_arguments(in_args: argparse.Namespace, module_name: str) -> Union[list, dict]: + sys_argv = ["python"] + in_args_dict = vars(in_args) + if in_args.use_s3: + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + all_module_arguments = args_map.get(module_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + if module_name == "copyutil": + copy_util_config = {k: v for k, v in passed_args.items()} + copy_util_config["root_folder"] = in_args_dict["output_folder"] + return copy_util_config + else: + for k, v in passed_args.items(): + sys_argv.append(f"--{module_name}_{k}") + sys_argv.append(str(v)) + if module_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"]) + elif module_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands_consolidated") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif module_name == "fdclean": + if f"--{module_name}_duplicate_list_location" not in sys_argv: + sys_argv.append(f"--{module_name}_duplicate_list_location") + sys_argv.append( + os.path.join( + in_args_dict["output_folder"], + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + else: + logging.error(f"Unknown module name: {module_name}") + data_io = { + "input_folder": input_folder, + "output_folder": output_folder, + } + if in_args.use_s3: + sys_argv.append("--data_s3_config") + else: + 
sys_argv.append("--data_local_config") + sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + return sys_argv + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_folder", type=str, required=True, help="path to read the input files") + parser.add_argument("--output_folder", type=str, required=True, help="path to write the output files") + parser.add_argument( + "--use_s3", type=bool, required=False, default=False, help="if true, use S3, if false use local FS" + ) + parser.add_argument( + "--contents_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument( + "--document_id_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument( + "--num_permutations", type=int, required=True, help="number of permutations to use for minhash calculation" + ) + parser.add_argument( + "--num_bands", type=int, required=True, help="number of bands to use for band hash calculation" + ) + parser.add_argument( + "--num_minhashes_per_band", type=int, required=True, help="number of minhashes to use in each band" + ) + parser.add_argument( + "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" + ) + parser.add_argument( + "--jaccard_similarity_threshold", + type=float, + required=False, + help="jaccard similarity threshold above which two documents are similar", + ) + parser.add_argument( + "--num_segments", + type=int, + required=True, + help="number of segments to divide each band hash interval (to improve scalability)", + ) + parser.add_argument("--parallelization", type=int, required=False, default=-1, help="spark parallelization") + parser.add_argument( + "--duplicate_list_location", + type=str, + required=False, + help="path to the file with all the duplicate document ids", + ) + return parser.parse_args() + + +if __name__ == "__main__": + # configure logging + logging.basicConfig( + format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + args = parse_arguments() + sys.argv = get_arguments(args, "minhash") + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Signature calculation concluded with status {status}") + + fcs_config = get_arguments(args, "copyutil") + + root_folder = fcs_config["root_folder"] + parallelization = fcs_config["parallelization"] + fcs = FileCopySpark(root_folder, fcs_config["num_bands"], fcs_config["num_segments"], args.use_s3) + data_access_factory = fcs.create_data_access_factory(root_folder, args.use_s3) + app_config = {"root_folder": root_folder} + execution_config = {"parallelization": parallelization} if parallelization > 0 else {} + status = fcs.orchestrate(app_config, execution_config, data_access_factory, data_type="bands") + logging.info(f"Consolidate bands concluded with status {status}") + + sys.argv = get_arguments(args, "cluster") + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Cluster analysis concluded with status {status}") + + stats = {} + fcu_config = get_arguments(args, 
"copyutil") + fcu = FileCopyUtil(data_access_factory=data_access_factory, config=fcu_config, stats=stats) + fcu.copy_data(subfolder_name="docs_to_remove", data_type="docs_to_remove") + + sys.argv = get_arguments(args, "fdclean") + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + status = launcher.launch() + logging.info(f"Data cleanup concluded with status {status}") diff --git a/transforms/universal/fdedup/spark/src/requirements.txt b/transforms/universal/fdedup/spark/src/requirements.txt new file mode 100644 index 000000000..c1a1f2c3d --- /dev/null +++ b/transforms/universal/fdedup/spark/src/requirements.txt @@ -0,0 +1,8 @@ +pyspark +pyarrow +pyyaml +boto3 +kubernetes +disjoint_set +mmh3 +scipy diff --git a/transforms/universal/fdedup/spark/src/signature_calc_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_spark.py new file mode 100644 index 000000000..0e7046549 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_spark.py @@ -0,0 +1,35 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +if __name__ == "__main__": + sys.argv.append("--data_s3_cred") + s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), + } + sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py new file mode 100644 index 000000000..4e39810c6 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+from data_processing.utils import get_logger
+from data_processing_spark.runtime.spark import (
+    SparkTransformLauncher,
+    SparkTransformRuntimeConfiguration,
+)
+from signature_calc_transform import SignatureCalculationTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SignatureCalculationSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+    """
+    Implements the SparkTransformConfiguration for Fuzzy Dedup Signature Calculation
+    as required by the SparkTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=SignatureCalculationTransformConfiguration())
+
+
+if __name__ == "__main__":
+    # create launcher
+    launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration())
+    logger.info("Launching fuzzy dedup signature calculation transform")
+    # Launch the spark worker(s) to process the input
+    launcher.launch()

From 3349521bdfe3b1d95d8160cf442b722988c344be Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 10 Oct 2024 19:05:39 +0100
Subject: [PATCH 007/105] added folder_transform

---
 .../pure_python/transform_file_processor.py   | 15 ++++--
 .../pure_python/transform_orchestrator.py     | 42 ++++++++++------
 .../runtime/transform_file_processor.py       | 41 ++++++++-------
 .../src/data_processing/transform/__init__.py |  2 +
 .../transform/abstract_transform.py           | 16 ++++++
 .../transform/binary_transform.py             |  5 +-
 .../transform/folder_transform.py             | 50 +++++++++++++++++++
 .../runtime/ray/transform_file_processor.py   |  1 +
 .../runtime/ray/transform_orchestrator.py     | 19 ++++---
 .../runtime/spark/transform_file_processor.py |  5 +-
 .../runtime/spark/transform_orchestrator.py   | 25 +++++++---
 11 files changed, 168 insertions(+), 53 deletions(-)
 create mode 100644 data-processing-lib/python/src/data_processing/transform/abstract_transform.py
 create mode 100644 data-processing-lib/python/src/data_processing/transform/folder_transform.py

diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
index 143835dd0..fa3e69e4a 100644
--- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
+++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py
@@ -14,7 +14,7 @@
 
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.runtime import AbstractTransformFileProcessor
-from data_processing.transform import AbstractBinaryTransform, TransformStatistics
+from data_processing.transform import AbstractTransform, TransformStatistics
 from data_processing.utils import UnrecoverableException
 
 
@@ -28,7 +28,8 @@ def __init__(
         data_access_factory: DataAccessFactoryBase,
         statistics: TransformStatistics,
         transform_params: dict[str, Any],
-        transform_class: type[AbstractBinaryTransform],
+        transform_class: type[AbstractTransform],
+        is_folder: bool,
     ):
         """
         Init method
@@ -36,11 +37,13 @@ def __init__(
         :param statistics - reference to statistics class
         :param transform_params - transform parameters
         :param transform_class: transform class
+        :param is_folder: folder transform flag
         """
         # invoke superclass
         super().__init__(
             data_access_factory=data_access_factory,
             transform_parameters=dict(transform_params),
+            is_folder=is_folder,
         )
         self.transform_params["statistics"] = statistics
# Create local processor @@ -52,7 +55,8 @@ def __init__( # Create statistics self.stats = statistics - def _publish_stats(self, stats: dict[str, Any]) -> None: + +def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) @@ -65,17 +69,20 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool ): """ Init method :param data_access_factory - data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder tranform flag """ super().__init__( data_access_factory=data_access_factory, transform_parameters=dict(transform_params), + is_folder=is_folder, ) # Add data access and statistics to the processor parameters self.transform_params["data_access"] = self.data_access diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 8692da29e..153eaaf0a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,7 +24,7 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics +from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger @@ -48,8 +48,6 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } - - def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -74,15 +72,21 @@ def orchestrate( return 1 # create additional execution parameters runtime = runtime_config.create_transform_runtime() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - if retries > 0: - statistics.add_stats({"data access retries": retries}) - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + if retries > 0: + statistics.add_stats({"data access retries": retries}) + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -99,6 +103,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) else: # using sequential execution @@ -111,6 +116,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), transform_class=runtime_config.get_transform_class(), + is_folder=is_folder, ) status = "success" return_code = 0 @@ -157,7 +163,8 @@ def _process_transforms( data_access_factory: 
DataAccessFactoryBase, statistics: TransformStatistics, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], + is_folder: bool, ) -> None: """ Process transforms sequentially @@ -167,9 +174,8 @@ def _process_transforms( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform flag :return: metadata for the execution - - :return: None """ # create executor executor = PythonTransformFileProcessor( @@ -177,6 +183,7 @@ def _process_transforms( statistics=statistics, transform_params=transform_params, transform_class=transform_class, + is_folder=is_folder, ) # process data t_start = time.time() @@ -203,6 +210,7 @@ def _process_transforms_multiprocessor( data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], transform_class: type[AbstractBinaryTransform], + is_folder: bool ) -> TransformStatistics: """ Process transforms using multiprocessing pool @@ -212,13 +220,17 @@ def _process_transforms_multiprocessor( :param data_access_factory: data access factory :param transform_params - transform parameters :param transform_class: transform class + :param is_folder: folder transform class :return: metadata for the execution """ # result statistics statistics = TransformStatistics() # create processor processor = PythonPoolTransformFileProcessor( - data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class + data_access_factory=data_access_factory, + transform_params=transform_params, + transform_class=transform_class, + is_folder=is_folder, ) completed = 0 t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index d4ec548d8..1d268875f 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -26,11 +26,13 @@ def __init__( self, data_access_factory: DataAccessFactoryBase, transform_parameters: dict[str, Any], + is_folder: bool = False, ): """ Init method :param data_access_factory: Data Access Factory :param transform_parameters: Transform parameters + :param is_folder: folder transform flag """ self.logger = get_logger(__name__) # validate parameters @@ -46,6 +48,7 @@ def __init__( # Add data access and statistics to the processor parameters self.transform_params = transform_parameters self.transform_params["data_access"] = self.data_access + self.is_folder = is_folder def process_file(self, f_name: str) -> None: """ @@ -58,25 +61,29 @@ def process_file(self, f_name: str) -> None: self.logger.warning("No data_access found. Returning.") return t_start = time.time() - # Read source file - filedata, retries = self.data_access.get_file(path=f_name) - if retries > 0: - self._publish_stats({"data access retries": retries}) - if filedata is None: - self.logger.warning(f"File read resulted in None for {f_name}. 
Returning.") - self._publish_stats({"failed_reads": 1}) - return - self._publish_stats({"source_files": 1, "source_size": len(filedata)}) + if not self.is_folder: + # Read source file only if we are processing file + filedata, retries = self.data_access.get_file(path=f_name) + if retries > 0: + self._publish_stats({"data access retries": retries}) + if filedata is None: + self.logger.warning(f"File read resulted in None for {f_name}. Returning.") + self._publish_stats({"failed_reads": 1}) + return + self._publish_stats({"source_files": 1, "source_size": len(filedata)}) # Process input file try: - # execute local processing - name_extension = TransformUtils.get_file_extension(f_name) self.logger.debug(f"Begin transforming file {f_name}") - out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + if not self.is_folder: + # execute local processing + out_files, stats = self.transform.transform_binary(file_name=f_name, byte_array=filedata) + name_extension = TransformUtils.get_file_extension(f_name) + self.last_file_name = name_extension[0] + self.last_file_name_next_index = None + self.last_extension = name_extension[1] + else: + out_files, stats = self.transform.transform(folder_name=f_name) self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") - self.last_file_name = name_extension[0] - self.last_file_name_next_index = None - self.last_extension = name_extension[1] # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) # Process unrecoverable exceptions @@ -95,10 +102,10 @@ def flush(self) -> None: the hook for them to return back locally stored data and their statistics. :return: None """ - if self.last_file_name is None: + if self.last_file_name is None or self.is_folder: # for some reason a given worker never processed anything. Happens in testing # when the amount of workers is greater than the amount of files - self.logger.debug("skipping flush, no name for file is defined") + self.logger.debug("skipping flush, no name for file is defined or this is a folder transform") return try: t_start = time.time() diff --git a/data-processing-lib/python/src/data_processing/transform/__init__.py b/data-processing-lib/python/src/data_processing/transform/__init__.py index 6af43ad60..20254e47b 100644 --- a/data-processing-lib/python/src/data_processing/transform/__init__.py +++ b/data-processing-lib/python/src/data_processing/transform/__init__.py @@ -1,3 +1,5 @@ +from data_processing.transform.abstract_transform import AbstractTransform +from data_processing.transform.folder_transform import AbstractFolderTransform from data_processing.transform.binary_transform import AbstractBinaryTransform from data_processing.transform.table_transform import AbstractTableTransform from data_processing.transform.transform_statistics import TransformStatistics diff --git a/data-processing-lib/python/src/data_processing/transform/abstract_transform.py b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py new file mode 100644 index 000000000..89db70f42 --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/abstract_transform.py @@ -0,0 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +class AbstractTransform: + """ + Base class for all transform types + """ \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/binary_transform.py b/data-processing-lib/python/src/data_processing/transform/binary_transform.py index 80dff61ea..b313aff2f 100644 --- a/data-processing-lib/python/src/data_processing/transform/binary_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/binary_transform.py @@ -10,10 +10,11 @@ # limitations under the License. ################################################################################ -from typing import Any, TypeVar +from typing import Any +from data_processing.transform import AbstractTransform -class AbstractBinaryTransform: +class AbstractBinaryTransform(AbstractTransform): """ Converts input binary file to output file(s) (binary) Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py new file mode 100644 index 000000000..866e3286f --- /dev/null +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any +from data_processing.data_access import data_access +from data_processing.transform import AbstractTransform + + +class AbstractFolderTransform(AbstractTransform): + """ + Converts input folder to output file(s) (binary) + Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or + more new binary files and metadata. + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This simply stores the given instance in this instance for later use. + """ + self.config = config + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + """ + Converts input folder into o or more output files. + If there is an error, an exception must be raised - exit()ing is not generally allowed. + :param folder_name: the name of the folder containing arbitrary amount of files. + :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated + to metadata. 
Each element of the return list, is a tuple of the transformed bytes and a string + holding the extension to be used when writing out the new bytes. + """ + raise NotImplemented() + + @staticmethod + def get_folders(data_access:data_access) -> list(str): + """ + Compute the list of folders to use. + :param data_access - data access class + :return: + """ + raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py index e1fabb144..cdad1309f 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_file_processor.py @@ -35,6 +35,7 @@ def __init__(self, params: dict[str, Any]): super().__init__( data_access_factory=params.get("data_access_factory", None), transform_parameters=dict(params.get("transform_params", {})), + is_folder=params.get("is_folder", False) ) # Create statistics self.stats = params.get("statistics", None) diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 42eba47a6..8276eb56c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -16,6 +16,7 @@ import ray from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import AbstractFolderTransform from data_processing_ray.runtime.ray import ( RayTransformExecutionConfiguration, RayTransformFileProcessor, @@ -56,13 +57,18 @@ def orchestrate( # create transformer runtime runtime = runtime_config.create_transform_runtime() resources = RayUtils.get_cluster_resources() + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) if print_interval == 0: @@ -84,6 +90,7 @@ def orchestrate( data_access_factory=data_access_factory, statistics=statistics, files=files ), "statistics": statistics, + "is_folder": is_folder, } logger.debug("Creating actors") processors = RayUtils.create_actors( diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py index d63664ac4..a0968ab1d 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_file_processor.py @@ -29,12 +29,15 @@ def __init__( data_access_factory: DataAccessFactoryBase, runtime_configuration: 
SparkTransformRuntimeConfiguration, statistics: TransformStatistics, + is_folder: bool, ): """ Init method """ super().__init__( - data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params() + data_access_factory=data_access_factory, + transform_parameters=runtime_configuration.get_transform_params(), + is_folder=is_folder, ) # Add data access ant statistics to the processor parameters self.runtime_configuration = runtime_configuration diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c279f2b73..c534b685f 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -18,7 +18,7 @@ import yaml from data_processing.data_access import DataAccessFactoryBase -from data_processing.transform import TransformStatistics +from data_processing.transform import TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger from data_processing_spark.runtime.spark import ( SparkTransformExecutionConfiguration, @@ -117,7 +117,10 @@ def process_partition(iterator): runtime = runtime_conf.create_transform_runtime() # create file processor file_processor = SparkTransformFileProcessor( - data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics + data_access_factory=d_access_factory, + runtime_configuration=runtime_conf, + statistics=statistics, + is_folder=is_folder, ) first = True for f in iterator: @@ -144,13 +147,19 @@ def process_partition(iterator): return list(statistics.get_execution_stats().items()) num_partitions = 0 + is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform) try: - # Get files to process - files, profile, retries = data_access.get_files_to_process() - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}, source profile {profile}") + if is_folder: + # folder transform + files = AbstractFolderTransform.get_folders(data_access=data_access) + logger.info(f"Number of folders is {len(files)}") # Get files to process + else: + # Get files to process + files, profile, retries = data_access.get_files_to_process() + if len(files) == 0: + logger.error("No input files to process - exiting") + return 0 + logger.info(f"Number of files is {len(files)}, source profile {profile}") # process data logger.debug("Begin processing files") # process files split by partitions From 0553edf9d5a6d9507a470927b14f5c65b7ec8773 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 19:13:01 +0100 Subject: [PATCH 008/105] added folder_transform --- .../runtime/pure_python/transform_orchestrator.py | 2 +- .../python/src/data_processing/transform/folder_transform.py | 4 ++-- .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +- .../runtime/spark/transform_orchestrator.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 153eaaf0a..d51f80a8a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ 
b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. - :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c534b685f..4a0897952 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From a53412ecb5a00535dd85c56939c2d2fa4542c14a Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 009/105] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- .../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- 
a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def _publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: 
DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. - :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From 9c3ace785b9a529e047df93ed9e65d27bf3d7ba0 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 08:48:00 +0100 Subject: [PATCH 010/105] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 
4a0897952..096fab272 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,8 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index 7b968b1e9..7410d09d1 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From 7091a2e6087c77d5b204c803917f97b60d974310 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 15:35:00 +0100 Subject: [PATCH 011/105] added noop testing --- .../runtime/transform_file_processor.py | 44 +++++--- .../test_support/transform/__init__.py | 13 ++- .../transform/noop_folder_transform.py | 105 ++++++++++++++++++ .../test_support/transform/noop_transform.py | 6 +- .../transform/folder_transform.py | 2 +- .../transform/transform_configuration.py | 6 +- .../transform/test_folders_noop.py | 33 ++++++ .../launch/ray/ray_test_noop_launch.py | 6 - .../ededup/ray/src/ededup_transform_ray.py | 9 +- 9 files changed, 187 insertions(+), 37 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index 1d268875f..4075f40be 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -83,6 +83,7 @@ def process_file(self, f_name: str) -> None: self.last_extension = name_extension[1] else: out_files, stats = self.transform.transform(folder_name=f_name) + self.last_file_name = f_name self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) @@ -148,15 +149,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats ) case 1: # we have exactly 1 output file - file_ext = out_files[0] - lfn = self.last_file_name - if 
self.last_file_name_next_index is not None:
-                    lfn = f"{lfn}_{self.last_file_name_next_index}"
-                    output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
+                if self.is_folder:
+                    # it's a folder
+                    output_name = out_files[0][1]
+                    dt = out_files[0][0]
+                else:
+                    file_ext = out_files[0]
+                    lfn = self.last_file_name
+                    if self.last_file_name_next_index is not None:
+                        lfn = f"{lfn}_{self.last_file_name_next_index}"
+                    output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
+                    dt = file_ext[0]
                 self.logger.debug(
                     f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
                 )
-                save_res, retries = self.data_access.save_file(path=output_name, data=file_ext[0])
+                save_res, retries = self.data_access.save_file(path=output_name, data=dt)
                 if retries > 0:
                     self._publish_stats({"data access retries": retries})
                 if save_res is None:
@@ -166,7 +173,7 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
                 self._publish_stats(
                     {
                         "result_files": 1,
-                        "result_size": len(file_ext[0]),
+                        "result_size": len(dt),
                         "processing_time": time.time() - t_start,
                     }
                 )
@@ -183,14 +190,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
                 start_index = 0
                 count = len(out_files)
                 for index in range(count):
-                    file_ext = out_files[index]
-                    output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
-                    file_sizes += len(file_ext[0])
-                    self.logger.debug(
-                        f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
-                        f"of {count} to {output_name_indexed}"
-                    )
-                    save_res, retries = self.data_access.save_file(path=output_name_indexed, data=file_ext[0])
+                    if self.is_folder:
+                        # it's a folder
+                        output_name_indexed = out_files[index][1]
+                        dt = out_files[index][0]
+                    else:
+                        # files
+                        file_ext = out_files[index]
+                        output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
+                        self.logger.debug(
+                            f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
+                            f"of {count} to {output_name_indexed}"
+                        )
+                        dt = file_ext[0]
+                    file_sizes += len(dt)
+                    save_res, retries = self.data_access.save_file(path=output_name_indexed, data=dt)
                     if retries > 0:
                         self._publish_stats({"data access retries": retries})
                     if save_res is None:
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
index 0e90f7ffd..04d6f3b0f 100644
--- a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
@@ -1,6 +1,11 @@
-from .table_transform_test import AbstractTableTransformTest
-from .binary_transform_test import AbstractBinaryTransformTest
-from .noop_transform import (
+from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest
+from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
+from data_processing.test_support.transform.noop_transform import (
     NOOPTransform,
-    NOOPPythonTransformConfiguration,
+    NOOPTransformConfiguration,
+    NOOPPythonTransformConfiguration
 )
+from data_processing.test_support.transform.noop_folder_transform import (
+    NOOPFolderTransform,
+    NOOPFolderPythonTransformConfiguration
+)
\ No newline at end of file
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py
new file mode 100644
index 000000000..5baab7858
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py
@@ -0,0 +1,104 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import time
+from typing import Any
+
+from data_processing.data_access import DataAccess
+from data_processing.runtime.pure_python import (
+    PythonTransformLauncher,
+    PythonTransformRuntimeConfiguration,
+    DefaultPythonTransformRuntime)
+from data_processing.transform import AbstractFolderTransform
+from data_processing.utils import get_logger
+from data_processing.test_support.transform import NOOPTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class NOOPFolderTransform(AbstractFolderTransform):
+    """
+    Implements a simple copy of the files in a folder.
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments
+        defined by the companion runtime, NOOPFolderPythonRuntime.
+        """
+        # Make sure that the param name corresponds to the name used in apply_input_params method
+        # of NOOPTransformConfiguration class
+        super().__init__(config)
+        self.sleep = config.get("sleep_sec", 1)
+        self.data_access = config.get("data_access")
+
+    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        Converts input folder into 0 or more output files.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed.
+        :param folder_name: the name of the folder containing an arbitrary number of files.
+        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
+                 to metadata. Each element of the return list is a tuple of the transformed bytes and a string
+                 holding the file name to use.
+        """
+        logger.debug(f"Transforming one folder {folder_name}")
+        metadata = {}
+        # get folder files
+        files, retries = self.data_access.get_folder_files(path=folder_name)
+        if retries > 0:
+            metadata |= {"data access retries": retries}
+        result = [()] * len(files)
+        index = 0
+        for name, file in files.items():
+            result[index] = (file, self.data_access.get_output_location(name))
+            if self.sleep is not None:
+                logger.info(f"Sleep for {self.sleep} seconds")
+                time.sleep(self.sleep)
+                logger.info("Sleep completed - continue")
+            index += 1
+        # Add some sample metadata.
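+        # The statistics dictionary returned here is propagated into the job
+        # metadata, so "nfiles" records how many files this folder contributed.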
+ metadata |= {"nfiles": len(files)} + return result, metadata + + +class NOOPFolderPythonRuntime(DefaultPythonTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py index 0dee013a4..2fea35506 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python.runtime_configuration import ( PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform from data_processing.utils import CLIArgumentProvider, get_logger @@ -75,10 +75,10 @@ class NOOPTransformConfiguration(TransformConfiguration): configuration with CLI args. """ - def __init__(self): + def __init__(self, clazz: type[AbstractTransform] = NOOPTransform): super().__init__( name=short_name, - transform_class=NOOPTransform, + transform_class=clazz, remove_from_metadata=[pwd_key], ) diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 9a2fb3713..caa3bfa52 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -35,6 +35,6 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str :param folder_name: the name of the folder containing arbitrary amount of files. :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated to metadata. Each element of the return list, is a tuple of the transformed bytes and a string - holding the extension to be used when writing out the new bytes. + holding the file name to use. 
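+                 For example, a pass-through implementation can return one
+                 (bytes, data_access.get_output_location(name)) pair per input
+                 file, as the NOOPFolderTransform in test_support does.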
""" raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py index 033e92f2a..a5c9ec9ad 100644 --- a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py @@ -13,7 +13,7 @@ from argparse import ArgumentParser from typing import Any -from data_processing.transform import AbstractBinaryTransform +from data_processing.transform import AbstractTransform from data_processing.utils import CLIArgumentProvider @@ -23,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider): """ def __init__( - self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = [] + self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = [] ): """ Initialization @@ -36,7 +36,7 @@ def __init__( self.remove_from_metadata = remove_from_metadata self.params = {} - def get_transform_class(self) -> type[AbstractBinaryTransform]: + def get_transform_class(self) -> type[AbstractTransform]: """ Get the class extending AbstractBinaryTransform which implements a specific transformation. The class will generally be instantiated with a dictionary of configuration produced by diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py new file mode 100644 index 000000000..e0fdd86c8 --- /dev/null +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.transform import NOOPFolderPythonTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../test-data/data_processing/python/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")] + return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py index d4cc874f0..e706a4dfa 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py @@ -12,7 +12,6 @@ import os -import pyarrow as pa from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -20,11 +19,6 @@ from data_processing_ray.test_support.transform import NOOPRayTransformConfiguration -table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) -expected_table = table # We're a noop after all. -expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result - - class TestRayNOOPTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. diff --git a/transforms/universal/ededup/ray/src/ededup_transform_ray.py b/transforms/universal/ededup/ray/src/ededup_transform_ray.py index c0823a22e..d90dfa780 100644 --- a/transforms/universal/ededup/ray/src/ededup_transform_ray.py +++ b/transforms/universal/ededup/ray/src/ededup_transform_ray.py @@ -149,13 +149,12 @@ def _load_snapshots(self, data_access_factory: DataAccessFactoryBase, statistics statistics.add_stats.remote({"data access retries": retries}) self.logger.info(f"Found the following snapshot files {files.keys()}") # process snapshot files - for file in files.keys(): - # load the file + for file in files.values(): + # convert the file try: - b_hashes, _ = data_access.get_file(file) - snaps = pickle.loads(b_hashes) + snaps = pickle.loads(file) except Exception as e: - self.logger.warning(f"Failed to load hashes from file {file} with exception {e}") + self.logger.warning(f"Failed to load hashes with exception {e}") raise UnrecoverableException("failed to load hashes") request = [[] for _ in range(len(self.filters))] for h in snaps: From 680c78ac3f38724dfcf646673aae2ac3661107be Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 11 Oct 2024 10:47:42 -0400 Subject: [PATCH 012/105] Fuzzy dedup ray implementation Signed-off-by: nelson --- .../universal/fdedup/ray/pyproject.toml | 10 +- .../ray/src/cluster_analysis_local_ray.py | 51 ++ .../ray/src/cluster_analysis_transform_ray.py | 42 + .../fdedup/ray/src/compute_shingles.py | 50 -- ...ocal_ray.py => data_cleaning_local_ray.py} | 61 +- .../ray/src/data_cleaning_transform_ray.py | 120 +++ .../universal/fdedup/ray/src/fdedup_s3_ray.py | 76 -- .../fdedup/ray/src/fdedup_support.py | 621 -------------- .../fdedup/ray/src/fdedup_transform_ray.py | 803 ------------------ .../ray/src/signature_calc_local_ray.py | 54 ++ .../ray/src/signature_calc_transform_ray.py | 42 + 11 files changed, 340 insertions(+), 1590 deletions(-) create mode 100644 transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py create mode 100644 transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py delete mode 100644 
transforms/universal/fdedup/ray/src/compute_shingles.py rename transforms/universal/fdedup/ray/src/{fdedup_local_ray.py => data_cleaning_local_ray.py} (59%) create mode 100644 transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_s3_ray.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_support.py delete mode 100644 transforms/universal/fdedup/ray/src/fdedup_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/src/signature_calc_local_ray.py create mode 100644 transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 3f2c8ba51..e2a2d34c9 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,20 +1,18 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2.dev0" +version = "0.3.0.dev0" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ + "dpk_fdedup_transform_python==0.3.0.dev0", "data-prep-toolkit-ray==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", "tqdm==4.66.3", - "scipy==1.12.0" ] [build-system] diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py new file mode 100644 index 000000000..25b96788d --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -0,0 +1,51 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 3,
+    # execution info
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+}
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
new file mode 100644
index 000000000..970686e13
--- /dev/null
+++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
@@ -0,0 +1,41 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from cluster_analysis_transform import ClusterAnalysisTransformConfiguration
+from data_processing.utils import CLIArgumentProvider, get_logger
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from data_processing_ray.runtime.ray.runtime_configuration import (
+    RayTransformRuntimeConfiguration,
+)
+
+
+logger = get_logger(__name__)
+
+
+class ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for the cluster analysis step
+    as required by the RayTransformLauncher. Cluster analysis does not use a
+    RayRuntime class, so the superclass only needs the base configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=ClusterAnalysisTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration())
+    logger.info("Launching transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/compute_shingles.py b/transforms/universal/fdedup/ray/src/compute_shingles.py
deleted file mode 100644
index 2db75ebe2..000000000
--- a/transforms/universal/fdedup/ray/src/compute_shingles.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an “AS IS” BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################ - -import string - - -""" -This implements the most simplistic splitting of document based on the white spaces -that can be overwritten by a different document splitter (tokenizer). This method is -build in the library and can be overwritten using approach described at -https://stackoverflow.com/questions/37553545/how-do-i-override-a-function-of-a-python-library - -import compute_shingles -compute_shingles.compute_shingles = my_local_compute_shingles -""" - - -def _find(s: str, ch: str) -> list[int]: - """ - Get indexes of all locations of character in string - :param s: string - :param ch: character - :return: list of locations - """ - return [i for i, ltr in enumerate(s) if ltr == ch] - - -def compute_shingles(txt: str, word_shingle_size: int, delimiter: str = " ") -> list[str]: - """ - Generate word shingles - :param txt: document - :param delimiter: delimiter to split document - :param word_shingle_size: size of shingle in words - :return: list of shingles - """ - text = txt.replace("\n", "").lower().translate(str.maketrans("", "", string.punctuation)) - separators = _find(text, delimiter) - if len(separators) + 1 <= word_shingle_size: - return [text] - bounds = [-1] + separators + [len(text)] - return [text[bounds[i] + 1 : bounds[i + word_shingle_size]] for i in range(0, len(bounds) - word_shingle_size)] diff --git a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py similarity index 59% rename from transforms/universal/fdedup/ray/src/fdedup_local_ray.py rename to transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index af7bec71c..54fa2ccac 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -13,59 +13,52 @@ import os import sys +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ) +) worker_options = {"num_cpus": 0.8} + code_location = {"github": "github", "commit_hash": "12345", "path": "path"} params = { # where to run "run_locally": True, # Data access. 
Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 1, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_creation_delay": 0, "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, } -sys.argv = ParamsUtils.dict_to_req(d=params) -# launch -launcher.launch() + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py new file mode 100644 index 000000000..9fdb220f7 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -0,0 +1,120 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +import ray +from data_cleaning_transform import ( + DataCleaningTransform, + DataCleaningTransformConfiguration, + docs2remove_list, + docs2remove_list_key, + get_docs_to_remove, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, +) +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from ray.actor import ActorHandle + + +logger = get_logger(__name__) + + +class DataCleaningRayTransform(DataCleaningTransform): + """ """ + + def __init__(self, config: dict): + """ + Initialize based on the dictionary of configuration information. 
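+        (The runtime publishes the docs-to-remove list to the Ray object store;
+        the reference stored under docs2remove_list_key is resolved with ray.get below.)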
+        This is generally called with configuration parsed from the CLI arguments
+        defined by the companion runtime, DataCleaningRuntime.
+        """
+        docs2remove = config.get(docs2remove_list_key, None)
+        if docs2remove is not None:
+            # This is the recommended approach for production. In this case the docs-to-remove list is built by the
+            # runtime once, loaded to the object store and can be accessed by actors without additional reads
+            try:
+                config[docs2remove_list_key] = ray.get(config.get(docs2remove_list_key))
+            except Exception as e:
+                self.logger.warning(f"Exception loading docs-to-remove list from ray object storage {e}")
+                raise RuntimeError(f"exception loading from object storage for key {docs2remove}")
+        super().__init__(config)
+
+
+class DataCleaningRuntime(DefaultRayTransformRuntime):
+    """
+    Data cleaning runtime support
+    """
+
+    def __init__(self, params: dict[str, Any]):
+        """
+        Create data cleaning runtime
+        :param params: runtime parameters, including the location of the
+            duplicate document list to load into the Ray object store
+        """
+        super().__init__(params)
+        from data_processing.utils import get_logger
+
+        self.logger = get_logger(__name__)
+
+    def get_transform_config(
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        statistics: ActorHandle,
+        files: list[str],
+    ) -> dict[str, Any]:
+        """
+        Set environment for transform execution
+        :param data_access_factory - data access factory
+        :param statistics - reference to the statistics object
+        :param files - list of files to process
+        :return: dictionary of transform init params
+        """
+        docs_to_remove = get_docs_to_remove(self.params)
+        docs_to_remove_list = ray.put(docs_to_remove)
+        return {docs2remove_list_key: docs_to_remove_list} | self.params
+
+
+class DataCleaningRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for the data cleaning step
+    as required by the RayTransformLauncher.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(
+            transform_config=DataCleaningTransformConfiguration(transform_class=DataCleaningRayTransform),
+            runtime_class=DataCleaningRuntime,
+        )
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration())
+    logger.info("Launching transform")
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py b/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py
deleted file mode 100644
index 285fcfa22..000000000
--- a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an “AS IS” BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import sys - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) -# create parameters -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} - -s3_conf = { - "input_folder": "test/fdedup/input", - "output_folder": "test/fdedup/output", -} -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -params = { - # where to run - "run_locally": True, - # Data access. Only required parameters are specified - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 5, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 2, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, -} -sys.argv = ParamsUtils.dict_to_req(d=params) - - -# launch -launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_support.py b/transforms/universal/fdedup/ray/src/fdedup_support.py deleted file mode 100644 index 60afb84bf..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_support.py +++ /dev/null @@ -1,621 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
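
The file removed below, fdedup_support.py, carried the MinHash-LSH machinery for fuzzy dedup. Its fuzzy_optimal_param() picks the (bands, rows) split whose weighted false-positive/false-negative area around the threshold is smallest. As a reference point for this deletion, here is a minimal standalone sketch of the S-curve being optimized; it assumes scipy is installed, the b=8, r=8 split of 64 permutations is an illustrative choice, while the 0.8 threshold and the 0.5/0.5 weights match the defaults used in this transform:

# Minimal sketch of the LSH S-curve that fuzzy_optimal_param() optimizes.
# For Jaccard similarity s, b bands of r rows each:
#     P(candidate pair) = 1 - (1 - s**r)**b
from scipy.integrate import quad


def candidate_probability(s: float, b: int, r: int) -> float:
    return 1.0 - (1.0 - s ** float(r)) ** float(b)


threshold, b, r = 0.8, 8, 8  # illustrative split of 64 permutations
# Area of the curve below the threshold is the false-positive mass;
# area of its complement above the threshold is the false-negative mass.
fp, _ = quad(lambda s: candidate_probability(s, b, r), 0.0, threshold)
fn, _ = quad(lambda s: 1.0 - candidate_probability(s, b, r), threshold, 1.0)
print(f"fp area={fp:.4f}, fn area={fn:.4f}, weighted error={0.5 * fp + 0.5 * fn:.4f}")
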
-################################################################################ - -import pickle -import time -from typing import Any, Iterator, Union - -import numpy as np -import ray -from data_processing.data_access import SnapshotUtils -from data_processing.utils import GB, RANDOM_SEED, TransformUtils, get_logger -from data_processing_ray.runtime.ray import RayUtils -from ray.actor import ActorHandle -from ray.util import ActorPool -from scipy.integrate import quad as integrate - - -NO_SIMILARITY = -1 -REQUEST_LEN = 4096 -LONG_BUCKET = 5000 -LONG_BUCKET_PRINT = 1000 - - -def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, -) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt - - -class MurmurMH: - def __init__(self, num_perm: int, seed: int = RANDOM_SEED): - self.seed = seed - self.num_perm = num_perm - self.permutations = self._init_permutations(seed, num_perm) - - def minhash(self, shingle_count: int, shingles: Iterator[str]) -> np.array: - def generator(): - for shingle in shingles: - yield TransformUtils.str_to_int(shingle) - - hash_values = np.fromiter(generator(), dtype=np.uint64, count=shingle_count) - - result = np.zeros(self.permutations.shape, dtype=np.uint32) - for i, perm in enumerate(self.permutations): - result[i] = np.right_shift((perm * hash_values).T, 32).astype(np.uint32).min(axis=0, keepdims=False) - return result - - @staticmethod - def _init_permutations(seed: int, num_perm: int) -> np.array: - # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic - max_int = np.uint64((1 << 64) - 1) - gen = np.random.RandomState(seed) - # get self.num_perm pseudo random numbers between 2 and max_int (excl) - permutations = np.array([gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], dtype=np.uint64).T - # make all even pseudo random numbers odd by adding 1 - permutations[permutations % 2 == 0] += 1 - return permutations - - @staticmethod - def jaccard(mh1: np.array, mh2: np.array) -> float: - return np.count_nonzero(mh1 == mh2) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocCollector: - """ - An actor 
collecting de duped document IDs - """ - - def __init__(self, params: dict[str, Any]): - """ - Initializer - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - self.removed = set() - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.ids = {} - else: - try: - bids, _ = self.data_access.get_file(snapshot) - self.ids = pickle.loads(bids) - except Exception as e: - self.logger.warning(f"Failed to load doc collector {self.actor_id} with exception {e}") - raise e - - def add_documents(self, dr: tuple[list[tuple[int, int]], list[int]]) -> None: - """ - Add documents and removed document - :param dr: documents to keep and documents to remove - :return: - """ - docs = dr[0] - rm = dr[1] - # process documents to remove - for did in rm: - self.ids.pop(did, None) - self.removed.update(rm) - # process documents to keep - for key, val in docs: - if key in self.removed: - continue - if key in self.ids and val == NO_SIMILARITY: - # Do not update existing docs with NO_SIMILARITY - continue - else: - self.ids[key] = val - - def filter(self, docs: list[int]) -> dict[int, int]: - """ - Filter documents - :param docs: documents to filter - :return: documents to keep - """ - result = {} - for doc_id in docs: - r = self.ids.get(doc_id, None) - if r is not None: - result[doc_id] = r - return result - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.ids) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}docs/doc_collector_{self.actor_id}", b_doc - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot doc collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float, int, float]: - """ - get sizes - :return: number of ids, its memory utilization, number of removed, its memory utilization - """ - return ( - len(self.ids), - TransformUtils.deep_get_size(self.ids) / GB, - len(self.removed), - TransformUtils.deep_get_size(self.removed) / GB, - ) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocsMinHash: - """ - An actor storing min hashes for a doc id - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialize - :param params: parameters - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.docs = {} - else: - try: - bdocs, _ = self.data_access.get_file(snapshot) - self.docs = pickle.loads(bdocs) - except Exception as e: - self.logger.warning(f"Failed to load minhash collector {self.actor_id} with exception {e}") - raise e - - def add_minhashes(self, updates: list[tuple[int, int, np.array]]) -> None: - """ - Add minhashes - :param updates: minhash for doc_id a tuple of doc len and array of hashes - :return: None - """ - for doc_id, length, minhash in updates: - self.docs[doc_id] = np.concatenate(([length], minhash)) - - def get_minhashes(self, doc_ids: list[int]) -> list[tuple[int, int, np.array]]: - """ - Get minhashes for a list of documents - :param doc_ids: list of doc ids - :return: doc id, len, minhashes - """ - result = [] - for doc_id in doc_ids: - info = self.docs.get(doc_id) - if info is not None: - result.append((doc_id, info[0], info[1:])) - return result - - def snapshot(self) -> None: - 
""" - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.docs) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}minhash/minhash_collector_{self.actor_id}", - b_doc, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot minhash collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get size of used min hashes - :return: number of docs, its memory utilization - """ - return len(self.docs), TransformUtils.deep_get_size(self.docs) / GB - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHash: - """ - Actor storing buckets information - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialization - """ - from ray.util.metrics import Counter - - self.submitter = None - self.n_buckets = 0 - self.bucket_memory = 0 - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.buckets = {} - else: - try: - b_buckets, _ = self.data_access.get_file(snapshot) - self.buckets = pickle.loads(b_buckets) - except Exception as e: - self.logger.warning(f"Failed to load buckets collector {self.actor_id} with exception {e}") - raise e - self.bucket_created_counter = Counter("bucket_created", "Amount of buckets created") - self.long_bucket_submit_counter = Counter("long_bucket_submitted", "Amount of long buckets submitted") - self.short_bucket_submit_counter = Counter("short_bucket_submitted", "Amount of short buckets submitted") - - def add_buckets(self, bck: list[tuple[int, list[int]]]) -> None: - """ - Add additional buckets to hash - :param bck: bucket information - :return: None - """ - for bucket in bck: - b_hash = bucket[0] - buckets_for_hash = self.buckets.get(b_hash) - if buckets_for_hash: - if type(buckets_for_hash) == int: - self.buckets[b_hash] = [buckets_for_hash] + bucket[1] - else: - buckets_for_hash.extend(bucket[1]) - else: - if len(bucket[1]) == 1: - self.buckets[b_hash] = bucket[1][0] - else: - self.buckets[b_hash] = bucket[1] - self.bucket_created_counter.inc(1) - - def add_processing_submitter(self, submitter: ActorHandle) -> None: - """ - Add process submitter - :param submitter: reference to submitter - :return: - """ - self.submitter = submitter - - def process_buckets(self) -> None: - """ - Process buckets to generate documents - :return: None - """ - - # Remember usage - self.n_buckets = len(self.buckets) - self.bucket_memory = TransformUtils.deep_get_size(self.buckets) / GB - - # split buckets into short and long. 
Long buckets can take very long to process
-        long_buckets = []
-        short_buckets = []
-        while len(self.buckets) > 0:
-            doc_id, bucket = self.buckets.popitem()
-            if type(bucket) == list and len(bucket) > LONG_BUCKET:
-                # It's long
-                long_buckets.append(bucket)
-            else:
-                short_buckets.append(bucket)
-        self.logger.info(f"processing buckets {len(long_buckets)} long, {len(short_buckets)} short")
-
-        # process long buckets first - we are submitting them one at a time
-        for bucket in long_buckets:
-            if len(bucket) > 2 * LONG_BUCKET:
-                # For very long buckets, split them
-                self.logger.info(f"Splitting bucket of length {len(bucket)} into chunks")
-                smaller_bucket = [
-                    bucket[i * LONG_BUCKET : (i + 1) * LONG_BUCKET]
-                    for i in range((len(bucket) + LONG_BUCKET - 1) // LONG_BUCKET)
-                ]
-                for b in smaller_bucket:
-                    ray.get(self.submitter.submit_for_processing.remote([b]))
-                    self.long_bucket_submit_counter.inc(1)
-            else:
-                ray.get(self.submitter.submit_for_processing.remote([bucket]))
-                self.long_bucket_submit_counter.inc(1)
-        self.logger.info("Done submitting long buckets")
-
-        # And now the rest of buckets
-        bucket_chunks = [short_buckets[i * 100 : (i + 1) * 100] for i in range((len(short_buckets) + 99) // 100)]
-        for b in bucket_chunks:
-            ray.get(self.submitter.submit_for_processing.remote(b))
-            self.short_bucket_submit_counter.inc(len(b))
-
-    def snapshot(self) -> None:
-        """
-        Snapshotting itself
-        """
-        try:
-            b_buckets = pickle.dumps(self.buckets)
-            self.data_access.save_file(
-                f"{SnapshotUtils.get_snapshot_folder(self.data_access)}buckets/buckets_collector_{self.actor_id}",
-                b_buckets,
-            )
-        except Exception as e:
-            self.logger.warning(f"Failed to snapshot buckets collector {self.actor_id} with exception {e}")
-            raise e
-
-    def get_size(self) -> tuple[int, float]:
-        """
-        Get buckets resource utilization
-        :return: number of buckets and memory utilization
-        """
-        return self.n_buckets, self.bucket_memory
-
-
-@ray.remote(scheduling_strategy="SPREAD")
-class BucketsHashProcessor:
-    """
-    Actor for processing buckets
-    """
-
-    def __init__(self, params: dict[str, Any]):
-        """
-        Init method
-        :param params - dictionary of parameters containing the following keys
-            remote_docs - handles to the remote docs
-            remote_minhashes - handles to the remote minhashes
-            mn_min_hash - MurmurMH class
-            threshold - threshold
-            statistics - statistics actor
-        """
-        from ray.util.metrics import Counter
-
-        self.threshold = params["threshold"]
-        self.mn_min_hash = params["mn_min_hash"]
-        self.remote_docs = params["remote_docs"]
-        self.remote_minhashes = params["remote_minhashes"]
-        self.stats = params["statistics"]
-        self.logger = get_logger(__name__)
-        self.bucket_processed_counter = Counter("bucket_processed", "Amount of buckets processed")
-
-    def _submit_generated_docs(self, docs: dict[int, int], removed: set[int]) -> None:
-        """
-        Submit generated documents
-        :param docs: docs to submit
-        :param removed: removed documents
-        :return: None
-        """
-        # Remove doc ids that are already removed
-        for did in removed:
-            docs.pop(did, None)
-        # Build remote requests
-        request = [([], []) for _ in range(len(self.remote_docs))]
-        for key, value in docs.items():
-            req_tuple = request[key % len(self.remote_docs)]
-            req_tuple[0].append((key, value))
-        for did in removed:
-            req_tuple = request[did % len(self.remote_docs)]
-            req_tuple[1].append(did)
-        # Submit requests and wait for replies
-        remote_replies = []
-        i = 0
-        for req in request:
-            if len(req[0]) > 0 or len(req[1]) > 0:  # Only submit if the request has data
remote_replies.append(self.remote_docs[i].add_documents.remote(req)) - i += 1 - # Process replies - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - # get minhashes and length for docs in the bucket - def _get_minhashes_docs(self, doc_ids: list[int]) -> dict[int, tuple[int, list[int]]]: - """ - Get minhashes for documents by submitting requests to an appropriate doc collectors - :param doc_ids: doc ids - :return: doc ids with hashes - """ - request = [[] for _ in range(len(self.remote_minhashes))] - for value in doc_ids: - request[value % len(self.remote_minhashes)].append(value) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.remote_minhashes[i].get_minhashes.remote(req)) - i += 1 - # Process replies - hashes = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - for r in reply: - hashes[r[0]] = (r[1], r[2]) - remote_replies = not_ready - return hashes - - def process_buckets(self, buckets: list[Union[int, list[int]]]) -> None: - """ - process buckets to generate documents - :param buckets: buckets - :return: none - """ - t_start = time.time() - docs = {} - removed = set() - for bucket in buckets: - if type(bucket) == int: - # This hash has a single document - if bucket not in docs: - docs[bucket] = NO_SIMILARITY - self.bucket_processed_counter.inc(1) - continue - # multiple documents - start = time.time() - bucket_len = len(bucket) - very_long = bucket_len > LONG_BUCKET - - hashes = self._get_minhashes_docs(bucket) - set_list = [] - unvisited = set(bucket) - - # combine similar documents - index = 0 - while len(unvisited) > 0: - current_doc_id = unvisited.pop() - current_mh = hashes[current_doc_id][1] - current_set = set() - for other_doc_id in bucket: - if other_doc_id in unvisited: - other_mh = hashes[other_doc_id][1] - if self.mn_min_hash.jaccard(current_mh, other_mh) >= self.threshold: - current_set.add(current_doc_id) - current_set.add(other_doc_id) - unvisited.discard(other_doc_id) - if len(current_set) > 0: - set_list.append(current_set) - index += 1 - if index % LONG_BUCKET_PRINT == 0: - self.logger.info(f"processing very long {bucket_len} bucket, {index} documents so far") - if index > LONG_BUCKET_PRINT: - self.logger.info(f"done processing very long {bucket_len}") - - # process created sets - for current_set in set_list: - for d in current_set: - bucket.remove(d) - removed.update(current_set) - for i, doc_id in enumerate(current_set): - if i == 0: - cluster_id = doc_id - remaining = doc_id - min_len = hashes[doc_id][0] - max_len = min_len - continue - c_len = hashes[doc_id][0] - if c_len > max_len: - max_len = c_len - remaining = doc_id - continue - if c_len <= min_len: - min_len = c_len - cluster_id = doc_id - docs[remaining] = cluster_id - removed.discard(remaining) - - # if we did not find docs in connections, submit them as NO_SIMILARITY - for d in bucket: - if d not in docs: - docs[d] = NO_SIMILARITY - if very_long: - self.logger.info( - f"Processed long ({bucket_len}) bucket in {round((time.time() - start) / 60.,3)} " - f"min; " - f"docs chains {len(set_list)}" - ) - self.bucket_processed_counter.inc(1) - # Submit docs - self._submit_generated_docs(docs, removed) - # peg stats - self.stats.add_stats.remote({"generated doc_ids": len(docs), "bucket processing time": time.time() - t_start}) - - -@ray.remote(scheduling_strategy="SPREAD") -class 
BucketsHashProcessorInvoker(object): - """ - Bucket hash processing coordinator (singleton) - """ - - def __init__(self, bucket_processors: list[ActorHandle]) -> None: - self.n_processors = len(bucket_processors) - self.pool = ActorPool(bucket_processors) - self.submitted = 0 - self.processed = 0 - self.logger = get_logger(__name__) - self.start = time.time() - - def submit_for_processing(self, buckets: list[Union[int, list[int]]]) -> None: - # Get completed results - if self.submitted < self.n_processors: # still have room - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.logger.debug("Submitted bucket processing request") - self.submitted += 1 - return - else: - while True: - # we can have several workers fail here - try: - self.pool.get_next_unordered() - break - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") - self.logger.debug("Completed bucket processing request") - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.submitted += 1 - self.logger.debug("Submitted bucket processing request") - return - - def wait_for_completion(self) -> None: - self.logger.info(f"Waiting bucket processing completion. Submitted requests {self.submitted}") - while self.pool.has_next(): - try: - self.pool.get_next_unordered() - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py deleted file mode 100644 index 6c6c02bb3..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py +++ /dev/null @@ -1,803 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
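
The BucketsHashProcessorInvoker just removed throttles work through ray.util.ActorPool: it submits freely until every processor is busy, then blocks on get_next_unordered() before each further submit, and finally drains the pool. A minimal sketch of that backpressure pattern follows; the Worker actor and the pool size of 2 are illustrative assumptions, not code from this repo:

# Sketch of the ActorPool backpressure pattern used by BucketsHashProcessorInvoker.
import ray
from ray.util import ActorPool


@ray.remote
class Worker:
    def process_buckets(self, buckets: list) -> int:
        return len(buckets)  # stand-in for real bucket processing


ray.init()
workers = [Worker.remote() for _ in range(2)]
pool = ActorPool(workers)
submitted = 0
for work in ([1, 2], [3], [4, 5, 6], [7]):
    if submitted >= len(workers):
        pool.get_next_unordered()  # block until a worker frees up
    pool.submit(lambda actor, v: actor.process_buckets.remote(v), work)
    submitted += 1
while pool.has_next():  # drain outstanding results
    print(pool.get_next_unordered())
ray.shutdown()
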
-################################################################################ - -import random -import time -from argparse import ArgumentParser, Namespace -from typing import Any - -import mmh3 -import numpy as np -import pyarrow as pa -import ray -from data_processing.data_access import DataAccessFactoryBase, SnapshotUtils -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import ( - RANDOM_SEED, - CLIArgumentProvider, - TransformUtils, - str2bool, -) -from data_processing_ray.runtime.ray import ( - DefaultRayTransformRuntime, - RayTransformFileProcessor, - RayTransformLauncher, - RayUtils, -) -from data_processing_ray.runtime.ray.runtime_configuration import ( - RayTransformRuntimeConfiguration, -) -from fdedup_support import ( - REQUEST_LEN, - BucketsHash, - BucketsHashProcessor, - BucketsHashProcessorInvoker, - DocCollector, - DocsMinHash, - MurmurMH, - fuzzy_optimal_param, -) -from ray.actor import ActorHandle -from ray.util import ActorPool - - -short_name = "fdedup" -cli_prefix = f"{short_name}_" - - -class FdedupTransform(AbstractTableTransform): - """ - Implements fuzzy dedup data preprocessor (building tables and minhashes). - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. - :param config: initialization parameters, with the following keys - doc_column - name of doc column - doc_id_int_column - name of int doc id column - word_shingle_size - word shingle size - mn_min_hash - MurmurMH class - num_bands - number of bands - length_band band length - remote_buckets - bucket actors - remote_minhashes - minhash actors - delimiter - delimiter - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.word_shingle_size = config.get("word_shingle_size", 1) - self.delimiter = config.get("delimiter", " ") - self.mn_min_hash = config.get("mn_min_hash", None) - self.num_bands = config.get("num_bands", 1) - self.length_band = config.get("length_band", 1) - self.buckets = config.get("remote_buckets", []) - self.minhashes = config.get("remote_minhashes", []) - self.random_delay_limit = config.get("random_delay_limit", 10) - - def _generate_minhashes(self, shingles: list[str]) -> np.array: - """ - Generate minhashes - :param shingles: - :return: generated minhashes - """ - min_hashes = self.mn_min_hash.minhash(len(shingles), shingles) - num_min_hashes = len(min_hashes) - assert self.num_bands * self.length_band <= num_min_hashes, ( - f"num_bans*band_len must be <= num min hashes, was num_bands={self.num_bands}, " - f"bands_len={self.length_band}, num_min hashes={num_min_hashes}" - ) - return min_hashes - - def _generate_buckets(self, min_hashes: np.array) -> list[int]: - """ - Generate buckets - :param min_hashes: array of minhashes - :return: - """ - return [ - mmh3.hash64(min_hashes[i * self.length_band : (i + 1) * self.length_band], seed=RANDOM_SEED, signed=False)[ - 0 - ] - for i in range(self.num_bands) - ] - - def _submit_buckets_minhashes( - self, buckets: dict[int, list[int]], minhashes: list[tuple[int, int, np.array]] - ) -> None: - """ - Submit buckets to hash - :param buckets: buckets - :param minhashes: minhashes - :return: None - """ - # bucket requests - request = [[] for _ in range(len(self.buckets))] - for key, value in buckets.items(): - request[key % len(self.buckets)].append((key, value)) - # Submit requests to 
appropriate bucket collectors - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.buckets[i].add_buckets.remote(req)) - i += 1 - # Minhashes - request = [[] for _ in range(len(self.minhashes))] - for minh in minhashes: - request[minh[0] % len(self.minhashes)].append(minh) - # Submit requests to appropriate minhash collectors - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.minhashes[i].add_minhashes.remote(req)) - i += 1 - # wait for completion - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - Preprocessing table content. - :param table: table - :param file_name - name of currently processed file - :return: resulting table, statistics - """ - from compute_shingles import compute_shingles - - def flush(limit: int) -> None: - """ - flushing buckets and minhashes to dedicated actors - :param limit: number of buckets to flush - :return: None - """ - if len(buckets) >= limit: # time to submit - nonlocal num_buckets - nonlocal num_minhashes - self._submit_buckets_minhashes(buckets, minhashes) - num_buckets = num_buckets + len(buckets) - num_minhashes = num_minhashes + len(minhashes) - buckets.clear() - minhashes.clear() - - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # Inner variables - buckets = {} - minhashes = [] - num_buckets = 0 - num_minhashes = 0 - docs = table[self.doc_column] - doc_ids = table[self.doc_id_column] - # for every document/its integer id - for n in range(table.num_rows): - doc = docs[n].as_py() - doc_id = doc_ids[n].as_py() - shingles = compute_shingles(txt=doc, word_shingle_size=self.word_shingle_size, delimiter=self.delimiter) - if len(shingles) > 0: - mh = self._generate_minhashes(shingles) - minhashes.append((doc_id, len(doc), mh)) - candidates = self._generate_buckets(mh) - - for b_hash in candidates: - bucket_array = buckets.get(b_hash) - if bucket_array is None: - buckets[b_hash] = [doc_id] - else: - bucket_array.append(doc_id) - flush(REQUEST_LEN) - flush(0) - # peg stats - stats = {"generated buckets": num_buckets, "generated minhashes": num_minhashes} - time.sleep(int(random.random() * self.random_delay_limit)) - return [], stats - - -class FdedupFilter(AbstractTableTransform): - """ - Filtering documents - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. - The dictionary should contain the following: - doc_column - name of doc column - doc_id_int_column - name of int doc id column - cluster_column - name of the cluster column - remote_docs - list of remote doc collectors - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.cluster_column = config.get("cluster_column", "") - self.docs = config.get("remote_docs", "") - self.random_delay_limit = config.get("random_delay_limit", 10) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - De duping (filtering) table content. 
- :param table: table - :param file_name: name of the currently processing file - :return: resulting table, statistics - """ - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # inner variables - ids = table.column(self.doc_id_column) - # Submit requests to an appropriate doc collectors - request = [[] for _ in range(len(self.docs))] - for value in ids: - doc_id = value.as_py() - request[doc_id % len(self.docs)].append(doc_id) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.docs[i].filter.remote(req)) - i += 1 - # Process replies - unique = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - unique.update(reply) - remote_replies = not_ready - # Filter out table - mask = [] - clusters = [] - # Actual filtering - for n in range(table.num_rows): - doc_id = ids[n].as_py() - if doc_id in unique: - mask.append(True) - clusters.append(unique.pop(doc_id)) - else: - mask.append(False) - # build out table - out_table = TransformUtils.add_column(table=table.filter(mask), name=self.cluster_column, content=clusters) - # build execution statistics - stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows} - time.sleep(int(random.random() * self.random_delay_limit)) - return [out_table], stats - - -class FdedupRuntime(DefaultRayTransformRuntime): - """ - Fuzzy dedup runtime support. Here we are using set environment to implement first two steps of fuzzy dedup - processing - preprocessing and bucket hash processing - """ - - def __init__(self, params: dict[str, Any]): - """ - Create filter runtime - :param params: parameters, that should include - doc_column - name of the document column - id_column - name of the integer doc id column - cluster_column - name of the cluster column - worker_options - start options for preprocessor - from the orchestrator configuration - bucket_cpu - number of cpus for bucket actor - doc_cpu - number of cpus for doc actor - mhash_cpu - number of cpus for minhash actor - num_doc_actors - number of document actors - num_bucket_actors - number of bucket actors - num_minhash_actors - number of minhash actors - num_preprocessors - number of preprocessors - snapshot_delay - delay (sec) in sending snapshot requests to actors - use_bucket_snapshot - use bucket snapshot - use_doc_snapshot - use doc snapshot - random_delay_limit - random_delay limit - # fuzzy specific parameters - num_permutations - number of permutations - threshold - threshold - world_shingle_size - word shingles size - delimiters - delimiter - """ - from data_processing.utils import get_logger - - super().__init__(params) - self.logger = get_logger(__name__) - self.sum_buckets = 0 - self.sum_buckets_mem = 0 - self.sum_mh = 0 - self.sum_mh_mem = 0 - self.document_collectors = [] - self.snapshot_delay = self.params.get("snapshot_delay", 1) - self.random_delay_limit = self.params.get("random_delay_limit", 10) - - def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> dict[str, Any]: - """ - Set environment for filter execution - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: dictionary of filter init params - """ - if self.params.get("use_doc_snapshot", 
False): - self.logger.info("continuing from the document actors snapshot") - data_access = data_access_factory.create_data_access() - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}docs" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.info(f"Found the following snapshot files {files.keys()}") - self.document_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - self.document_collectors[i] = DocCollector.options( - **{"num_cpus": self.params.get("doc_cpu", 0.5)} - ).remote({"id": i, "data_access": data_access_factory, "snapshot": file}) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(self.document_collectors)} document collectors to continue processing") - else: - self.logger.info("starting run from the beginning") - self._create_doc_actors(data_access_factory=data_access_factory, statistics=statistics, files=files) - return { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "cluster_column": self.params.get("cluster_column", ""), - "remote_docs": self.document_collectors, - "random_delay_limit": self.random_delay_limit, - } - - def _create_doc_actors( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: None - """ - mn_min_hash = MurmurMH(num_perm=self.params.get("num_permutations", 64), seed=RANDOM_SEED) - if self.params.get("use_bucket_snapshot", False): - self.logger.info("continuing from the bucket actors snapshot") - data_access = data_access_factory.create_data_access() - # recreate bucket collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}buckets" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following bucket snapshot files {files.keys()}") - bucket_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(bucket_collectors)} bucket collectors to continue processing") - # recreate minhash collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}minhash" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following minhash snapshot files {files.keys()}") - minhash_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - self.logger.info(f"Created {len(minhash_collectors)} minhash collectors to 
continue processing") - else: - self.logger.info("continuing from the very beginning") - self._create_doc_actors_internal( - data_access_factory=data_access_factory, statistics=statistics, mn_min_hash=mn_min_hash, files=files - ) - - def _create_doc_actors_internal( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - mn_min_hash: MurmurMH, - files: list[str], - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param mn_min_hash - MurmurMH class - :param files - list of files to process - :return: None - """ - # compute fuzzy dedup parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=self.params.get("threshold", 0.8), - num_perm=self.params.get("num_permutations", 64), - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - self.logger.info(f"Fuzzy: num buckets {num_buckets}, bucket length {length_bucket}") - # Build bucket and minhash collectors - bucket_collectors = [None] * self.params.get("num_bucket_actors", 1) - for i in range(self.params.get("num_bucket_actors", 1)): - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(bucket_collectors)} bucket actors") - minhash_collectors = [None] * self.params.get("num_minhash_actors", 1) - for i in range(self.params.get("num_minhash_actors", 1)): - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(minhash_collectors)} minhash actors") - self._preprocess_tables( - data_access_factory=data_access_factory, - statistics=statistics, - files=files, - mn_min_hash=mn_min_hash, - num_buckets=num_buckets, - length_bucket=length_bucket, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - random_delay_limit=self.random_delay_limit, - ) - # At this point we can snapshot both bucket and minhash collectors for potential restart - self.logger.info("creating minhash snapshots") - minhash_replies = [None] * len(minhash_collectors) - index = 0 - for collector in minhash_collectors: - minhash_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while minhash_replies: - ready, not_ready = ray.wait(minhash_replies) - minhash_replies = not_ready - self.logger.info("minhash snapshots created") - self.logger.info("creating bucket snapshots") - bucket_replies = [None] * len(bucket_collectors) - index = 0 - for collector in bucket_collectors: - bucket_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while bucket_replies: - ready, not_ready = ray.wait(bucket_replies) - bucket_replies = not_ready - self.logger.info("bucket snapshots created") - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - - def _process_buckets( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - mn_min_hash: MurmurMH, - ) -> None: - """ - Process buckets - :param data_access_factory - data access factory - :param statistics - statistics actor - :param bucket_collectors - bucket collectors - 
:param minhash_collectors - minhash collectors - :param mn_min_hash - MMurmurMH class - :return: None - """ - # Create document collectors - self.document_collectors = [None] * self.params.get("num_doc_actors", 1) - for i in range(self.params.get("num_doc_actors", 1)): - self.document_collectors[i] = DocCollector.options(**{"num_cpus": self.params.get("doc_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(self.document_collectors)} document actors") - # create bucket processors - bucket_processors_list = RayUtils.create_actors( - clazz=BucketsHashProcessor, - params={ - "remote_docs": self.document_collectors, - "remote_minhashes": minhash_collectors, - "mn_min_hash": mn_min_hash, - "threshold": self.params.get("threshold", 0.8) * self.params.get("num_permutations", 64), - "statistics": statistics, - }, - actor_options=self.params.get("worker_options", None), - n_actors=self.params.get("num_preprocessors", 1), - ) - self.logger.info(f"created {len(bucket_processors_list)} bucket processor actors") - # create bucket processors invoker - bucket_processor_invoker = BucketsHashProcessorInvoker.options( - num_cpus=self.params.get("bucket_cpu", 0.5) - ).remote(bucket_processors=bucket_processors_list) - self.logger.info(f"created bucket processor invoker") - # Add invoker to the buckets - bucket_replies = [ - collector.add_processing_submitter.remote(submitter=bucket_processor_invoker) - for collector in bucket_collectors - ] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - self.logger.info(f"added invoker to bucket collectors") - # start bucket processing and wait for completion - start = time.time() - bucket_replies = [collector.process_buckets.remote() for collector in bucket_collectors] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - # Wait for pool to complete - ray.get(bucket_processor_invoker.wait_for_completion.remote()) - self.logger.info(f"Done processing buckets in {round((time.time() - start) / 60.,3)} min") - # At this point we can save doc actors, in case we would want to restart here - self.logger.info(f"creating document snapshots") - doc_replies = [None] * len(self.document_collectors) - index = 0 - for collector in self.document_collectors: - doc_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while doc_replies: - ready, not_ready = ray.wait(doc_replies) - doc_replies = not_ready - self.logger.info(f"document snapshots created") - # At this point we do not need bucket and minhash actors, remove them - # but first get usage information - # Bucket collector - replies = [collector.get_size.remote() for collector in bucket_collectors] - while replies: - ready, not_ready = ray.wait(replies) - b_amount, b_memory = ray.get(ready)[0] - self.sum_buckets += b_amount - self.sum_buckets_mem += b_memory - replies = not_ready - for collector in bucket_collectors: - ray.kill(actor=collector, no_restart=True) - # minhash collector - replies = [collector.get_size.remote() for collector in minhash_collectors] - while replies: - ready, not_ready = ray.wait(replies) - m_amount, m_memory = ray.get(ready)[0] - self.sum_mh += m_amount - self.sum_mh_mem += m_memory - replies = not_ready - for collector in minhash_collectors: - ray.kill(actor=collector, no_restart=True) - # Clean up processors - for processor in bucket_processors_list: - ray.kill(actor=processor, no_restart=True) - ray.kill(bucket_processor_invoker) - - def 
_preprocess_tables( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - files: list[str], - mn_min_hash: MurmurMH, - num_buckets: int, - length_bucket: int, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - random_delay_limit: int, - ) -> None: - """ - Preprocess tables - build, run and cleanup - :param data_access_factory - data access factory - :param statistics - statistics actor - :param files - list of files to process - :param mn_min_hash - MurmurMH class - :param num_buckets - number of buckets - :param length_bucket - bucket length - :param bucket_collectors - bucket collector actors - :param minhash_collectors - minhash_collector actors - :param random_delay_limit - max for random dalay limit - :return: None - """ - from ray.util.metrics import Gauge - - worker_options = self.params.get("worker_options", None) - # Here we are limiting the number of readers not to overwhelm COS - n_readers = self.params.get("num_preprocessors", 1) - if n_readers > 1000: - n_readers = 1000 - self.logger.info(f"Table preprocessing uses {n_readers} readers") - # Create preprocessing actors - processor_params = { - "data_access_factory": data_access_factory, - "transform_class": FdedupTransform, - "statistics": statistics, - "transform_params": { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "word_shingle_size": self.params.get("world_shingle_size", 1), - "mn_min_hash": mn_min_hash, - "num_bands": num_buckets, - "length_band": length_bucket, - "remote_buckets": bucket_collectors, - "remote_minhashes": minhash_collectors, - "delimiter": self.params.get("delimiter", " "), - "random_delay_limit": random_delay_limit, - }, - "base_table_stats": False, - } - processors_list = RayUtils.create_actors( - clazz=RayTransformFileProcessor, - params=processor_params, - actor_options=worker_options, - n_actors=n_readers, - ) - self.logger.info(f"created {len(processors_list)} table processor actors") - # Execute preprocessing - # create gauges - files_in_progress_gauge = Gauge( - "preprocessing_files_in_progress", "Number of files in progress, preprocessing" - ) - files_completed_gauge = Gauge( - "preprocessing_files_processed_total", "Number of files completed, preprocessing" - ) - available_cpus_gauge = Gauge("preprocessing_available_cpus", "Number of available CPUs, preprocessing") - available_gpus_gauge = Gauge("preprocessing_available_gpus", "Number of available GPUs, preprocessing") - available_memory_gauge = Gauge("preprocessing_available_memory", "Available memory, preprocessing") - available_object_memory_gauge = Gauge( - "preprocessing_available_object_store", "Available object store, preprocessing" - ) - print_interval = int(len(files) / 100) - if print_interval == 0: - print_interval = 1 - # process data - processors = ActorPool(processors_list) - failures = RayUtils.process_files( - executors=processors, - files=files, - print_interval=print_interval, - files_in_progress_gauge=files_in_progress_gauge, - files_completed_gauge=files_completed_gauge, - available_cpus_gauge=available_cpus_gauge, - available_gpus_gauge=available_gpus_gauge, - available_memory_gauge=available_memory_gauge, - object_memory_gauge=available_object_memory_gauge, - logger=self.logger, - ) - if failures > 0: - statistics.add_stats.remote({"actor failures": failures}) - # Clean up processors - for processor in processors_list: - ray.kill(actor=processor, no_restart=True) - del processors - - def 
compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]: - """ - Compute execution statistics - :param stats: output of statistics - :return: job execution statistics - """ - # Get document collector statistics - sum_docs = 0 - sum_docs_mem = 0 - sum_removed = 0 - sum_removed_mem = 0 - replies = [collector.get_size.remote() for collector in self.document_collectors] - while replies: - ready, not_ready = ray.wait(replies) - d_amount, d_memory, r_amount, r_memory = ray.get(ready)[0] - sum_docs += d_amount - sum_docs_mem += d_memory - sum_removed += r_amount - sum_removed_mem += r_memory - replies = not_ready - overall_hash_memory = self.sum_buckets_mem + self.sum_mh_mem + sum_docs_mem + sum_docs_mem + sum_removed_mem - dedup_prst = 100 * (1.0 - stats.get("result_documents", 1) / stats.get("source_documents", 1)) - return { - "number of buckets": self.sum_buckets, - "number of docs": sum_docs, - "number of removed docs": sum_removed, - "number of min hashes": self.sum_mh, - "overall hash memory GB": overall_hash_memory, - "de duplication %": dedup_prst, - } | stats - - -class FdedupTableTransformConfiguration(TransformConfiguration): - """ - Provides support for configuring and using the associated Transform class include - configuration with CLI args and combining of metadata. - """ - - def __init__(self): - super().__init__( - name=short_name, - transform_class=FdedupFilter, - ) - from data_processing.utils import get_logger - - self.logger = get_logger(__name__) - - def add_input_params(self, parser: ArgumentParser) -> None: - """ - Add Transform-specific arguments to the given parser. - """ - parser.add_argument(f"--{cli_prefix}doc_column", type=str, default="contents", help="document column name") - parser.add_argument( - f"--{cli_prefix}id_column", type=str, default="int_document_id", help="integer document id column name" - ) - parser.add_argument(f"--{cli_prefix}cluster_column", type=str, default="cluster", help="cluster column name") - parser.add_argument( - f"--{cli_prefix}bucket_cpu", type=float, default=0.5, help="number of CPUs per bucket hash" - ) - parser.add_argument( - f"--{cli_prefix}mhash_cpu", type=float, default=0.5, help="number of CPUs per minhash hash" - ) - parser.add_argument(f"--{cli_prefix}doc_cpu", type=float, default=0.5, help="number of CPUs per doc hash") - parser.add_argument(f"--{cli_prefix}num_doc_actors", type=int, default=1, help="number of doc actors to use") - parser.add_argument( - f"--{cli_prefix}num_minhash_actors", type=int, default=1, help="number of minhash actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_bucket_actors", type=int, default=1, help="number of bucket actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_preprocessors", type=int, default=1, help="number of preprocessors to use" - ) - parser.add_argument(f"--{cli_prefix}num_permutations", type=int, default=64, help="number of permutations") - parser.add_argument(f"--{cli_prefix}threshold", type=float, default=0.8, help="threshold") - parser.add_argument(f"--{cli_prefix}shingles_size", type=int, default=5, help="number of words in shingle") - parser.add_argument( - f"--{cli_prefix}delimiters", type=str, default=" ", help="delimiter for splitting document" - ) - parser.add_argument(f"--{cli_prefix}snapshot_delay", type=int, default=1, help="snapshot delay time") - parser.add_argument( - f"--{cli_prefix}use_bucket_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with bucket snapshot", - ) - parser.add_argument( 
- f"--{cli_prefix}use_doc_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with doc snapshot", - ) - parser.add_argument( - f"--{cli_prefix}random_delay_limit", type=int, default=10, help="maximum delay between read" - ) - - def apply_input_params(self, args: Namespace) -> bool: - """ - Validate and apply the arguments that have been parsed - :param args: user defined arguments. - :return: True, if validate pass or False otherwise - """ - captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) - self.params = self.params | captured - self.params["worker_options"] = args.runtime_worker_options - if self.params["use_bucket_snapshot"] and self.params["use_doc_snapshot"]: - self.logger.warning("both bucket and doc snapshot are specified. Only one allowed") - return False - - self.logger.info(f"fuzzy dedup params are {self.params}") - return True - - -class FdedupRayTransformConfiguration(RayTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=FdedupTableTransformConfiguration(), runtime_class=FdedupRuntime) - - -if __name__ == "__main__": - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py new file mode 100644 index 000000000..64f492584 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -0,0 +1,54 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options),
+    "runtime_num_workers": 3,
+    "runtime_pipeline_id": "pipeline_id",
+    "runtime_job_id": "job_id",
+    "runtime_creation_delay": 0,
+    "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    # signature calculation parameters
+    "minhash_num_permutations": 112,
+    "minhash_num_bands": 14,
+    "minhash_num_segments": 2,
+}
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    # create launcher
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py
new file mode 100644
index 000000000..bc3c0d991
--- /dev/null
+++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py
@@ -0,0 +1,40 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.utils import CLIArgumentProvider, get_logger
+from data_processing_ray.runtime.ray.runtime_configuration import (
+    RayTransformRuntimeConfiguration,
+)
+from signature_calc_transform import SignatureCalculationTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class SignatureCalculationRayTransformConfiguration(RayTransformRuntimeConfiguration):
+    """
+    Implements the RayTransformRuntimeConfiguration for the signature calculation
+    transform, as required by the RayTransformLauncher. It does not use a dedicated
+    RayRuntime class, so the superclass only needs the base python-only configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=SignatureCalculationTransformConfiguration())
+
+
+if __name__ == "__main__":
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    logger.info("Launching transform")
+    launcher.launch()

From 0c31dc07a06942b3b6eb73cc29a62f512f4c7a00 Mon Sep 17 00:00:00 2001
From: nelson
Date: Fri, 11 Oct 2024 12:25:46 -0400
Subject: [PATCH 013/105] Fixed bug in the Ray runtime to distribute the
 docs-to-remove file to all workers

Signed-off-by: nelson
---
 .../python/src/data_cleaning_transform.py     |  4 +--
 .../ray/src/data_cleaning_transform_ray.py    | 26 ++++++++++---------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py
index f03b6c1d0..05b18cc8b 100644
--- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py
+++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py
@@ -110,10 +110,10 @@ class DataCleaningTransformConfiguration(TransformConfiguration):
     configuration with CLI args.
     """
 
-    def __init__(self):
+    def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningTransform):
         super().__init__(
             name=short_name,
-            transform_class=DataCleaningTransform,
+            transform_class=transform_class,
         )
         self.logger = get_logger(__name__, level="INFO")
 
diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py
index 9fdb220f7..831a6c9c2 100644
--- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py
+++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py
@@ -16,9 +16,8 @@
 from data_cleaning_transform import (
     DataCleaningTransform,
     DataCleaningTransformConfiguration,
-    docs2remove_list,
-    docs2remove_list_key,
-    get_docs_to_remove,
+    duplicate_list_location_default,
+    duplicate_list_location_key,
 )
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.utils import CLIArgumentProvider, get_logger
@@ -45,16 +44,15 @@ def __init__(self, config: dict):
         by the companion runtime, LangSelectorTransformRuntime. If running inside the RayMutatingDriver,
         these will be provided by that class with help from the RayMutatingDriver.
         """
-        docs2remove = config.get(docs2remove_list_key, None)
-        if docs2remove is not None:
+        docs2removedf = config.get("df", None)
+        if docs2removedf is not None:
             # This is recommended for production approach.
From e7260ba32d4d3dc1ab7a4e8d23fa302efdc8b18e Mon Sep 17 00:00:00 2001
From: blublinsky
Date: Thu, 10 Oct 2024 19:13:01 +0100
Subject: [PATCH 015/105] added folder_transform

---
 .../runtime/pure_python/transform_orchestrator.py             | 2 +-
 .../python/src/data_processing/transform/folder_transform.py  | 4 ++--
 .../data_processing_ray/runtime/ray/transform_orchestrator.py | 2 +-
 .../runtime/spark/transform_orchestrator.py                   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py
b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index 153eaaf0a..d51f80a8a 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 866e3286f..eca191bbb 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -41,10 +41,10 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str raise NotImplemented() @staticmethod - def get_folders(data_access:data_access) -> list(str): + def get_folders(d_access: data_access) -> list(str): """ Compute the list of folders to use. - :param data_access - data access class + :param d_access - data access class :return: """ raise NotImplemented() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index 8276eb56c..a8ff95729 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index c534b685f..4a0897952 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,7 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(data_access=data_access) + files = AbstractFolderTransform.get_folders(d_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process From 5856f3f54137ae225b8cbdf07add9eaf20ed38b2 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Thu, 10 Oct 2024 21:00:43 +0100 Subject: [PATCH 016/105] added folder_transform --- .../runtime/pure_python/transform_file_processor.py | 3 +-- .../runtime/pure_python/transform_orchestrator.py | 11 ++++++----- .../runtime/pure_python/transform_runtime.py | 10 +++++++++- .../data_processing/transform/folder_transform.py | 12 +----------- .../runtime/ray/transform_orchestrator.py | 2 +- .../runtime/ray/transform_runtime.py | 10 +++++++++- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git 
a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py index fa3e69e4a..44ccd0ef0 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_file_processor.py @@ -55,8 +55,7 @@ def __init__( # Create statistics self.stats = statistics - -def _publish_stats(self, stats: dict[str, Any]) -> None: + def _publish_stats(self, stats: dict[str, Any]) -> None: self.stats.add_stats(stats) diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py index d51f80a8a..812be8caf 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -24,14 +24,13 @@ PythonTransformFileProcessor, PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractBinaryTransform, TransformStatistics, AbstractFolderTransform +from data_processing.transform import AbstractTransform, TransformStatistics, AbstractFolderTransform from data_processing.utils import GB, get_logger logger = get_logger(__name__) -@staticmethod def _execution_resources() -> dict[str, Any]: """ Get Execution resource @@ -48,6 +47,7 @@ def _execution_resources() -> dict[str, Any]: "object_store": 0, } + def orchestrate( data_access_factory: DataAccessFactoryBase, runtime_config: PythonTransformRuntimeConfiguration, @@ -76,7 +76,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") else: # Get files to process @@ -145,7 +145,8 @@ def orchestrate( "job_input_params": input_params | data_access_factory.get_input_params() | execution_config.get_input_params(), - "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, + "execution_stats": _execution_resources() | + {"execution time, min": round((time.time() - start_time) / 60.0, 3)}, "job_output_stats": stats, } logger.debug(f"Saving job metadata: {metadata}.") @@ -209,7 +210,7 @@ def _process_transforms_multiprocessor( print_interval: int, data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any], - transform_class: type[AbstractBinaryTransform], + transform_class: type[AbstractTransform], is_folder: bool ) -> TransformStatistics: """ diff --git a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py index 4173154ae..478d40837 100644 --- a/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py +++ b/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, 
data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] ) -> dict[str, Any]: diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index eca191bbb..9a2fb3713 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -11,7 +11,6 @@ ################################################################################ from typing import Any -from data_processing.data_access import data_access from data_processing.transform import AbstractTransform @@ -38,13 +37,4 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str to metadata. Each element of the return list, is a tuple of the transformed bytes and a string holding the extension to be used when writing out the new bytes. """ - raise NotImplemented() - - @staticmethod - def get_folders(d_access: data_access) -> list(str): - """ - Compute the list of folders to use. - :param d_access - data access class - :return: - """ - raise NotImplemented() + raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index a8ff95729..b29682997 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -61,7 +61,7 @@ def orchestrate( try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: files, profile, retries = data_access.get_files_to_process() diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py index 57f071406..64479302c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py +++ b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from ray.actor import ActorHandle @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] ) -> dict[str, Any]: From 6519686320fb2e76d03d9079b2b59b24be42b6cd Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 08:48:00 +0100 Subject: [PATCH 017/105] added folder_transform --- .../runtime/spark/transform_orchestrator.py | 3 ++- .../runtime/spark/transform_runtime.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 
deletions(-) diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py index 4a0897952..096fab272 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_orchestrator.py @@ -151,7 +151,8 @@ def process_partition(iterator): try: if is_folder: # folder transform - files = AbstractFolderTransform.get_folders(d_access=data_access) + runtime = runtime_config.create_transform_runtime() + files = runtime.get_folders(data_access=data_access) logger.info(f"Number of folders is {len(files)}") # Get files to process else: # Get files to process diff --git a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py index 7b968b1e9..7410d09d1 100644 --- a/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py +++ b/data-processing-lib/spark/src/data_processing_spark/runtime/spark/transform_runtime.py @@ -12,7 +12,7 @@ from typing import Any -from data_processing.data_access import DataAccessFactoryBase +from data_processing.data_access import DataAccessFactoryBase, DataAccess from data_processing.transform import TransformStatistics @@ -28,6 +28,14 @@ def __init__(self, params: dict[str, Any]): """ self.params = params + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + raise NotImplemented() + def get_transform_config( self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics ) -> dict[str, Any]: From c728224a5e3396ebe5d71dddb1b23a7a4b64ae7c Mon Sep 17 00:00:00 2001 From: blublinsky Date: Fri, 11 Oct 2024 15:35:00 +0100 Subject: [PATCH 018/105] added noop testing --- .../runtime/transform_file_processor.py | 44 +++++--- .../test_support/transform/__init__.py | 13 ++- .../transform/noop_folder_transform.py | 105 ++++++++++++++++++ .../test_support/transform/noop_transform.py | 6 +- .../transform/folder_transform.py | 2 +- .../transform/transform_configuration.py | 6 +- .../transform/test_folders_noop.py | 33 ++++++ .../launch/ray/ray_test_noop_launch.py | 6 - .../ededup/ray/src/ededup_transform_ray.py | 9 +- 9 files changed, 187 insertions(+), 37 deletions(-) create mode 100644 data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py diff --git a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py index 1d268875f..4075f40be 100644 --- a/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py +++ b/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py @@ -83,6 +83,7 @@ def process_file(self, f_name: str) -> None: self.last_extension = name_extension[1] else: out_files, stats = self.transform.transform(folder_name=f_name) + self.last_file_name = f_name self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files") # save results self._submit_file(t_start=t_start, out_files=out_files, stats=stats) @@ 
-148,15 +149,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
                 )
         case 1:
             # we have exactly 1 output file
-            file_ext = out_files[0]
-            lfn = self.last_file_name
-            if self.last_file_name_next_index is not None:
-                lfn = f"{lfn}_{self.last_file_name_next_index}"
-            output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
+            if self.is_folder:
+                # it's a folder
+                output_name = out_files[0][1]
+                dt = out_files[0][0]
+            else:
+                file_ext = out_files[0]
+                lfn = self.last_file_name
+                if self.last_file_name_next_index is not None:
+                    lfn = f"{lfn}_{self.last_file_name_next_index}"
+                output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
+                dt = file_ext[0]
             self.logger.debug(
                 f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
             )
-            save_res, retries = self.data_access.save_file(path=output_name, data=file_ext[0])
+            save_res, retries = self.data_access.save_file(path=output_name, data=dt)
             if retries > 0:
                 self._publish_stats({"data access retries": retries})
             if save_res is None:
@@ -166,7 +173,7 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
             self._publish_stats(
                 {
                     "result_files": 1,
-                    "result_size": len(file_ext[0]),
+                    "result_size": len(dt),
                     "processing_time": time.time() - t_start,
                 }
             )
@@ -183,14 +190,21 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
             start_index = 0
             count = len(out_files)
             for index in range(count):
-                file_ext = out_files[index]
-                output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
-                file_sizes += len(file_ext[0])
-                self.logger.debug(
-                    f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
-                    f"of {count} to {output_name_indexed}"
-                )
-                save_res, retries = self.data_access.save_file(path=output_name_indexed, data=file_ext[0])
+                if self.is_folder:
+                    # it's a folder
+                    output_name_indexed = out_files[index][1]
+                    dt = out_files[index][0]
+                else:
+                    # files
+                    file_ext = out_files[index]
+                    output_name_indexed = f"{output_file_name}_{start_index + index}{file_ext[1]}"
+                    self.logger.debug(
+                        f"Writing transformed file {self.last_file_name}{self.last_extension}, {index + 1} "
+                        f"of {count} to {output_name_indexed}"
+                    )
+                    dt = file_ext[0]
+                file_sizes += len(dt)
+                save_res, retries = self.data_access.save_file(path=output_name_indexed, data=dt)
                 if retries > 0:
                     self._publish_stats({"data access retries": retries})
                 if save_res is None:
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
index 0e90f7ffd..04d6f3b0f 100644
--- a/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/__init__.py
@@ -1,6 +1,11 @@
-from .table_transform_test import AbstractTableTransformTest
-from .binary_transform_test import AbstractBinaryTransformTest
-from .noop_transform import (
+from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest
+from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
+from data_processing.test_support.transform.noop_transform import (
     NOOPTransform,
-    NOOPPythonTransformConfiguration,
+    NOOPTransformConfiguration,
+    NOOPPythonTransformConfiguration
 )
+from data_processing.test_support.transform.noop_folder_transform import (
+    NOOPFolderTransform,
+    NOOPFolderPythonTransformConfiguration
+)
\ No newline at end of file
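The hunk above encodes the output contract that separates the two transform flavors: a binary transform returns (bytes, extension) pairs and lets the processor derive the output path from the input file name, while a folder transform returns (bytes, full output path) pairs and names its outputs itself. A minimal sketch of a conforming folder transform, assuming only the AbstractFolderTransform interface and the DataAccess calls used by the NOOP folder transform below; the ChecksumFolderTransform name and its sha256 logic are illustrative, not part of the library:

from typing import Any
import hashlib

from data_processing.transform import AbstractFolderTransform


class ChecksumFolderTransform(AbstractFolderTransform):
    """Illustrative folder transform: emits one checksum file per input folder."""

    def __init__(self, config: dict[str, Any]):
        super().__init__(config)
        # the file processor is expected to inject a DataAccess instance into the config
        self.data_access = config.get("data_access")

    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
        # read every file in the folder and fold its bytes into a single digest
        files, retries = self.data_access.get_folder_files(path=folder_name)
        digest = hashlib.sha256()
        for name in sorted(files.keys()):
            digest.update(files[name])
        # note: the second tuple element is a complete output location, not an extension
        out_path = self.data_access.get_output_location(f"{folder_name}/checksum.txt")
        return [(digest.hexdigest().encode(), out_path)], {"folders": 1, "files": len(files)}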
diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py
new file mode 100644
index 000000000..5baab7858
--- /dev/null
+++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_folder_transform.py
@@ -0,0 +1,105 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import time
+from typing import Any
+
+from data_processing.data_access import DataAccess
+from data_processing.runtime.pure_python import (
+    PythonTransformLauncher,
+    PythonTransformRuntimeConfiguration,
+    DefaultPythonTransformRuntime)
+from data_processing.transform import AbstractFolderTransform
+from data_processing.utils import get_logger
+from data_processing.test_support.transform import NOOPTransformConfiguration
+
+
+logger = get_logger(__name__)
+
+
+class NOOPFolderTransform(AbstractFolderTransform):
+    """
+    Implements a simple copy of the files in a folder.
+    """
+
+    def __init__(self, config: dict[str, Any]):
+        """
+        Initialize based on the dictionary of configuration information.
+        This is generally called with configuration parsed from the CLI arguments defined
+        by the companion runtime, NOOPTransformRuntime.  If running inside the RayMutatingDriver,
+        these will be provided by that class with help from the RayMutatingDriver.
+        """
+        # Make sure that the param name corresponds to the name used in apply_input_params method
+        # of NOOPTransformConfiguration class
+        super().__init__(config)
+        self.sleep = config.get("sleep_sec", 1)
+        self.data_access = config.get("data_access")
+
+    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
+        """
+        Converts input folder into 0 or more output files.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed.
+        :param folder_name: the name of the folder containing an arbitrary number of files.
+        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
+                 to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
+                 holding the file name to use.
+        """
+        logger.debug(f"Transforming one folder {folder_name}")
+        metadata = {}
+        # get folder files
+        files, retries = self.data_access.get_folder_files(path=folder_name)
+        if retries > 0:
+            metadata |= {"data access retries": retries}
+        result = [()] * len(files)
+        index = 0
+        for name, file in files.items():
+            result[index] = (file, self.data_access.get_output_location(name))
+            if self.sleep is not None:
+                logger.info(f"Sleep for {self.sleep} seconds")
+                time.sleep(self.sleep)
+                logger.info("Sleep completed - continue")
+            index += 1
+        # Add some sample metadata.
+ metadata |= {"nfiles": len(files)} + return result, metadata + + +class NOOPFolderPythonRuntime(DefaultPythonTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py index 0dee013a4..2fea35506 100644 --- a/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/python/src/data_processing/test_support/transform/noop_transform.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python.runtime_configuration import ( PythonTransformRuntimeConfiguration, ) -from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform from data_processing.utils import CLIArgumentProvider, get_logger @@ -75,10 +75,10 @@ class NOOPTransformConfiguration(TransformConfiguration): configuration with CLI args. """ - def __init__(self): + def __init__(self, clazz: type[AbstractTransform] = NOOPTransform): super().__init__( name=short_name, - transform_class=NOOPTransform, + transform_class=clazz, remove_from_metadata=[pwd_key], ) diff --git a/data-processing-lib/python/src/data_processing/transform/folder_transform.py b/data-processing-lib/python/src/data_processing/transform/folder_transform.py index 9a2fb3713..caa3bfa52 100644 --- a/data-processing-lib/python/src/data_processing/transform/folder_transform.py +++ b/data-processing-lib/python/src/data_processing/transform/folder_transform.py @@ -35,6 +35,6 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str :param folder_name: the name of the folder containing arbitrary amount of files. :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated to metadata. Each element of the return list, is a tuple of the transformed bytes and a string - holding the extension to be used when writing out the new bytes. + holding the file name to use. 
""" raise NotImplemented() \ No newline at end of file diff --git a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py index 033e92f2a..a5c9ec9ad 100644 --- a/data-processing-lib/python/src/data_processing/transform/transform_configuration.py +++ b/data-processing-lib/python/src/data_processing/transform/transform_configuration.py @@ -13,7 +13,7 @@ from argparse import ArgumentParser from typing import Any -from data_processing.transform import AbstractBinaryTransform +from data_processing.transform import AbstractTransform from data_processing.utils import CLIArgumentProvider @@ -23,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider): """ def __init__( - self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = [] + self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = [] ): """ Initialization @@ -36,7 +36,7 @@ def __init__( self.remove_from_metadata = remove_from_metadata self.params = {} - def get_transform_class(self) -> type[AbstractBinaryTransform]: + def get_transform_class(self) -> type[AbstractTransform]: """ Get the class extending AbstractBinaryTransform which implements a specific transformation. The class will generally be instantiated with a dictionary of configuration produced by diff --git a/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py new file mode 100644 index 000000000..e0fdd86c8 --- /dev/null +++ b/data-processing-lib/python/test/data_processing_tests/transform/test_folders_noop.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.transform import NOOPFolderPythonTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../test-data/data_processing/python/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")] + return fixtures diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py index d4cc874f0..e706a4dfa 100644 --- a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_launch.py @@ -12,7 +12,6 @@ import os -import pyarrow as pa from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -20,11 +19,6 @@ from data_processing_ray.test_support.transform import NOOPRayTransformConfiguration -table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])}) -expected_table = table # We're a noop after all. -expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result - - class TestRayNOOPTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. diff --git a/transforms/universal/ededup/ray/src/ededup_transform_ray.py b/transforms/universal/ededup/ray/src/ededup_transform_ray.py index c0823a22e..d90dfa780 100644 --- a/transforms/universal/ededup/ray/src/ededup_transform_ray.py +++ b/transforms/universal/ededup/ray/src/ededup_transform_ray.py @@ -149,13 +149,12 @@ def _load_snapshots(self, data_access_factory: DataAccessFactoryBase, statistics statistics.add_stats.remote({"data access retries": retries}) self.logger.info(f"Found the following snapshot files {files.keys()}") # process snapshot files - for file in files.keys(): - # load the file + for file in files.values(): + # convert the file try: - b_hashes, _ = data_access.get_file(file) - snaps = pickle.loads(b_hashes) + snaps = pickle.loads(file) except Exception as e: - self.logger.warning(f"Failed to load hashes from file {file} with exception {e}") + self.logger.warning(f"Failed to load hashes with exception {e}") raise UnrecoverableException("failed to load hashes") request = [[] for _ in range(len(self.filters))] for h in snaps: From 6e2863a319716c513aa5f1bafa00a363089d2685 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 08:53:49 +0100 Subject: [PATCH 019/105] added noop Ray testing --- .../runtime/ray/transform_orchestrator.py | 6 +- .../test_support/transform/__init__.py | 1 + .../transform/noop_folder_transform.py | 57 +++++++++++++++++++ .../test_support/transform/noop_transform.py | 4 +- .../launch/ray/ray_test_noop_folder_launch.py | 33 +++++++++++ 5 files changed, 95 insertions(+), 6 deletions(-) create mode 100644 data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py diff --git a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py index b29682997..da39cbcf7 100644 --- a/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py +++ 
b/data-processing-lib/ray/src/data_processing_ray/runtime/ray/transform_orchestrator.py @@ -68,6 +68,9 @@ def orchestrate( if len(files) == 0: logger.error("No input files to process - exiting") return 0 + # log retries + if retries > 0: + statistics.add_stats.remote({"data access retries": retries}) logger.info(f"Number of files is {len(files)}, source profile {profile}") # Print interval print_interval = int(len(files) / 100) @@ -79,9 +82,6 @@ def orchestrate( logger.info( f"Number of workers - {preprocessing_params.n_workers} " f"with {preprocessing_params.worker_options} each" ) - # log retries - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) # create executors processor_params = { "data_access_factory": data_access_factory, diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py index a6cd700f7..dd095c961 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/__init__.py @@ -1 +1,2 @@ from data_processing_ray.test_support.transform.noop_transform import NOOPRayTransformConfiguration +from data_processing_ray.test_support.transform.noop_folder_transform import NOOPFolderRayTransformConfiguration diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py new file mode 100644 index 000000000..9919600c4 --- /dev/null +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py @@ -0,0 +1,57 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + + +from data_processing.test_support.transform import NOOPTransformConfiguration +from data_processing.test_support.transform import NOOPFolderTransform +from data_processing.utils import get_logger +from data_processing_ray.runtime.ray import ( + RayTransformLauncher, + RayTransformRuntimeConfiguration, + DefaultRayTransformRuntime +) +from data_processing.data_access import DataAccess + + +logger = get_logger(__name__) + + +class NOOPFolderPythonRuntime(DefaultRayTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. 
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderPythonRuntime) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(NOOPFolderRayTransformConfiguration()) + logger.info("Launching noop transform") + launcher.launch() diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py index 67cf20253..a2082c48c 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_transform.py @@ -11,9 +11,7 @@ ################################################################################ -from data_processing.test_support.transform.noop_transform import ( - NOOPTransformConfiguration, -) +from data_processing.test_support.transform import NOOPTransformConfiguration from data_processing.utils import get_logger from data_processing_ray.runtime.ray import ( RayTransformLauncher, diff --git a/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py new file mode 100644 index 000000000..cd61c6745 --- /dev/null +++ b/data-processing-lib/ray/test/data_processing_ray_tests/launch/ray/ray_test_noop_folder_launch.py @@ -0,0 +1,33 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher +from data_processing_ray.test_support.transform import NOOPFolderRayTransformConfiguration + + +class TestRayNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../../test-data/data_processing/ray/noop/" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + launcher = RayTransformLauncher(NOOPFolderRayTransformConfiguration()) + fixtures = [(launcher, {"noop_sleep_sec": 0, "run_locally": True}, basedir + "/input", basedir + "/expected")] + return fixtures From 3c9be57d656eee4fbda6b1d41849894249e167d8 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 09:07:48 +0100 Subject: [PATCH 020/105] added noop Spark testing --- .../transform/noop_folder_transform.py | 7 ++- .../test_support/transform/__init__.py | 1 + .../transform/noop_folder_transform.py | 53 +++++++++++++++++++ .../launch/spark/test_noop_folder_launch.py | 34 ++++++++++++ 4 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py create mode 100644 data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py diff --git a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py index 9919600c4..1d084b58a 100644 --- a/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py +++ b/data-processing-lib/ray/src/data_processing_ray/test_support/transform/noop_folder_transform.py @@ -11,8 +11,7 @@ ################################################################################ -from data_processing.test_support.transform import NOOPTransformConfiguration -from data_processing.test_support.transform import NOOPFolderTransform +from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration from data_processing.utils import get_logger from data_processing_ray.runtime.ray import ( RayTransformLauncher, @@ -25,7 +24,7 @@ logger = get_logger(__name__) -class NOOPFolderPythonRuntime(DefaultRayTransformRuntime): +class NOOPFolderRayRuntime(DefaultRayTransformRuntime): def get_folders(self, data_access: DataAccess) -> list[str]: """ Get folders to process @@ -47,7 +46,7 @@ def __init__(self): Initialization """ super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), - runtime_class=NOOPFolderPythonRuntime) + runtime_class=NOOPFolderRayRuntime) if __name__ == "__main__": diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py index 83516f9ae..041cb43d6 100644 --- a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py +++ b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/__init__.py @@ -11,3 +11,4 @@ ################################################################################ from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration +from data_processing_spark.test_support.transform.noop_folder_transform import NOOPFolderSparkTransformConfiguration diff --git a/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py new file mode 100644 index 000000000..9972e0f79 --- /dev/null +++ 
b/data-processing-lib/spark/src/data_processing_spark/test_support/transform/noop_folder_transform.py @@ -0,0 +1,53 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import SparkTransformLauncher +from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime +from data_processing.data_access import DataAccess + + +logger = get_logger(__name__) + + +class NOOPFolderSparkRuntime(DefaultSparkTransformRuntime): + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Get folders to process + :param data_access: data access + :return: list of folders to process + """ + return [data_access.get_input_folder()] + + +class NOOPFolderSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for NOOP as required by the SparkTransformLauncher. + The folder-specific runtime class above supplies the folder enumeration; the superclass + handles the rest of the configuration. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform), + runtime_class=NOOPFolderSparkRuntime) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration()) + logger.info("Launching noop transform") + # Launch Spark to process the input + launcher.launch() diff --git a/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py b/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py new file mode 100644 index 000000000..c8e3ce40b --- /dev/null +++ b/data-processing-lib/spark/test/data_processing_spark_tests/launch/spark/test_noop_folder_launch.py @@ -0,0 +1,34 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher +from data_processing_spark.test_support.transform import NOOPFolderSparkTransformConfiguration + + +class TestSparkNOOPTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../../../../test-data" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + fixtures = [] + launcher = SparkTransformLauncher(NOOPFolderSparkTransformConfiguration()) + fixtures.append((launcher, {"noop_sleep_sec": 1}, basedir + "/input", basedir + "/expected")) + return fixtures From 371a7124c1570270fd692249dd2e601c4b3476c8 Mon Sep 17 00:00:00 2001 From: blublinsky Date: Sun, 13 Oct 2024 10:03:21 +0100 Subject: [PATCH 021/105] more data access simplifications --- .../src/data_processing/data_access/data_access.py | 5 ++++- .../data_processing/data_access/data_access_local.py | 11 ----------- .../src/data_processing/data_access/data_access_s3.py | 11 ----------- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access.py b/data-processing-lib/python/src/data_processing/data_access/data_access.py index bba5afd2b..51d7b54b8 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access.py @@ -358,7 +358,10 @@ def get_output_location(self, path: str) -> str: :param path: input file location :return: output file location """ - raise NotImplementedError("Subclasses should implement this!") + if self.get_output_folder() is None: + self.logger.error("Get output location: output folder is not provided, returning None") + return None + return path.replace(self.get_input_folder(), self.get_output_folder()) def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]: """ Saves a pyarrow table to a file and returns information about the operation. diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_local.py b/data-processing-lib/python/src/data_processing/data_access/data_access_local.py index 224e30ce8..d37e571a3 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_local.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_local.py @@ -130,17 +130,6 @@ def get_table(self, path: str) -> tuple[pa.table, int]: logger.error(f"Error reading table from {path}: {e}") return None, 0 - def get_output_location(self, path: str) -> str: - """ - Get output location based on input - :param path: input file location - :return: output file location - """ - if self.output_folder is None: - logger.error("Get output location. local configuration is not defined, returning None") - return None - return path.replace(self.input_folder, self.output_folder) - def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]: """ Saves a pyarrow table to a file and returns information about the operation.
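The consolidated base-class get_output_location above is a plain prefix substitution; a standalone sketch of the behavior (folder values are hypothetical):

def get_output_location(path: str, input_folder: str, output_folder: str | None) -> str | None:
    # Mirrors the consolidated logic: None when no output folder is configured,
    # otherwise rewrite the input-folder prefix to the output-folder prefix.
    # Note that str.replace substitutes every occurrence, so this assumes the
    # input-folder string appears only once in the path.
    if output_folder is None:
        return None
    return path.replace(input_folder, output_folder)


assert get_output_location("in/a/b.parquet", "in/", "out/") == "out/a/b.parquet"
assert get_output_location("in/a/b.parquet", "in/", None) is None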
diff --git a/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py b/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py index 43e13bcb1..8ddc772c5 100644 --- a/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py +++ b/data-processing-lib/python/src/data_processing/data_access/data_access_s3.py @@ -126,17 +126,6 @@ def get_table(self, path: str) -> tuple[pyarrow.table, int]: self.logger.error(f"Exception reading table {path} from S3 - {e}") return None, 0 - def get_output_location(self, path: str) -> str: - """ - Get output location based on input - :param path: input file location - :return: output file location - """ - if self.output_folder is None: - self.logger.error("Get out put location. S3 configuration is not provided, returning None") - return None - return path.replace(self.input_folder, self.output_folder) - def save_table(self, path: str, table: pyarrow.Table) -> tuple[int, dict[str, Any], int]: """ Save table to a given location From 680f3138d1e183a814f6c9230ab1eee33ad759c0 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:40:59 -0400 Subject: [PATCH 022/105] Renamed/refactored fuzzy dedup python orchestrator Signed-off-by: Constantin M Adam --- .../fdedup/python/src/fuzzy_dedup_python.py | 180 ++++++++++++ .../fdedup/python/src/service_orchestrator.py | 265 ------------------ 2 files changed, 180 insertions(+), 265 deletions(-) create mode 100644 transforms/universal/fdedup/python/src/fuzzy_dedup_python.py delete mode 100644 transforms/universal/fdedup/python/src/service_orchestrator.py diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py new file mode 100644 index 000000000..ca64f336f --- /dev/null +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -0,0 +1,180 @@ +import argparse +import os +import sys + +import cluster_analysis_transform +import data_cleaning_transform +import get_duplicate_list_transform +import signature_calc_transform +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils, get_logger +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +SERVICE_DICT = { + "SignatureCalculation": "minhash", + "ClusterAnalysis": "cluster", + "GetDuplicateList": "fdlist", + "DataCleaning": "fdclean", +} + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +ARGS_MAP = { + "minhash": [ + signature_calc_transform.contents_column_key, + signature_calc_transform.document_id_column_key, + signature_calc_transform.seed_key, + signature_calc_transform.num_permutations_key, + signature_calc_transform.num_bands_key, + signature_calc_transform.num_minhashes_per_band_key, + signature_calc_transform.jaccard_similarity_threshold_key, + signature_calc_transform.word_shingle_size_key, + signature_calc_transform.num_segments_key, + ], + "cluster": [ + cluster_analysis_transform.jaccard_similarity_threshold_key, + cluster_analysis_transform.num_bands_key, + 
cluster_analysis_transform.num_segments_key, + ], + "fdlist": [ + get_duplicate_list_transform.subfolder_key, + get_duplicate_list_transform.consolidated_filename_key, + ], + "fdclean": [ + data_cleaning_transform.document_id_column_key, + data_cleaning_transform.duplicate_list_location_key, + ], +} + + +class ServiceOrchestrator: + def __init__(self, global_params: argparse.Namespace = None): + self.global_params = global_params + self.logger = get_logger(__name__) + + def orchestrate(self): + service_list = self.global_params.services.split(",") + for service in service_list: + self.logger.info(f"Starting {service} step") + if service not in SERVICE_DICT: + err_msg = f"Unknown service {service} specified. Must be one of {SERVICE_DICT.keys()}" + self.logger.error(err_msg) + raise ValueError(err_msg) + service_short_name = SERVICE_DICT[service] + service_params = self.get_arguments(self.global_params, service_short_name) + self.logger.info(f"Got parameters for {service}") + status = self.execute_service(service_short_name, service_params) + if status == 0: + self.logger.info(f"{service} completed successfully") + else: + self.logger.error(f"{service} failed with status {status}, aborting ...") + break + + def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: + sys_argv = ["python"] + in_args_dict = vars(in_args) + all_module_arguments = ARGS_MAP.get(service_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + for k, v in passed_args.items(): + sys_argv.append(f"--{service_name}_{k}") + sys_argv.append(str(v)) + if service_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif service_name == "fdlist": + input_folder = in_args_dict["output_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "fdclean": + input_folder = in_args_dict["input_folder"] + output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + else: + self.logger.error(f"Unknown service name: {service_name}") + data_io = { + "input_folder": input_folder, + "output_folder": output_folder, + } + if in_args.use_s3: + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + sys_argv.append("--data_s3_config") + else: + sys_argv.append("--data_local_config") + sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + return sys_argv + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + elif service_short_name == "cluster": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + status = launcher.launch() + return status + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Service
Orchestrator") + + # Define command line arguments + parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") + parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") + + parser.add_argument( + "--contents_column", type=str, default="text", help="Name of the column that holds document text" + ) + parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") + parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") + parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") + parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") + + # Single argument for service execution + parser.add_argument( + "--services", + type=str, + required=False, + default="SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning", + help="Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning)", + ) + + parser.add_argument( + "--use_s3", + action="store_true", + help="use s3", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = ServiceOrchestrator(global_params=args) + # Launch python fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/python/src/service_orchestrator.py b/transforms/universal/fdedup/python/src/service_orchestrator.py deleted file mode 100644 index 897a3210c..000000000 --- a/transforms/universal/fdedup/python/src/service_orchestrator.py +++ /dev/null @@ -1,265 +0,0 @@ -import argparse -import os -import sys - -from cluster_analysis_transform_python import ( - ClusterAnalysisPythonTransformConfiguration, -) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils -from file_copy_util import FileCopyUtil -from signature_calc_transform_python import ( - SignatureCalculationPythonTransformConfiguration, -) - - -class ServiceOrchestrator: - def __init__(self, global_params=None): - self.global_params = global_params or {} - - def execute_service(self, service_logic, service_params): - # Call the generic service logic - service_logic(service_params) - - def orchestrate(self, service_logic): - service_list = self.global_params["services"].split(",") - - for service in service_list: - if service == "SignatureCalculation": - params = create_transform_args_payload(args, service) - params["service_type"] = "SignatureCalculation" - self.execute_service(service_logic, params) - elif service == "ClusterAnalysis": - params = create_transform_args_payload(args, service) - params["service_type"] = "ClusterAnalysis" - self.execute_service(service_logic, params) - elif service == "DataCleaning": - params = create_transform_args_payload(args, service) - params["service_type"] = "DataCleaning" - self.execute_service(service_logic, params) - elif service == "BandsFileCopy": - params = args - params["service_type"] = "BandsFileCopy" - self.execute_service(service_logic, params) - elif service == "DocsToRemoveFileCopy": - params = args - params["service_type"] = "DocsToRemoveFileCopy" - self.execute_service(service_logic, params) - else: - print(f"Warning: {service} 
is not a recognized service.") - - -def generic_service_logic(params): - print("Service executed with parameters:", params) - service_type = params["service_type"] - use_s3 = params["use_s3"] - # Remove the 'service_type' key - params.pop("service_type", None) # Using pop() method - - if service_type == "SignatureCalculation" or service_type == "ClusterAnalysis" or service_type == "DataCleaning": - # Set the simulated command line args - params.pop("num_permutations", None) # Using pop() method - params.pop("num_bands", None) # Using pop() method - params.pop("num_segments", None) # Using pop() method - params.pop("use_s3", None) # Using pop() method - # Set the simulated command line args - sys.argv = ParamsUtils.dict_to_req(d=params) - if use_s3: - sys.argv.append("--data_s3_cred") - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) - - if service_type == "SignatureCalculation": - runtime_config = SignatureCalculationPythonTransformConfiguration() - launch_transform_service(runtime_config) - elif service_type == "ClusterAnalysis": - runtime_config = ClusterAnalysisPythonTransformConfiguration() - launch_transform_service(runtime_config) - elif service_type == "DataCleaning": - runtime_config = DataCleaningPythonTransformConfiguration() - launch_transform_service(runtime_config) - elif service_type == "BandsFileCopy": - launch_file_copy_service(params, service_type) - elif service_type == "DocsToRemoveFileCopy": - launch_file_copy_service(params, service_type) - - -def launch_transform_service(params): - # create launcher - launcher = PythonTransformLauncher(runtime_config=params) - # Launch the ray actor(s) to process the input - launcher.launch() - - -def launch_file_copy_service(args, service_type): - root_folder = os.path.join(args["root_folder"], args["output_folder"]) - data_type = None - if service_type == "BandsFileCopy": - data_type = "bands" - # Get files to process - files = [ - f"band={band}/segment={segment}" - for band in range(args["num_bands"]) - for segment in range(args["num_segments"]) - ] - elif service_type == "DocsToRemoveFileCopy": - files = ["docs_to_remove"] - data_type = "docs_to_remove" - config = {"root_folder": root_folder} - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - - if args["use_s3"]: - - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - - # Construct folders - local_config = { - "input_folder": root_folder, - "output_folder": os.path.abspath(os.path.join(args["root_folder"], args["output_folder"])), - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - stats = {} - fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) - for file in files: - fcu.copy_data(file, data_type) - - -def create_transform_args_payload(args, service): - print(args) - # Construct folders - input_folder = os.path.join(args["root_folder"], args["input_folder"]) - output_folder = os.path.join(args["root_folder"], args["output_folder"]) - if service == "ClusterAnalysis": - input_folder = 
os.path.join(args["root_folder"], args["output_folder"], "bands_consolidated") - output_folder = os.path.join(args["root_folder"], args["output_folder"], "docs_to_remove") - elif service == "DataCleaning": - output_folder = os.path.join(args["root_folder"], args["output_folder"], "cleaned") - duplicate_location = os.path.join( - args["root_folder"], - args["output_folder"], - "docs_to_remove_consolidated", - "docs_to_remove_consolidated.parquet", - ) - - # Create a local configuration - local_conf = {"input_folder": input_folder, "output_folder": output_folder} - - # Create parameters - params = { - "num_permutations": args["num_permutations"], - "num_bands": args["num_bands"], - "num_segments": args["num_segments"], - "use_s3": args["use_s3"], - } - - if args["use_s3"]: - params["data_s3_config"] = ParamsUtils.convert_to_ast(local_conf) - else: - params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) - - # add extra - if service == "DataCleaning": - short_name = "fdclean" - cli_prefix = f"{short_name}_" - - # configuration keys - document_id_column_key = "document_id_column" - """ This key holds the name of the column storing the unique ID assigned to each document""" - duplicate_list_location_key = "duplicate_list_location" - """ This key holds the location of the list of duplicate documents marked for removal""" - - # command line arguments - document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" - """ Name of the column storing the unique ID assigned to each document""" - duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" - """ Location of the list of duplicate documents marked for removal""" - - params[document_id_column_cli_param] = "int_id_column" - params[duplicate_list_location_cli_param] = duplicate_location - - return params - - -def create_file_copy_args_payload(args): - daf_args = [] - local_config = { - "input_folder": args.root_folder, - "output_folder": args.root_folder, - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - return data_access_factory - - -def parse_args(): - parser = argparse.ArgumentParser(description="Service Orchestrator") - - # Define command line arguments - parser.add_argument("--root_folder", type=str, required=True, help="Root folder path") - parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") - parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") - - parser.add_argument( - "--contents_column", type=str, default="text", help="Name of the column that holds document text" - ) - parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") - parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") - parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") - parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") - - # Single argument for service execution - parser.add_argument( - "--services", - type=str, - required=True, - help="Comma-separated list of services to run (e.g., 
SignatureCalculation,BandsFileCopy,ClusterAnalysis,DocsToRemoveFileCopy,DataCleaning)", - ) - - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - - args = parser.parse_args() - return vars(args) # Convert Namespace to dictionary - - -if __name__ == "__main__": - - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - - # Parse command line arguments - args = parse_args() - - # Initialize the orchestrator - orchestrator = ServiceOrchestrator(global_params=args) - - # Example service execution (if you had defined services) - orchestrator.orchestrate(generic_service_logic) From c29d3bf78eb24045e7f6d3f110a8323432636290 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:45:50 -0400 Subject: [PATCH 023/105] Rewrote cluster_analysis_transform as a folder_transform Signed-off-by: Constantin M Adam --- .../src/cluster_analysis_local_python.py | 11 +- .../python/src/cluster_analysis_transform.py | 180 +++++++++++++----- .../src/cluster_analysis_transform_python.py | 49 ++++- 3 files changed, 183 insertions(+), 57 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py index dcfc9a7e4..7c162b1b1 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -21,7 +21,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, @@ -35,12 +35,15 @@ "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.0, } if __name__ == "__main__": # Set the simulated command line args - # sys.argv = ParamsUtils.dict_to_req(d=params) - # print(sys.argv) + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) - # Launch the ray actor(s) to process the input + # Launch python to process the input launcher.launch() diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 5ad18362a..221b50512 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -9,15 +9,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ +import io import os +import re from argparse import ArgumentParser, Namespace from typing import Any, List, Tuple import numpy as np import polars as pl import pyarrow as pa -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger from Murmur_MH import Murmur_MH @@ -25,23 +27,37 @@ cli_prefix = f"{short_name}_" # configuration keys +num_bands_key = "num_bands" +""" This key holds the number of bands used in the banding technique""" +num_segments_key = "num_segments" +""" This key holds the number of segments dividing the hashing space for each band""" jaccard_similarity_threshold_key = "jaccard_similarity_threshold" """ This key holds the Jaccard similarity threshold above which two documents are duplicates""" # command line arguments +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands used in the banding technique""" jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" """ Jaccard similarity threshold above which two documents are duplicates""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments dividing the hashing space for each band""" captured_arg_keys = [ + num_bands_key, + num_segments_key, jaccard_similarity_threshold_key, ] # defaults -jaccard_similarity_threshold_default = 0.8 -""" Default Jaccard similarity threshold above which two documents are duplicates""" +num_bands_default = 14 +""" Default number of bands used in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments dividing the hashing space for each band""" -class ClusterAnalysisTransform(AbstractTableTransform): +class ClusterAnalysisTransform(AbstractFolderTransform): """ This is the second transform of the fuzzy dedup pipeline. It runs in parallel: for each band, the hashing interval is divided into segments. A cluster analysis @@ -65,7 +81,9 @@ class ClusterAnalysisTransform(AbstractTableTransform): duplicates. The resulting clusters are saved in a file for further analysis. Args: + num_bands: number of bands used in the banding technique jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + num_segments: the number of segments dividing the hashing space for each band """ def __init__(self, config: dict[str, Any]): @@ -75,58 +93,102 @@ def __init__(self, config: dict[str, Any]): defined by the companion runtime, ClusterAnalysisTransformRuntime. 
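As a worked example of the band/segment layout this transform consumes, the folders can be enumerated and parsed back exactly as the runtime and the regex in transform below do (a standalone sketch; POSIX-style path separators assumed, and the counts follow the defaults and the local driver above):

import os
import re

num_bands, num_segments = 14, 2
folders = [
    os.path.join(f"band={b}", f"segment={s}")
    for b in range(num_bands)
    for s in range(num_segments)
]
# 28 folders in total; each one parses back into its band/segment pair
match = re.match(r"^band=(\d+)/segment=(\d+)$", folders[3])
assert match and (int(match.group(1)), int(match.group(2))) == (1, 1)
print(len(folders), folders[0], folders[-1])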
""" super().__init__(config) + self.num_bands = config.get(num_bands_key, num_bands_default) + self.num_segments = config.get(num_segments_key, num_segments_default) self.jaccard_similarity_threshold = config.get( jaccard_similarity_threshold_key, jaccard_similarity_threshold_default ) + self.data_access = config.get("data_access") self.logger = get_logger(__name__) - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - bands_dataframe = pl.from_arrow(table) - docs2remove_list = [] - # clustering - bands_dataframe_groups = bands_dataframe.group_by("band_hash").agg("document_data") - bands_dataframe_cluster = bands_dataframe_groups.with_columns( - cluster_length=pl.col("document_data").list.len() - ).filter(pl.col("cluster_length") > 1) - self.logger.info(f"file_name = {file_name}") - num_clusters = len(bands_dataframe_cluster) - if num_clusters > 0: - sum_cdocs = bands_dataframe_cluster.select(pl.sum("cluster_length")).item() - max_cdocs = bands_dataframe_cluster.select(pl.max("cluster_length")).item() - min_cdocs = bands_dataframe_cluster.select(pl.min("cluster_length")).item() - avg_cdocs = bands_dataframe_cluster.select(pl.mean("cluster_length")).item() + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Cluster analysis for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) else: - sum_cdocs = 0 - max_cdocs = 0 - min_cdocs = 0 - avg_cdocs = 0 - self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") - self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") - bands_dataframe_response = self.process_bands(bands_dataframe_cluster) + raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") + + # consolidate into a single data frame band hashes computed by workers + band_segment_dataframe, consolidation_stats = self.consolidate_band_segment_files(files) + metadata |= consolidation_stats + # cluster grouping by band hashes + cluster_dataframe, cluster_stats = self.get_clusters(band_segment_dataframe) + metadata |= cluster_stats + # cluster analysis using jaccard similarity + jaccard_cluster_dataframe, jaccard_stats = self.analyze_clusters(cluster_dataframe) + metadata |= jaccard_stats + # Generate the docs_to_remove dataframe + docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") + output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) + self.logger.info(f"{len(docs_to_remove_dataframe)} documents marked to remove") + metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def 
consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + band_segment_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + band_segment_dataframe = band_segment_dataframe.vstack(df) - filtered_doc2remove_dataframe = bands_dataframe_response.filter(pl.col("docs_to_remove_length") > 0) - num_clusters = len(filtered_doc2remove_dataframe) + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": band_segment_dataframe.to_arrow().nbytes, + "consolidated_rows": len(band_segment_dataframe), + } + return band_segment_dataframe, consolidation_stats + + def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") + cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( + pl.col("cluster_length") > 1 + ) + num_clusters = len(cluster_dataframe) if num_clusters > 0: - sum_cdocs = filtered_doc2remove_dataframe.select(pl.sum("docs_to_remove_length")).item() - max_cdocs = filtered_doc2remove_dataframe.select(pl.max("docs_to_remove_length")).item() - min_cdocs = filtered_doc2remove_dataframe.select(pl.min("docs_to_remove_length")).item() - avg_cdocs = filtered_doc2remove_dataframe.select(pl.mean("docs_to_remove_length")).item() + sum_cdocs = cluster_dataframe.select(pl.sum("cluster_length")).item() + max_cdocs = cluster_dataframe.select(pl.max("cluster_length")).item() + min_cdocs = cluster_dataframe.select(pl.min("cluster_length")).item() + avg_cdocs = cluster_dataframe.select(pl.mean("cluster_length")).item() else: sum_cdocs = 0 max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + cluster_stats = { + "groupby_clusters": num_clusters, + "cluster_duplicate_docs": sum_cdocs, + } + return cluster_dataframe, cluster_stats - # Explode the 'docs_to_remove' column - doc2remove_exploded_dataframe = filtered_doc2remove_dataframe.explode("docs_to_remove") - table = doc2remove_exploded_dataframe.to_arrow() - self.logger.info(f"{len(doc2remove_exploded_dataframe)} documents marked to remove") - metadata = {"nrows": len(table)} - return [table], metadata - - def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: + def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: # Define the schema with specific data types schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} doc_ids_lists = [] @@ -137,7 +199,7 @@ def process_bands(self, df: pl.DataFrame) -> pl.DataFrame: doc_ids_lists += doc_ids_list docs_to_remove_lists += docs_to_remove_list len_of_docs2remove_lists += len_of_docs2remove_list - processed_rows = pl.DataFrame( + jaccard_cluster_dataframe = pl.DataFrame( { "first_doc": doc_ids_lists, "docs_to_remove": docs_to_remove_lists, "docs_to_remove_length": len_of_docs2remove_lists,
}, schema=schema, ) - return processed_rows + filtered_jaccard_dataframe = jaccard_cluster_dataframe.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_jaccard_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_jaccard_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_jaccard_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_jaccard_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_jaccard_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + jaccard_stats = { + "jaccard_clusters": num_clusters, + "jaccard_duplicate_docs": sum_cdocs, + } + return filtered_jaccard_dataframe, jaccard_stats def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: # Process row and return a new list of Series or a new row @@ -216,6 +296,18 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=jaccard_similarity_threshold_default, help="Jaccard similarity threshold above which two documents are duplicates", ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="The number of bands used in the banding technique", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="The number of segments dividing the hashing space for each band", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py index 28d96f428..8ff6dbf2b 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -10,11 +10,19 @@ # limitations under the License. 
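For reference, the Jaccard measure that analyze_clusters applies to candidate duplicates can be sketched over minhash signatures treated as sets (a simplified standalone illustration; the transform's own per-row computation lives in jaccard_distance_calculation):

def jaccard_similarity(a: list[int], b: list[int]) -> float:
    # |A intersect B| / |A union B| over two minhash signatures treated as sets
    sa, sb = set(a), set(b)
    union = sa | sb
    return len(sa & sb) / len(union) if union else 0.0


# documents clearing the configured similarity threshold stay clustered;
# the rest of each cluster is listed in docs_to_remove
assert jaccard_similarity([1, 2, 3, 4], [2, 3, 4, 5]) == 0.6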
################################################################################ +import os import time +from typing import Any -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration -from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.runtime.pure_python.runtime_configuration import ( +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger @@ -23,11 +31,31 @@ logger = get_logger(__name__) +class ClusterAnalysisPythonRuntime(DefaultPythonTransformRuntime): + """ + Cluster analysis runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): """ - Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. - NOOP does not use a RayRuntime class so the superclass only needs the base - python-only configuration. + Implements the PythonTransformConfiguration for Fuzzy Dedup ClusterAnalysis + as required by the PythonTransformLauncher. """ def __init__(self): @@ -35,10 +63,13 @@ def __init__(self): Initialization :param base_configuration - base configuration class """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisPythonRuntime, + ) if __name__ == "__main__": - launcher = PythonTransformLauncher(ClusterAnalysisTransformConfiguration()) - logger.info("Launching noop transform") + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis python transform") launcher.launch() From aada59eccbf6b8df6e1c5b332fa19a21a99b125c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:48:21 -0400 Subject: [PATCH 024/105] Wrote get_duplicate_list_transform as a folder_transform Signed-off-by: Constantin M Adam --- .../src/get_duplicate_list_transform.py | 168 ++++++++++++++++++ .../get_duplicate_list_transform_python.py | 71 ++++++++ 2 files changed, 239 insertions(+) create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform.py create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py new file mode 100644 index 000000000..c7b4cbddf --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py @@ -0,0 +1,168 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any + +import polars as pl +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger + + +short_name = "fdlist" +cli_prefix = f"{short_name}_" + +# configuration keys +subfolder_key = "docs_to_remove" +""" This key holds the name of the subfolder with the duplicate records""" +consolidated_filename_key = "consolidated_filename" +""" This key holds the name of the file with the consolidated list of duplicates""" + +# command line arguments +subfolder_cli_param = f"{cli_prefix}{subfolder_key}" +""" The name of the subfolder with the duplicate records""" +consolidated_filename_cli_param = f"{cli_prefix}{consolidated_filename_key}" +""" The name of the file with the consolidated list of duplicates""" + +captured_arg_keys = [ + subfolder_key, + consolidated_filename_key, +] + +# defaults +subfolder_default = "docs_to_remove" +""" Default name of the subfolder with the duplicate records""" +consolidated_filename_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default name of the file with the consolidated list of duplicates""" + + +class GetDuplicateListTransform(AbstractFolderTransform): + """ + This is an intermediate step of the fuzzy dedup pipeline. It runs in a single + location and consolidates in a single file all the duplicates found for each + band segment. + Args: + subfolder: name of the subfolder with the duplicate records + consolidated_filename: name of the file with the consolidated list of duplicates + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, GetDuplicateListPythonRuntime.
+ """ + super().__init__(config) + self.subfolder = config.get(subfolder_key, subfolder_default) + self.consolidated_filename = config.get(consolidated_filename_key, consolidated_filename_default) + self.data_access = config.get("data_access") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Get Duplicate List for folder {folder_name}") + metadata = {} + input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_path = os.path.join(output_folder, self.consolidated_filename) + + # consolidate into a single data frame band hashes computed by workers + consolidated_dataframe, consolidation_stats = self.consolidate_docs_to_remove_files(files) + self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") + metadata |= consolidation_stats + output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) + return [(output_data, output_path)], metadata + + def sanitize_folder_name(self, folder_name: str) -> str: + if "://" in folder_name: + _, folder_name = folder_name.split("://") + if folder_name[-1] != "/": + folder_name = f"{folder_name}/" + return folder_name + + def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + consolidated_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + consolidated_dataframe = consolidated_dataframe.vstack(df) + consolidated_dataframe = consolidated_dataframe.select("docs_to_remove").unique() + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, + "consolidated_rows": len(consolidated_dataframe), + } + return consolidated_dataframe, consolidation_stats + + +class GetDuplicateListTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=GetDuplicateListTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the GetDuplicateListTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{subfolder_cli_param}", + type=str, + default=subfolder_default, + help="The name of the subfolder with the duplicate records", + ) + parser.add_argument( + f"--{consolidated_filename_cli_param}", + type=str, + default=consolidated_filename_default, + help="The name of the file with the consolidated list of duplicates", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py new file mode 100644 index 000000000..703ef630e --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py @@ -0,0 +1,71 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import time +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListPythonRuntime(DefaultPythonTransformRuntime): + """ + Get duplicate list runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup GetDuplicateList + as required by the PythonTransformLauncher. 
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list python transform") + launcher.launch() From 2019d56565ea52c5474632a822e67ac7e66fdac8 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 00:50:13 -0400 Subject: [PATCH 025/105] Added text preprocessing Signed-off-by: Constantin M Adam --- .../python/src/signature_calc_local_python.py | 39 +++++---- .../python/src/signature_calc_transform.py | 81 +++++++------------ 2 files changed, 48 insertions(+), 72 deletions(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py index eb958ee3d..062580f22 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -20,31 +20,28 @@ ) -# # create parameters -# input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -# output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output_second")) -# local_conf = { -# "input_folder": input_folder, -# "output_folder": output_folder -# } -# code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -# params = { -# # Data access. Only required parameters are specified -# "data_local_config": ParamsUtils.convert_to_ast(local_conf), -# # execution info -# "runtime_pipeline_id": "pipeline_id", -# "runtime_job_id": "job_id", -# "runtime_code_location": ParamsUtils.convert_to_ast(code_location), -# "minhash_num_permutations":112, -# "minhash_num_bands":14, -# "minhash_num_segments":2 -# } +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} if __name__ == "__main__": # Set the simulated command line args - # sys.argv = ParamsUtils.dict_to_req(d=params) - # print(sys.argv) + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) sys.argv.append("--data_s3_cred") s3_creds = { diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 7ac8eb057..7c4dd391c 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -10,6 +10,8 @@ # limitations under the License. 
################################################################################ import os +import re +import unicodedata from argparse import ArgumentParser, Namespace from pathlib import Path from typing import Any, List @@ -100,44 +102,16 @@ """ Default number of segments across which we divide the hashing space for each band""" -def _optimal_minhashlsh_param( - threshold: float = jaccard_similarity_threshold_default, - num_perm: int = num_permutations_default, - false_positive_weight: float = 0.5, - false_negative_weight: float = 0.5, -): - """ - Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum - of probabilities of false positive and false negative. - :param threshold: desired similarity threshold - :param num_perm: number of permutations - :param false_positive_weight: importance of avoiding false positive results - :param false_negative_weight: importance of avoiding false negative results - :return: a tuple (optimal number of bands, optimal number of rows) - """ - - def _false_positive_probability(threshold, b, r): - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, threshold) - return a - - def _false_negative_probability(threshold, b, r): - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, threshold, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for b in range(1, num_perm + 1): - max_r = int(num_perm / b) - for r in range(1, max_r + 1): - fp = _false_positive_probability(threshold, b, r) - fn = _false_negative_probability(threshold, b, r) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (b, r) - return opt +NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") +WHITESPACE_PATTERN = re.compile(r"\s+") +PUNCTUATION = "!/—”:%1〈&(、━\\【#%「」,】;+^]~“《„';’{|∶´[=-`*.(–?!:$~«〉,><》)?)。…@_.\"}►»" + "".join( + map( + chr, + (x for a, b in ((0, 9), (11, 13), (13, 32), (127, 160)) for x in range(a, b)), + ) +) +PUNCTUATION_SET = set(PUNCTUATION) +PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) class SignatureCalculationTransform(AbstractTableTransform): @@ -184,13 +158,6 @@ def __init__(self, config: dict[str, Any]): self.num_segments = config.get(num_segments_key, num_segments_default) self.num_bands = config.get(num_bands_key, num_bands_default) self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) - # Calculate optimal parameters for bands calculation - # self.num_bands, self.num_rows = _optimal_minhashlsh_param( - # threshold=self.jaccard_similarity_threshold, - # num_perm=self.num_permutations, - # false_positive_weight=0.5, - # false_negative_weight=0.5, - # ) # use this dataframe to store the minhashes and size for each document self.all_minhashes: pl.DataFrame = None # use this dataframe to store the band hashes for each document @@ -224,8 +191,8 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab # generate minhash values minhashes = df.map_rows( - lambda text: mm_min_hash.minhash2_nosalt( - *self._generate_word_shingles(text, window_size=self.word_shingle_size) + lambda row: mm_min_hash.minhash2_nosalt( + *self._generate_word_shingles(row, window_size=self.word_shingle_size) ) ) # rename columns, cast minhashes to list(uint32) @@ -374,10 +341,22 @@ def write_band_signatures(self): return [], metadata # define shingles generation function - def _generate_word_shingles(self, text: str, window_size: 
int = 5, delimiter: str = " ") -> tuple[list, int, int]:
-        words = text[0].split()
-        document_id = text[1]
-        doc_len = len(text[0])
+    def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]:
+        text = row[0]
+        # lower case
+        text = text.lower()
+        # replace numbers with '0'
+        text = NUMBERS_PATTERN.sub("0", text)
+        # convert punctuation to spaces
+        text = text.translate(PUNCTUATION_TRANS)
+        # collapse consecutive spaces, newlines and tabs, and strip whitespace at both ends
+        text = WHITESPACE_PATTERN.sub(" ", text.strip())
+        # diacritics/unicode normalization
+        text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
+        text = text.strip()
+        words = text.split()
+        document_id = row[1]
+        doc_len = len(row[0])
         word_count = len(words)
         k_shingles = []
         for i in range(0, max(1, word_count - window_size + 1)):
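The preprocessing added above makes the shingles insensitive to case, digits, punctuation and diacritics before hashing. A small self-contained illustration of the same sequence of steps (the punctuation handling is simplified here; the transform uses the much larger PUNCTUATION translation table defined earlier):

    import re
    import unicodedata

    numbers = re.compile(r"\d+(\.\d+)?")
    whitespace = re.compile(r"\s+")

    def normalize(text: str) -> str:
        text = text.lower()
        text = numbers.sub("0", text)          # digits collapse to '0'
        text = re.sub(r"[^\w\s]", " ", text)   # simplified punctuation-to-space
        text = whitespace.sub(" ", text.strip())
        # NFD-decompose, then drop combining marks to strip diacritics
        text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
        return text.strip()

    words = normalize("Café #42, Déjà-vu!").split()  # -> ['cafe', '0', 'deja', 'vu']
    shingles = [" ".join(words[i : i + 2]) for i in range(max(1, len(words) - 1))]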
From 9362803f99fa422437031263474e97365d61d9f3 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 00:51:22 -0400
Subject: [PATCH 026/105] Added python test data

Signed-off-by: Constantin M Adam
---
 .../python/test-data/input/data_1/df1.parquet | Bin 0 -> 3093 bytes
 .../python/test-data/input/data_2/df2.parquet | Bin 0 -> 1397 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet

diff --git a/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..c9220bf39c8dd2127707be44ba210363df6aa1a3
GIT binary patch
literal 3093
(binary data omitted)

diff --git a/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..23fac4c726f017c26b7fe64153fe5e0b459d60b1
GIT binary patch
literal 1397
(binary data omitted)

Date: Mon, 14 Oct 2024 00:52:07 -0400
Subject: [PATCH 027/105] Added project admin tools

Signed-off-by: Constantin M Adam
---
 .../universal/fdedup/python/.dockerignore    |  1 +
 transforms/universal/fdedup/python/Makefile  | 64 +++++++++++++++++++
 transforms/universal/fdedup/transform.config |  5 +-
 3 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 transforms/universal/fdedup/python/.dockerignore
 create mode 100644 transforms/universal/fdedup/python/Makefile

diff --git a/transforms/universal/fdedup/python/.dockerignore b/transforms/universal/fdedup/python/.dockerignore
new file mode 100644
index 000000000..f7275bbbd
--- /dev/null
+++ b/transforms/universal/fdedup/python/.dockerignore
@@ -0,0 +1 @@
+venv/
diff --git a/transforms/universal/fdedup/python/Makefile b/transforms/universal/fdedup/python/Makefile
new file mode 100644
index 000000000..05f6bf5ca
--- /dev/null
+++ b/transforms/universal/fdedup/python/Makefile
@@ -0,0 +1,64 @@
+# Define the root of the local git clone for the common rules to be able
+# to know where they are running from.
+REPOROOT=../../../..
+
+# Set this, before including .make.defaults, to
+# 1 if requirements reference the latest code in the data processing library
+# in this repo (that is not yet published to pypi). This is the default setting.
+# 0 if the transforms DPK dependencies are on wheels published to
+# pypi (e.g. data-prep-toolkit=0.2.1)
+#USE_REPO_LIB_SRC=1
+
+# Include a library of common .transform.* targets which most
+# transforms should be able to reuse. However, feel free
+# to override/redefine the rules below.
+include $(REPOROOT)/transforms/.make.transforms
+
+# Include the common configuration for this transform
+include ../transform.config
+
+venv::	.transforms.python-venv
+
+test::	.transforms.python-test
+
+clean::	.transforms.clean
+
+image::	.transforms.python-image
+
+test-src::	.transforms.test-src
+
+setup::	.transforms.setup
+
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-python
+
+setup:: .transforms.setup
+
+# The distribution version is the same as the image version.
+set-versions:
+	$(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions
+
+build-dist:: .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+test-image:: .transforms.python-test-image
+
+run-cli-sample: .transforms.run-cli-python-sample
+
+run-local-sample: .transforms.run-local-sample
+
+run-local-python-sample: .transforms.run-local-python-sample
+
+#run-s3-ray-sample: .transforms.run-s3-ray-sample
+
+minio-start: .minio-start
+
+kind-load-image:: .transforms.kind-load-image
+
+docker-load-image: .defaults.docker-load-image
+
+docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/fdedup/transform.config b/transforms/universal/fdedup/transform.config
index 774716e15..ffaeb9f45 100644
--- a/transforms/universal/fdedup/transform.config
+++ b/transforms/universal/fdedup/transform.config
@@ -14,5 +14,6 @@ TRANSFORM_NAME=fdedup
 #
 # If you change the version numbers, be sure to run "make set-versions" to
 # update version numbers across the transform (e.g., pyproject.toml).
-FDEDUP_RAY_VERSION=$(DPK_VERSION)
-
+FDEDUP_PYTHON_VERSION=$(DPK_VERSION)
+FDEDUP_RAY_VERSION=$(FDEDUP_PYTHON_VERSION)
+FDEDUP_SPARK_VERSION=$(FDEDUP_PYTHON_VERSION)
From 4dac838b2d941117f40bce371574aec268d09206 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 02:40:11 -0400
Subject: [PATCH 028/105] Bug fix

Signed-off-by: Constantin M Adam
---
 .../universal/fdedup/python/src/cluster_analysis_transform.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
index 221b50512..2a5ec3e6b 100644
--- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
@@ -240,7 +240,7 @@ def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]:
         sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"]))
 
         # Extracting int_id_column values into a list
-        doc_list = list(set([item["int_id_column"] for item in sorted_document_data]))
+        doc_list = [item["int_id_column"] for item in sorted_document_data]
 
         # Creating a dictionary with int_id_column as key and minhashes as value
         doc_minhashes = {item["int_id_column"]: item["minhashes"] for item in sorted_document_data}
 
From fbc2b58e255edc758a9d4016d49dd57715c3db93 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 02:41:49 -0400
Subject: [PATCH 029/105] Add op modes for data cleaning: filter (non)dupl and
 annotate

Signed-off-by: Constantin M Adam
---
 .../python/src/data_cleaning_transform.py     | 38 +++++++++---
 .../src/data_cleaning_transform_python.py     |  5 +-
 .../fdedup/python/src/fuzzy_dedup_python.py   | 60 +++++++++++++++----
 3 files changed, 83 insertions(+), 20 deletions(-)

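The three cleanup modes introduced below map directly onto three polars join strategies against the consolidated duplicate list. A minimal sketch of the idea, detached from the transform plumbing (data and column name are illustrative):

    import polars as pl

    docs = pl.DataFrame({"int_id_column": [1, 2, 3, 4], "text": ["a", "b", "c", "d"]})
    dupes = pl.DataFrame({"int_id_column": [2, 4]})

    # filter_duplicates: keep only documents NOT on the duplicate list
    cleaned = docs.join(dupes, on="int_id_column", how="anti")
    # filter_non_duplicates: keep only the duplicates themselves
    duplicates = docs.join(dupes, on="int_id_column", how="inner")
    # annotate: keep everything, mark duplicates with "d"
    annotated = docs.join(
        dupes.with_columns(pl.lit("d").alias("duplicate")), on="int_id_column", how="left"
    ).with_columns(pl.col("duplicate").fill_null(""))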
diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py
index 05b18cc8b..8e17b757f 100644
--- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py
+++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py
@@ -29,12 +29,16 @@
 """ This key holds the name of the column storing the unique ID assigned to each document"""
 duplicate_list_location_key = "duplicate_list_location"
 """ This key holds the location of the list of duplicate documents marked for removal"""
+operation_mode_key = "operation_mode"
+""" This key holds the operation mode: 'filter_duplicates', 'filter_non_duplicates', or 'annotate'"""
 
 # command line arguments
 document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}"
 """ Name of the column storing the unique ID assigned to each document"""
 duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}"
 """ Location of the list of duplicate documents marked for removal"""
+operation_mode_cli_param = f"{cli_prefix}{operation_mode_key}"
+""" Operation mode, can be one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate'"""
 
 captured_arg_keys = [
     document_id_column_key,
@@ -44,8 +48,10 @@
 # defaults
 document_id_column_default = "int_id_column"
 """ Default name of the column storing the unique ID assigned to each document"""
-duplicate_list_location_default = None
+duplicate_list_location_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet")
 """ Default location of the list of duplicate documents marked for removal"""
+operation_mode_default = "filter_duplicates"
+""" Default value for operation mode, will filter out all the duplicate documents"""
 
 
 class DataCleaningTransform(AbstractTableTransform):
@@ -72,6 +78,7 @@ def __init__(self, config: dict[str, Any]):
         self.logger = get_logger(__name__)
         self.document_id_column = config.get(document_id_column_key, document_id_column_default)
         self.duplicate_list_location = config.get(duplicate_list_location_key, duplicate_list_location_default)
+        self.operation_mode = config.get(operation_mode_key, operation_mode_default)
         contents = config.get("df")
         self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents))
         self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows")
@@ -88,19 +95,27 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
         self.docs_to_remove_df = self.docs_to_remove_df.select(
             pl.col(self.document_id_column).cast(input_doc_id_type)
         )
-        filtered_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti")
-        filtered_table = filtered_df.to_arrow()
+        if self.operation_mode == "filter_duplicates":
+            result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti")
+        elif self.operation_mode == "filter_non_duplicates":
+            result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="inner")
+        else:  # self.operation_mode == "annotate"
+            duplicates_df = self.docs_to_remove_df.with_columns(pl.lit("d").alias("duplicate"))
+            result_df = input_df.join(duplicates_df, on=self.document_id_column, how="left").with_columns(
+                pl.col("duplicate").fill_null("")
+            )
+        result_table = result_df.to_arrow()
         metadata = {
             "input_files": 1,
             "input_docs": table.num_rows,
             "input_bytes": table.nbytes,
             "output_files": 1,
-            "output_docs": filtered_table.num_rows,
-            "output_bytes": filtered_table.nbytes,
-            "filtered_docs": (table.num_rows - 
filtered_table.num_rows), - "filtered_bytes": (table.nbytes - filtered_table.nbytes), + "output_docs": result_table.num_rows, + "output_bytes": result_table.nbytes, + "filtered_docs": (table.num_rows - result_table.num_rows), + "filtered_bytes": (table.nbytes - result_table.nbytes), } - return [filtered_table], metadata + return [result_table], metadata class DataCleaningTransformConfiguration(TransformConfiguration): @@ -133,10 +148,15 @@ def add_input_params(self, parser: ArgumentParser) -> None: parser.add_argument( f"--{duplicate_list_location_cli_param}", type=str, - required=True, default=duplicate_list_location_default, help="location of duplicate document list that are marked for removal", ) + parser.add_argument( + f"--{operation_mode_cli_param}", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + default=operation_mode_default, + help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index c0b5fefd6..e5c1e5025 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -10,6 +10,7 @@ # limitations under the License. ################################################################################ +import os from typing import Any from data_cleaning_transform import DataCleaningTransformConfiguration @@ -51,8 +52,10 @@ def get_transform_config( :param files - list of files to process :return: dictionary of transform init params """ - duplicate_list_location = self.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index ca64f336f..c05fe326e 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -57,6 +57,7 @@ "fdclean": [ data_cleaning_transform.document_id_column_key, data_cleaning_transform.duplicate_list_location_key, + data_cleaning_transform.operation_mode_key, ], } @@ -66,10 +67,6 @@ def __init__(self, global_params: argparse.Namespace = None): self.global_params = global_params self.logger = get_logger(__name__) - def execute_service(self, service_logic, service_params): - # Call the generic service logic - service_logic(service_params) - def orchestrate(self): service_list = self.global_params.services.split(",") for service in service_list: @@ -107,7 +104,14 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: output_folder = in_args_dict["output_folder"] elif service_name == "fdclean": input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") + operation_mode = in_args_dict.get("operation_mode", "filter_duplicates") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == 
"filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + output_folder = os.path.join(in_args_dict["output_folder"], output_subfolder) else: self.logger.error(f"Unknown service name: {service_name}") data_io = { @@ -145,12 +149,48 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") parser.add_argument( - "--contents_column", type=str, default="text", help="Name of the column that holds document text" + "--operation_mode", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + required=False, + help="operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) + parser.add_argument( + "--contents_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument( + "--document_id_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument( + "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation" + ) + parser.add_argument( + "--num_bands", type=int, required=False, help="number of bands to use for band hash calculation" + ) + parser.add_argument( + "--num_minhashes_per_band", type=int, required=False, help="number of minhashes to use in each band" + ) + parser.add_argument( + "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" + ) + parser.add_argument( + "--jaccard_similarity_threshold", + type=float, + required=False, + help="jaccard similarity threshold above which two documents are similar", + ) + parser.add_argument( + "--num_segments", + type=int, + required=False, + help="the number of segments dividing the hashing space for each band (for scalability)", + ) + parser.add_argument( + "--duplicate_list_location", + type=str, + required=False, + help="path to the file with all the duplicate document ids", ) - parser.add_argument("--num_permutations", type=int, default=112, help="Number of permutations") - parser.add_argument("--num_bands", type=int, default=14, help="Number of bands") - parser.add_argument("--num_minhashes_per_band", type=int, default=8, help="Number of minhashes per band") - parser.add_argument("--num_segments", type=int, default=2, help="Number of segments") # Single argument for service execution parser.add_argument( From 828ec41b4a0727f008566a3ebf7a0c400ee5c5ac Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 08:07:06 -0400 Subject: [PATCH 030/105] Python and spark transforms for cluster analysis Signed-off-by: Constantin M Adam --- .../src/cluster_analysis_transform_python.py | 1 + .../src/cluster_analysis_transform_spark.py | 38 +++++++++++++++++-- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py index 8ff6dbf2b..c35c5a711 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -72,4 +72,5 @@ def __init__(self): if __name__ == "__main__": launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) 
logger.info("Launching fuzzy dedup cluster analysis python transform") + # Launch python to process the input launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py index afb8c51b7..30f9dd317 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -10,9 +10,17 @@ # limitations under the License. ################################################################################ -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +import os + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess from data_processing.utils import get_logger from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, SparkTransformLauncher, SparkTransformRuntimeConfiguration, ) @@ -21,6 +29,27 @@ logger = get_logger(__name__) +class ClusterAnalysisSparkRuntime(DefaultSparkTransformRuntime): + """ + Cluster analysis runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params["num_bands"] + segments = self.params["num_segments"] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration): """ Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis @@ -31,12 +60,15 @@ def __init__(self): """ Initialization """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisSparkRuntime, + ) if __name__ == "__main__": # create launcher launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) - logger.info("Launching fuzzy dedup signature calculation transform") + logger.info("Launching fuzzy dedup cluster analysis spark transform") # Launch the spark worker(s) to process the input launcher.launch() From bc6b81cd231a328f3fe32bfe26b0d40529d2ee57 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 11:00:28 -0400 Subject: [PATCH 031/105] Sync spark Makefile with dpk Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Makefile | 84 ++++++++++++---------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile index d30013da8..7eb132fbd 100644 --- a/transforms/universal/fdedup/spark/Makefile +++ b/transforms/universal/fdedup/spark/Makefile @@ -1,45 +1,57 @@ -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). 
This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free -# to override/redefine the rules below. +# to override/redefine the rules below. include $(REPOROOT)/transforms/.make.transforms -# This is included in the image name, if defined -TRANSFORM_NAME=fd-sig-calc - -DOCKER_IMAGE_NAME=pyspark-base -DOCKER_IMAGE_VERSION=latest -DOCKER_FILE=Dockerfile -REGISTRY_HOST=docker.io -REGISTRY_PATH= -DOCKER=docker -PYTHON=python - -venv: requirements.txt - @# Help: Create the virtual environment using requirements.txt - $(PYTHON) -m venv venv - @source venv/bin/activate; \ - pip install --upgrade pip; \ - pip install wheel; \ - pip install -r requirements.txt; +# Include the common configuration for this transform +include ../transform.config + +venv:: .transforms.spark-venv + +test:: .transforms.spark-test + +clean:: .transforms.clean image:: .transforms.spark-image -image-direct: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. - @# Help: Create the docker image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - $(DOCKER) build -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -f $(DOCKER_FILE) . - -publish-docker: # Must be called with DOCKER_IMAGE_NAME=, DOCKER_IMAGE_VERSION= settings. - @# Help: Publish image $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) to $(REGISTRY_HOST) container registry - $(DOCKER) logout $(REGISTRY_HOST) - $(DOCKER) login $(REGISTRY_HOST) -u '$(DOCKER_REGISTRY_USER)' -p '$(DOCKER_REGISTRY_KEY)' - $(DOCKER) push $(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - -publish-ibm: - ibmcloud login -q -u "$(IBM_CLOUD_USER)" -apikey "$(IBM_CLOUD_API_KEY)" - ibmcloud cr login --client docker - $(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - $(DOCKER) push $(REGISTRY_HOST)/$(REGISTRY_PATH)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) - # ibmcloud cr image-list | grep $(DOCKER_IMAGE_NAME) +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-spark + +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +test-image:: .transforms.spark-test-image + +run-cli-sample: .transforms.run-cli-spark-sample + +run-local-sample: .transforms.run-local-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image From 4d486d35a36039783df84ce666ab03cd21c0cf59 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 11:01:59 -0400 Subject: [PATCH 032/105] Spark orchestration for fuzzy dedup Signed-off-by: Constantin M Adam --- .../src/cluster_analysis_transform_spark.py | 1 + .../src/data_cleaning_transform_spark.py | 9 +- .../fdedup/spark/src/fuzzy_dedup_spark.py | 207 +++--------------- 3 files changed, 34 insertions(+), 183 deletions(-) diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py index 30f9dd317..5522d67de 100644 --- 
a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -11,6 +11,7 @@ ################################################################################ import os +from typing import Any from cluster_analysis_transform import ( ClusterAnalysisTransformConfiguration, diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 03976bac8..29890d05f 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -10,6 +10,7 @@ # limitations under the License. ################################################################################ +import os from typing import Any from data_cleaning_transform import DataCleaningTransformConfiguration @@ -51,8 +52,10 @@ def get_transform_config( :param files - list of files to process :return: dictionary of transform init params """ - duplicate_list_location = self.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) @@ -86,8 +89,10 @@ def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[s :param data_access_factory - data access factory class being used by the RayOrchestrator. :return: dictionary of parameters to be broadcast """ - duplicate_list_location = self.transform_config.params["duplicate_list_location"] data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.transform_config.params["duplicate_list_location"]) + ) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py index 6d0e090e4..5217f2f7b 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -1,28 +1,15 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - import argparse -import logging import os import sys -from typing import Union -import polars as pl from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration -from data_processing.utils import ParamsUtils +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing_spark.runtime.spark import SparkTransformLauncher -from file_copy_util import FileCopyUtil -from file_copy_util_spark import FileCopySpark +from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) from signature_calc_transform_spark import ( SignatureCalculationSparkTransformConfiguration, ) @@ -34,172 +21,30 @@ "url": os.getenv("AWS_ENDPOINT_URL"), } -args_map = { - "minhash": [ - "document_id_column", - "contents_column", - "seed", - "num_permutations", - "num_bands", - "num_minhashes_per_band", - "jaccard_similarity_threshold", - "word_shingle_size", - "num_segments", - ], - "copyutil": [ - "subfolder_name", - "data_type", - "num_bands", - "num_segments", - "parallelization", - "use_s3", - ], - "cluster": [ - "jaccard_similarity_threshold", - ], - "fdclean": [ - "document_id_column", - "duplicate_list_location", - ], -} - - -def get_arguments(in_args: argparse.Namespace, module_name: str) -> Union[list, dict]: - sys_argv = ["python"] - in_args_dict = vars(in_args) - if in_args.use_s3: - sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) - all_module_arguments = args_map.get(module_name, []) - passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} - if module_name == "copyutil": - copy_util_config = {k: v for k, v in passed_args.items()} - copy_util_config["root_folder"] = in_args_dict["output_folder"] - return copy_util_config - else: - for k, v in passed_args.items(): - sys_argv.append(f"--{module_name}_{k}") - sys_argv.append(str(v)) - if module_name == "minhash": - input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"]) - elif module_name == "cluster": - input_folder = os.path.join(in_args_dict["output_folder"], "bands_consolidated") - output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") - elif module_name == "fdclean": - if f"--{module_name}_duplicate_list_location" not in sys_argv: - sys_argv.append(f"--{module_name}_duplicate_list_location") - sys_argv.append( - os.path.join( - in_args_dict["output_folder"], - "docs_to_remove_consolidated", - "docs_to_remove_consolidated.parquet", - ) - ) - input_folder = in_args_dict["input_folder"] - output_folder = os.path.join(in_args_dict["output_folder"], "cleaned") - else: - logging.error(f"Unknown module name: {module_name}") - data_io = { - "input_folder": input_folder, - "output_folder": output_folder, - } - if in_args.use_s3: - sys_argv.append("--data_s3_config") - else: - sys_argv.append("--data_local_config") - sys_argv.append(ParamsUtils.convert_to_ast(data_io)) - return sys_argv +class SparkServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_folder", type=str, required=True, help="path to read 
the input files") - parser.add_argument("--output_folder", type=str, required=True, help="path to write the output files") - parser.add_argument( - "--use_s3", type=bool, required=False, default=False, help="if true, use S3, if false use local FS" - ) - parser.add_argument( - "--contents_column", type=str, required=False, help="name of the column that stores document text" - ) - parser.add_argument( - "--document_id_column", type=str, required=False, help="name of the column that stores document text" - ) - parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") - parser.add_argument( - "--num_permutations", type=int, required=True, help="number of permutations to use for minhash calculation" - ) - parser.add_argument( - "--num_bands", type=int, required=True, help="number of bands to use for band hash calculation" - ) - parser.add_argument( - "--num_minhashes_per_band", type=int, required=True, help="number of minhashes to use in each band" - ) - parser.add_argument( - "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" - ) - parser.add_argument( - "--jaccard_similarity_threshold", - type=float, - required=False, - help="jaccard similarity threshold above which two documents are similar", - ) - parser.add_argument( - "--num_segments", - type=int, - required=True, - help="number of segments to divide each band hash interval (to improve scalability)", - ) - parser.add_argument("--parallelization", type=int, required=False, default=-1, help="spark parallelization") - parser.add_argument( - "--duplicate_list_location", - type=str, - required=False, - help="path to the file with all the duplicate document ids", - ) - return parser.parse_args() + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + elif service_short_name == "cluster": + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + status = launcher.launch() + return status if __name__ == "__main__": - # configure logging - logging.basicConfig( - format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] - %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=logging.INFO, - ) - args = parse_arguments() - sys.argv = get_arguments(args, "minhash") - # create launcher - launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) - # Launch the spark worker(s) to process the input - status = launcher.launch() - logging.info(f"Signature calculation concluded with status {status}") - - fcs_config = get_arguments(args, "copyutil") - - root_folder = fcs_config["root_folder"] - parallelization = fcs_config["parallelization"] - fcs = FileCopySpark(root_folder, fcs_config["num_bands"], fcs_config["num_segments"], args.use_s3) - data_access_factory = fcs.create_data_access_factory(root_folder, args.use_s3) - app_config = {"root_folder": root_folder} - execution_config = {"parallelization": parallelization} if parallelization > 0 else {} - status = fcs.orchestrate(app_config, execution_config, data_access_factory, 
data_type="bands")
-    logging.info(f"Consolidate bands concluded with status {status}")
-
-    sys.argv = get_arguments(args, "cluster")
-    launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration())
-    # Launch the spark worker(s) to process the input
-    status = launcher.launch()
-    logging.info(f"Cluster analysis concluded with status {status}")
-
-    stats = {}
-    fcu_config = get_arguments(args, "copyutil")
-    fcu = FileCopyUtil(data_access_factory=data_access_factory, config=fcu_config, stats=stats)
-    fcu.copy_data(subfolder_name="docs_to_remove", data_type="docs_to_remove")
-    sys.argv = get_arguments(args, "fdclean")
-    # create launcher
-    launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration())
-    # Launch the spark worker(s) to process the input
-    status = launcher.launch()
-    logging.info(f"Data cleanup concluded with status {status}")
+    # Parse command line arguments
+    args = parse_args()
+    # Initialize the orchestrator
+    orchestrator = SparkServiceOrchestrator(global_params=args)
+    # Launch spark fuzzy dedup execution
+    orchestrator.orchestrate()
From 19e0844bd93f52b9e02277a70065221d981bf477 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 11:03:02 -0400
Subject: [PATCH 033/105] Bug fix

Signed-off-by: Constantin M Adam
---
 transforms/universal/fdedup/python/src/fuzzy_dedup_python.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
index c05fe326e..acb1be3bb 100644
--- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
+++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
@@ -76,7 +76,7 @@
             self.logger.error(err_msg)
             raise ValueError(err_msg)
         service_short_name = SERVICE_DICT[service]
-        service_params = self.get_arguments(args, service_short_name)
+        service_params = self.get_arguments(self.global_params, service_short_name)
         self.logger.info(f"Got parameters for {service}")
         status = self.execute_service(service_short_name, service_params)
         if status == 0:
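Both the Python and Spark drivers now share one pattern: an orchestrator walks the requested services in order, rebuilds sys.argv for each one, hands it to the matching launcher, and stops at the first non-zero status. A stripped-down sketch of that dispatch loop (toy names; the real classes are the ServiceOrchestrator subclasses above):

    import sys

    class ToyOrchestrator:
        def __init__(self, services: list[str]):
            self.services = services

        def execute_service(self, name: str, params: list[str]) -> int:
            sys.argv = params  # each launcher reads its configuration from sys.argv
            print(f"launching {name} with {params}")
            return 0           # a real launcher returns its exit status here

        def orchestrate(self) -> None:
            for name in self.services:
                if self.execute_service(name, ["python", f"--{name}_dummy_arg", "value"]) != 0:
                    raise SystemExit(f"service {name} failed")

    ToyOrchestrator(["minhash", "cluster", "fdlist", "fdclean"]).orchestrate()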
zjMJ}(Ijpp)fIxHE=5veqo-lY!lm9mj!_MvBZUfcdrWI{z> zWmSDs=W)Nvm?S6KH_g|19cCn_x#kD;Y{#FN4}M4eYmnT~0*+{SIGKTE7jTA(t4N?} z=me0Wwl>7+lP`}bfh+BfnBE;ZuMmq_m=d*m>;=;lXl1`HuAok{(+qao=G<9hPcvB9 z)inVTd?}c(jK>o-BXoXA56zxk+}C+6e(+UzuYRyeC94E5LQ|RH_ilC?OO6O^Fpp6W z_YG{3P#w@VD3qIRc8h_wzinzic~gWOZisSC&!uh4H^3syd-g}ulC-P_R&6ic(mi)>`c^eUZ?n`G`)#CoQN-rZgY!0y89tn(=Y@LKQtlqKe~M?5N{giBF}Z%NU$=Zx5xR)!*+07M7n^)4?he61>nlz&|mBZarW~ix%!d< zcZsb@1+guN$^kLB4F}hlFLoDZb(~vuwc${Gt(6e%I-g|a z(AE?39;oB=m-%7yyb44F3;`2hLpM;~rPu#htBlx<5D`tW)-<3$*-C2!a#W_?6>UE;?FkV$0SNCOPj`p_XHTryel#c7c; zrh6ABAs|4KBRW7xqJB*=bne^Wns4K{QkQ03^HcW(x3!@hb3^6cqhs?Ax5wQ33fdAE zKK*n{6<=-`>6}1g=P-5@nF&xpj71Q~-@}B(fdx1VeqNaR6EIU5ADNEwlAjJeql^dV zv`4!S2b{VR^}$6&nT_-9k>^h@(?V<=Kl%1Mp_fq-!{kUWVQ%%hD#2Y&W>4UqX~Pv4tDOPOBGsonNn9H^EZ* z?t^vT_=X(D_ewiXk1t)YyCJE^yDy66R!~oa+uwDjb89FuQPD^jzBJ)6dFgmP9hM(z z`zTswEJwn@lLvX8qG$)F>C2f53fx&c6524wZN1tSuvXIbO}${? zDi8^Dm;UQrfbq`x64ckZoCOh%l2bkTqnn+?9$6FLvWof5a>CZ%Oww(yLZfdLRXEN66=Um z+hWVXB)ewqnc3W|nfIf}kwyQlR*02d+x)T)wXgHpwb&bA9`{SWi61X~G%?Ig-dHR+ z_dZI#txXu4=3>-{lGJNm#iyV}z4@#yf8-9Y;9ZN%gO8nvDi|0XtnPh1q8Wxfa`BA& z#?k4r2#f68@v|p#Lp=qiLz!=>gqR#PqaVzw&8Uqjo(lCMs|8tK{mEE9l#~15i5lSt zy5>O#nyz^!Q41TV>=L`{l5XpN20SzCn*Fu#9jd)8hmkQ%7+Q;!ehd`I*UqQ&Ei^%Q z!i?LFm0LR9TrQuZZgF+!+mp*r7BsCNFjJf$ml@lB#f0@^P%`s)U zLronFW99v4;6f&Zrtx^Tnw124gMrhM5|ZREkR{=u^ksx)OLy|61aCd`x<~9 zr!x~6j@jT(AA$*Lt%EAP_*P5qdA?xW-(p?*C zwR^y`yDg~2u1C0u{;}y=$WATUvZ%RWbd@vSO>%fcj;2OKG{ba~_fJ%uznE%v0dP}8 zA9k8C&(A!xPm~DE8-O6QxTX{n+YP7?E6O^u9%2jQO_AbbRb(UHqNqja9i;y5@NprQ zq!B+kP;oMV01^R#8zbmPimv5=;M;_=m+1qi8rkfcUP00HYv?o`Zb!r;(Z&Ci^wIFJDQ4(blT=Y8(u@2>9roSM5 zDAw56&;k?aeBRZ^kv|ozzz}sSEM0UMo%%)kJcPJ6-(AwjVgs?Kuvq#?#2z-H?*|7n z_Vut>_dPqhiq&RxSZzjU7|IA5$WVVR?%HYN>2rES6Gj~RXuRitmSKm*;?(I*3Uq(^ z7Q=V^0A5tVVv`jsbxJa`ZA|R9Ze50F3B= G1^gQ$Q;6pP literal 0 HcmV?d00001 From 5e4022cd8289baa46ac09036f91387f67d01f16a Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 14 Oct 2024 12:46:47 -0400 Subject: [PATCH 035/105] Setting input test data for ray Signed-off-by: Constantin M Adam --- .../fdedup/ray/test-data/input/df1.parquet | Bin 0 -> 4111 bytes .../fdedup/ray/test-data/input/sample1.parquet | Bin 36563 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 transforms/universal/fdedup/ray/test-data/input/df1.parquet delete mode 100644 transforms/universal/fdedup/ray/test-data/input/sample1.parquet diff --git a/transforms/universal/fdedup/ray/test-data/input/df1.parquet b/transforms/universal/fdedup/ray/test-data/input/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2584725bbf8e96cb852fc2702f8014fe689c7116 GIT binary patch literal 4111 zcmaJ_2{@Ep`+sI^6N4Btp|LY4vJGjHjK&x#jD5{cb}=F3A`$5n2di6a=70 zidTVm3H~Y85E>99L13yzJ}F#^D*ADmuI_RCy|)!dA_WNf9rXh@nJqoNeXlTBn09pulc~MaCszeqR%*qCM}ZB{?MuDm;H z?K@$HPYB7Ej()n*E}5|0UgR8Aq)lOAtw1(@HZ_Wy`ucqdkIGDfuIoC!1)ZWV6V4D74nOQ2^`Yu zRk;JPSg?2%`5}4JdNzWXd42tu54OA7NR~DI@1n%YB+f8zr&}%QNkh=|N7~Xy2FrQ_ z|Jss_5iA`zOC6@nHmyX|thyFb%nk)!R-q@J+))aZxPDYYZH{vDb9J$PXZgpXic)<& ziGi1|vhX{&+>QA=>^c@4MV&R7CcM%plLb(+BBnyL!#wmTVM|$w%oplASXUxhD|vk6cr~TE``v1b(LGk~(q@`(kSJMxkLt{+wo|D{;z?ed^*@l@ zH}y5dT}b{-{eCVEm-}(9<@_g`F@#|=gDcjw`W^TXzw zSZipt`{sGp+UI9M!&Uwx#IJV@^hlwj<>O|B zfp;t(e|22-=U(u)sH=#~?jP{YRA?&eD!?@u>^NtV-h`2a2)4BOI8d@i7eVcbbj@n5ioLk*9s8-%KjN}tw?>haERm-h8gV?Gn+kHi!GiZq zjMJ}(Ijpp)fIxHE=5veqo-lY!lm9mj!_MvBZUfcdrWI{z> zWmSDs=W)Nvm?S6KH_g|19cCn_x#kD;Y{#FN4}M4eYmnT~0*+{SIGKTE7jTA(t4N?} z=me0Wwl>7+lP`}bfh+BfnBE;ZuMmq_m=d*m>;=;lXl1`HuAok{(+qao=G<9hPcvB9 
From 5e4022cd8289baa46ac09036f91387f67d01f16a Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 12:46:47 -0400
Subject: [PATCH 035/105] Setting input test data for ray

Signed-off-by: Constantin M Adam
---
 .../fdedup/ray/test-data/input/df1.parquet     | Bin 0 -> 4111 bytes
 .../fdedup/ray/test-data/input/sample1.parquet | Bin 36563 -> 0 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 transforms/universal/fdedup/ray/test-data/input/df1.parquet
 delete mode 100644 transforms/universal/fdedup/ray/test-data/input/sample1.parquet

diff --git a/transforms/universal/fdedup/ray/test-data/input/df1.parquet b/transforms/universal/fdedup/ray/test-data/input/df1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..2584725bbf8e96cb852fc2702f8014fe689c7116
GIT binary patch
literal 4111
(binary data omitted)

diff --git a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet b/transforms/universal/fdedup/ray/test-data/input/sample1.parquet
deleted file mode 100644
index 58387d07daf4381a020444fc5dee676b1360ebb2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 36563
(binary data omitted)
From c14bdaa471f2338bbf88390f6c0d94176ea792b8 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 12:48:43 -0400
Subject: [PATCH 036/105] Bug fix

Signed-off-by: Constantin M Adam
---
 .../fdedup/spark/src/cluster_analysis_transform_spark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py
index 5522d67de..feeb3241e 100644
--- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py
+++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py
@@ -45,8 +45,8 @@ def get_folders(self, data_access: DataAccess) -> list[str]:
         :param data_access - data access object
         :return: list of folder paths
         """
-        bands = self.params["num_bands"]
-        segments = self.params["num_segments"]
+        bands = self.params[num_bands_key]
+        segments = self.params[num_segments_key]
         folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)]
         return folders
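The two-line fix above swaps hard-coded strings for the num_bands_key / num_segments_key constants that cluster_analysis_transform exports (the Ray runtime in the next patch imports the same constants), so a renamed or prefixed parameter key cannot silently diverge between the transform and its runtimes. Below is a minimal, self-contained sketch of the folder enumeration; the concrete key values are an assumption made only for illustration:

    import os

    # Illustrative stand-ins for the key constants exported by
    # cluster_analysis_transform; the real values are assumed here.
    num_bands_key = "num_bands"
    num_segments_key = "num_segments"

    def get_folders(params: dict) -> list[str]:
        # One folder per (band, segment) pair, matching the band=<b>/segment=<s>
        # layout that the signature-calculation step writes.
        bands = params[num_bands_key]
        segments = params[num_segments_key]
        return [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)]

    # Example: 2 bands x 2 segments -> 4 folders to process.
    print(get_folders({num_bands_key: 2, num_segments_key: 2}))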
From 1215ac5ab9f1c8c04e55252bc25aee305707d620 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 14 Oct 2024 12:49:15 -0400
Subject: [PATCH 037/105] Ray orchestration for fuzzy dedup

Signed-off-by: Constantin M Adam
---
 .../ray/src/cluster_analysis_transform_ray.py | 48 ++++++++++++---
 .../ray/src/data_cleaning_transform_ray.py    | 10 +++-
 .../fdedup/ray/src/fuzzy_dedup_ray.py         | 60 +++++++++++++++++++
 3 files changed, 107 insertions(+), 11 deletions(-)
 create mode 100644 transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py

diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
index 970686e13..a0e8e7de2 100644
--- a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
+++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py
@@ -10,9 +10,19 @@
 # limitations under the License.
################################################################################ -from cluster_analysis_transform import ClusterAnalysisTransformConfiguration +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess from data_processing.utils import CLIArgumentProvider, get_logger -from data_processing_ray.runtime.ray.runtime_configuration import ( +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, RayTransformRuntimeConfiguration, ) @@ -20,11 +30,31 @@ logger = get_logger(__name__) +class ClusterAnalysisRayRuntime(DefaultRayTransformRuntime): + """ + Cluster analysis runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + class ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration): """ - Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher. - NOOP does not use a RayRuntime class so the superclass only needs the base - python-only configuration. + Implements the RayTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the RayTransformLauncher. """ def __init__(self): @@ -32,11 +62,13 @@ def __init__(self): Initialization :param base_configuration - base configuration class """ - super().__init__(transform_config=ClusterAnalysisTransformConfiguration()) + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisRayRuntime, + ) if __name__ == "__main__": - # launcher = NOOPRayLauncher() launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) - logger.info("Launching transform") + logger.info("Launching fuzzy dedup cluster analysis ray transform") launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index 831a6c9c2..e83960c24 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -10,6 +10,7 @@ # limitations under the License. 
 ################################################################################

+import os
 from typing import Any

 import ray
@@ -88,8 +89,11 @@ def get_transform_config(
         :param files - list of files to remove
         :return: dictionary of filter init params
         """
-        duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default)
         data_access = data_access_factory.create_data_access()
+        duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default)
+        duplicate_list_location = os.path.abspath(
+            os.path.join(data_access.output_folder, "..", duplicate_list_location)
+        )
         if duplicate_list_location.startswith("s3://"):
             _, duplicate_list_location = duplicate_list_location.split("://")
         duplicate_list, retries = data_access.get_file(duplicate_list_location)
@@ -117,6 +121,6 @@ def __init__(self):

 if __name__ == "__main__":
     # launcher = NOOPRayLauncher()
-    launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration())
-    logger.info("Launching transform")
+    launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration())
+    logger.info("Launching transform")
     launcher.launch()
diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py
new file mode 100644
index 000000000..0b9be33ca
--- /dev/null
+++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py
@@ -0,0 +1,60 @@
+import argparse
+import os
+import sys
+
+from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration
+from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from fuzzy_dedup_python import ServiceOrchestrator, parse_args
+from get_duplicate_list_transform_python import (
+    GetDuplicateListPythonTransformConfiguration,
+)
+from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration
+
+
+s3_creds = {
+    "access_key": os.getenv("AWS_ACCESS_KEY_ID"),
+    "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+    "url": os.getenv("AWS_ENDPOINT_URL"),
+}
+
+
+ray_worker_options = {"num_cpus": 0.8}
+ray_params = {
+    # where to run
+    "run_locally": True,
+    # orchestrator
+    "runtime_worker_options": ParamsUtils.convert_to_ast(ray_worker_options),
+    "runtime_num_workers": 3,
+}
+
+ray_params_argv = ParamsUtils.dict_to_req(ray_params)
+
+
+class RayServiceOrchestrator(ServiceOrchestrator):
+    def __init__(self, global_params: argparse.Namespace = None):
+        super().__init__(global_params=global_params)
+
+    def execute_service(self, service_short_name: str, params: list) -> int:
+        sys.argv = params if service_short_name == "fdlist" else ray_params_argv + params[1:]
+        if service_short_name == "minhash":
+            launcher = RayTransformLauncher(runtime_config=SignatureCalculationRayTransformConfiguration())
+        elif service_short_name == "cluster":
+            launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration())
+        elif service_short_name == "fdlist":
+            launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration())
+        elif service_short_name == "fdclean":
+            launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration())
+        status = launcher.launch()
+        return status
+
+
+if __name__ == "__main__":
+    # Parse command line arguments
+    args = parse_args()
+    # Initialize the orchestrator
+    orchestrator = RayServiceOrchestrator(global_params=args)
+    # Launch ray fuzzy dedup execution
+    orchestrator.orchestrate()
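The orchestrator above runs the four-step pipeline in-process: for each step it rewrites sys.argv with that step's CLI parameters (prepending the shared Ray options for the Ray-backed steps) and hands control to the matching launcher; only the duplicate-list step ("fdlist") stays on the pure-Python launcher. The same dispatch can also be written table-driven; the sketch below is illustrative only, with placeholder launcher callables standing in for the real configuration classes:

    import sys

    # Hypothetical stand-ins for the four launcher factories used above.
    def launch_minhash() -> int: return 0
    def launch_cluster() -> int: return 0
    def launch_fdlist() -> int: return 0
    def launch_fdclean() -> int: return 0

    LAUNCHERS = {
        "minhash": launch_minhash,
        "cluster": launch_cluster,
        "fdlist": launch_fdlist,
        "fdclean": launch_fdclean,
    }

    def execute_service(service_short_name: str, params: list, ray_argv: list) -> int:
        # fdlist runs pure Python, so it keeps its own argv; the Ray steps get
        # the shared Ray options prepended (params[0] is the program name).
        sys.argv = params if service_short_name == "fdlist" else ray_argv + params[1:]
        return LAUNCHERS[service_short_name]()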
From caf79a30b1c24892e1262009d57b29a271993c73 Mon Sep 17 00:00:00 2001
From: nelson
Date: Fri, 18 Oct 2024 09:41:01 -0400
Subject: [PATCH 038/105] Added python test with expected data files

Signed-off-by: nelson
---
 .../docs_to_remove/band_0_segment_0.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_0_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_10_segment_0.parquet  | Bin 0 -> 1505 bytes
 .../docs_to_remove/band_10_segment_1.parquet  | Bin 0 -> 905 bytes
 .../docs_to_remove/band_11_segment_0.parquet  | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_11_segment_1.parquet  | Bin 0 -> 905 bytes
 .../docs_to_remove/band_12_segment_0.parquet  | Bin 0 -> 905 bytes
 .../docs_to_remove/band_12_segment_1.parquet  | Bin 0 -> 1505 bytes
 .../docs_to_remove/band_13_segment_0.parquet  | Bin 0 -> 905 bytes
 .../docs_to_remove/band_13_segment_1.parquet  | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_1_segment_0.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_1_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_2_segment_0.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_2_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_3_segment_0.parquet   | Bin 0 -> 1505 bytes
 .../docs_to_remove/band_3_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_4_segment_0.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_4_segment_1.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_5_segment_0.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_5_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_6_segment_0.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_6_segment_1.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_7_segment_0.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_7_segment_1.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/band_8_segment_0.parquet   | Bin 0 -> 1510 bytes
 .../docs_to_remove/band_8_segment_1.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_9_segment_0.parquet   | Bin 0 -> 905 bytes
 .../docs_to_remove/band_9_segment_1.parquet   | Bin 0 -> 1497 bytes
 .../docs_to_remove/metadata.json              | 58 ++++++++++++
 .../data_cleaning/cleaned/df1.parquet         | Bin 0 -> 14986 bytes
 .../data_cleaning/cleaned/metadata.json       | 59 +++++++++++++
 .../bands/band=0/segment=0/df1.parquet        | Bin 0 -> 2753 bytes
 .../bands/band=0/segment=1/df1.parquet        | Bin 0 -> 3122 bytes
 .../bands/band=1/segment=0/df1.parquet        | Bin 0 -> 2862 bytes
 .../bands/band=1/segment=1/df1.parquet        | Bin 0 -> 2537 bytes
 .../bands/band=10/segment=0/df1.parquet       | Bin 0 -> 3305 bytes
 .../bands/band=10/segment=1/df1.parquet       | Bin 0 -> 2537 bytes
 .../bands/band=11/segment=0/df1.parquet       | Bin 0 -> 3450 bytes
 .../bands/band=11/segment=1/df1.parquet       | Bin 0 -> 1354 bytes
 .../bands/band=12/segment=0/df1.parquet       | Bin 0 -> 1354 bytes
 .../bands/band=12/segment=1/df1.parquet       | Bin 0 -> 3442 bytes
 .../bands/band=13/segment=0/df1.parquet       | Bin 0 -> 2537 bytes
 .../bands/band=13/segment=1/df1.parquet       | Bin 0 -> 3413 bytes
 .../bands/band=2/segment=0/df1.parquet        | Bin 0 -> 3177 bytes
 .../bands/band=2/segment=1/df1.parquet        | Bin 0 -> 2758 bytes
 .../bands/band=3/segment=0/df1.parquet        | Bin 0 -> 2745 bytes
 .../bands/band=3/segment=1/df1.parquet        | Bin 0 -> 3122 bytes
 .../bands/band=4/segment=0/df1.parquet        | Bin 0 -> 2537 bytes
 .../bands/band=4/segment=1/df1.parquet        | Bin 0 -> 3413 bytes
 .../bands/band=5/segment=0/df1.parquet        | Bin 0 -> 2753 bytes
 .../bands/band=5/segment=1/df1.parquet        | Bin 0 -> 3122 bytes
 .../bands/band=6/segment=0/df1.parquet        | Bin 0 -> 1354 bytes
 .../bands/band=6/segment=1/df1.parquet        | Bin 0 -> 3450 bytes
 .../bands/band=7/segment=0/df1.parquet        | Bin 0 -> 2667 bytes
 .../bands/band=7/segment=1/df1.parquet        | Bin 0 -> 3289 bytes
 .../bands/band=8/segment=0/df1.parquet        | Bin 0 -> 2845 bytes
 .../bands/band=8/segment=1/df1.parquet        | Bin 0 -> 2537 bytes
 .../bands/band=9/segment=0/df1.parquet        | Bin 0 -> 2537 bytes
 .../bands/band=9/segment=1/df1.parquet        | Bin 0 -> 3314 bytes
 .../expected/signature_calc/metadata.json     | 62 +++++++++++++
 .../test_cluster_analysis_transform_python.py | 46 ++++++++++
 .../test_data_cleaning_transform_python.py    | 49 +++++++++++
 .../test_signature_calc_transform_python.py   | 83 ++++++++++++++++++
 63 files changed, 357 insertions(+)
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py
 create mode 100644 transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py
 create mode 100644 transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py
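The expected-output tree added here mirrors the pipeline stages: signature_calc writes one parquet per band=<b>/segment=<s> folder, cluster_analysis writes the matching docs_to_remove files, and data_cleaning writes the cleaned table plus metadata. A test can then walk the expected tree and compare tables pairwise; the sketch below with pyarrow is illustrative only and is independent of whatever test utilities the added test files actually use:

    import os
    import pyarrow.parquet as pq

    def parquet_trees_match(produced_root: str, expected_root: str) -> bool:
        # Compare every expected parquet file against its produced counterpart,
        # keyed by path relative to the tree root.
        for dirpath, _, filenames in os.walk(expected_root):
            for name in filenames:
                if not name.endswith(".parquet"):
                    continue
                rel = os.path.relpath(os.path.join(dirpath, name), expected_root)
                produced_path = os.path.join(produced_root, rel)
                if not os.path.isfile(produced_path):
                    return False
                expected_table = pq.read_table(os.path.join(expected_root, rel))
                if not pq.read_table(produced_path).equals(expected_table):
                    return False
        return True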
[... per-file "diff --git" headers and base85-encoded GIT binary patch data for the new parquet test files omitted ...]
z>3R+EYTg9b`El@SK8>Os{b9JTL{sN+STae21;6e@bWJWMZt=vKOgUD!enoC{BxpR< z3D>%~;NXFt5LOLEU}GC9;=PdEuomh~L0H52VpZ2h%zV~>n;3#Cu_AZl{Wr02xGU6O z#-ifx_oSet3$_a0&{x=@+E0d-?c3ouGX^P-<=9cx8`>&4w$$iw{JJBKUXrV|RGpC`ZVjZ4MJA!4g8?ibe95+^QI3#@{JaI%E0^L~RWaW)pq#rmsa(jf?kc}~WE%;}-zU^o z$mKHMOs-e>Hd*%34B5(qKVrwQ6r#8_n6vNC@g?G3#CrXLB%(nJXMH@ZEj* zWm7yH(xXw>-Jkn${2t6Zv5&tzG8~l$CUI?hW#rD%d=5_Cr;)(YdZXPHOaeN zC8X^q9?*S(z_*6*?$hlVh}`GJ2fUxSpZ^`|1a@UAfjv-4KNF9RbmE_$Bg9Vj;1*r) z=LN+e5RMmQ@(v2Sj-BisIy*XbadvSPm2N7vdsj`j?mawudU|>LXnXng?$g)Lzh6LL zQ1Bc5LqZ1(92EBE;345}MM#lRL!)D2Y0N+NGZ?>r=CQ#iuJ*)vjK%_Oo@Lf3beU#=1?Lx70U$x%I1U+Z%W6{Cd~!J$su{ zn!nk%|G>dRhg*&uJ$C%W$y2A#oIQ8`!o^FMuUu{Y_S#>y=TsVzv*LMNdtkKF=NlNd z=bD7Ve3Q9DZecD1O^T5FdF3u=cNn6HGvc8zGSJu5lT%31)Wmr>p( zp;Fjn7|OqVN|j$k`MX_Thw_fkF~2a^kY~0wEGy1mbJwnLr#Szz_1E}(Rf6iO5UAv) z?8&)$ohZ=0=EJ?v9(^=PTU;V&6GjN9YF}khZWV6dhM3&%^%pr&o-~+g3o6T9uolWu z54Jn;ai{h@n0<@mKPC&Mpw_69{>Q!hPwJ`vmlv!3vtE+&wo)l*70N0%S3s>%*-NRZ z!_&g74@Zs5nJ2|aQl1o*XE}?c0>juUN|NH)O!u~#?kr0a zv%m~V!{6x}5HOG~qLbs;KB|;%=}=0y?NdjMV*3O( zSW~xzZjEOOJ&QE!(n5^sW0H*-V=SptelTizbbn zW=HwyQe>(1o-DyKEh-~fDrr@+Bxyajq@J#iDWQHWc`|)!pQ^H}u}VVaQKT!LT9Rw_ p&n_zR*A*8VOFgFL7Mt>o29Kc7enI^L)kzxQ`3Fq`kV5}0{0G#3JWBuo literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0c68b087a629133e6561c3090078ae686a5e13e GIT binary patch literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b*Q7FE6@^QNEq`gZ>R`3g|aCYT4}MgtPeo9DQ+{t zg+(-)VKl;`G2&~^u>=7v>X@RNuf5o^M7C^G=hP*bP1&Nm_v^|-<=-+-+I!DE?{n@w zCtp5asN+~JgbOopn1fINiOSfGOTsm~-{61?<>VSd58=d|Sd{=#k^oOJi~)c&0+SUZ zghAbkR2-{}2XTl}AuTgG%%xV7+sa`%mNy}JGg2q2CF5CAYDg%+VIWHg*#I64jU_Y+ z4~C|Y1UMs@oP_%@l_>ukE~eq$V}`?xl9`?5-YSd3Q))JOOfnlmZ06E(r`=oS5EAG> z@*bqViPgujYQ)b7Hjc+frd1!ry%S{M29Qwn9_F?WWo*cUg@3O{?bZ+3#Bscx@*G-^FUthLQxhkuZoA`!kuAG&-FT+LKcD{^vwChmU%uc4Q?>>6vvnx!BZ(7idbv1b3(2!3i!U(YD}(8yyo$J+NJDRRxMf&+ zg$kAylYr-Uw}@=ay$c9Y!T@4}rqtIjn-`^cs^RUjQY*p ztZB2i=Iz;B*;{#J>7izO#=dVI9orqJuHRkV(^{RS3Eyyfm#fd7)N)~uC+9%P2mSkN z&fZw%8#q*Fh}m}jYWu3s9bK2z=O(B)>~G7gvF}`3WpmILwYXWk&Fu-Y*e$dN$b%n1 z@=H?rk5YBBO#PdPHtpzK9{H-r%#nWD!2}*|)&NH#31GB!0{94(D35N|{?PH7b;5A# z+AWR>kAL3y+x7HBb@N2ht`A!u$LFIGQ#QtfLhLTLTB=MOOU6MPxX!A3^O4jvq`G|F9ITY%GO%UeEJ<0r^wN<@s7( zU%_Yad~i#^ORNU5ULIMFUS_^O^Qt*Pq3H!p5;@6^@Qu8_)_+ezuvCl4@Iod1NO7+s83X;1+S|Dc#?xG+ddzW2Y5Ge~N$U=lz5V=&56%h-?N|Pe&u5@DoH6*A6BhP{j zG}u5PikK)x1cSXqY{Zw?D?VG~8Ka42?p;xN>L2g>-uH4on3*$k%I}=H+}|aH4Rv8C zM#&6NF#-m90!Swo#%8 zr>#`fX$1h?YzRDx5J7GPMKh8!l*|nTMlz{hbd)-FNNWMT&5R=wqQ| zju;y-1mM94PXJi=Vg{-(1`ewwg+LDhsVDRT8lQ3^&`q367qn7Q=ZRE=UfW1;1@G_T z!`BrM1bG7#2dOs|*c-%ZVlqxPTW$guf-9k{M;2H(SECb^a%A050pdl&!MfoDXm<|)^#cji z_m~LFrpaOPAK!sraU4wDZUrYN381uXC%V=RLCx{*aM9QVnw$CnwayomHBTTf&>DiP zHh`?c1=bVxu&!4P%z0G>H(@B84`7IEpAUh;QDTsN8vuFj*HKc2G3=mhASF)^iX3TZ z*|iHC=lDa&a{=rwZ~#St0JaxL!I8^`aOeyTXG)Z4MEM4=NjnBFX8D8l9cMTjT?Ch3 zsv$c)42tv0;8o^9SQ)StN=JCYwbcZAk#igR<(q?5r7JvoJRde+umc-)Pt>vgDKd+_ ziPG&6Xw}x>za$t^tHu&F2|KBR@|W;i)c~~5skpnyDhM|B*-T9}D+QZLwp2)22dWJ` z25keEQWaBLyZ3cINq!aR050cmLJD<^Xjr}n{cd-haMiR6vrnIdmU%y;qZ{raS5qxf zqKSpyYd4a5_MUXbs?$)l|2eV(6*2e+B`OazK%<-Mp+S*~4&=5H^J+;b%$`8Bj`4&0 z?oSBm_5iZ9Ngv#Ig`t{_EpR!1IlL-JCHpw%5FP^u(TPR91nmc_kxODDRgxzVjF~&E zTd}rRw`7Y;^xEc!Aa+7@>GKhU{s4l^VCo>i`WYN?z0vK?-A`#T7{I{Ujqt1@5DX&x zAlbr+csRKh79MRR&yDef{HAHdlR6rm8F>giJZ!1my`MmhIFqQ%^92jvAgDiOPKpa( zAln7Iz|ijnT%A${q=%B!FkT?a$fx#BT_$+GG!1UG%!kznchYOhCJ-exE~w6~mi%PP 
zpX9{jw@J^&K0@ofOk$_=1>xLD?(}})13Fn5K^}%Q7`!S5a_$bNd*;pr$367|;|mBq zc}fCAZ35-6p`_VNeHin{LUcRm07~zvhhiH&=)I}q2-k*XhqGpM z`H@#>o9PkKaog8qT~Gj;)fNCN((*~uNk5Um@h0=xy1@%G$n8);+&>*S zXz&n^p`OFMSZ^O+KmUN?fg^%cBZEhUj1C<$cHH=|@QBC>6DLicGBqkXCN^$bd_rQ< z^yCz^Mmr-lEj=SMD|_Z=vu4kko0B_l{({f*zW6eK;iAO_UllI-dTG%=mMvegvUt_% zHEY+El&;_KP1(jxo40H&uh_P|vg+F%JF9oq?EdciJ$q~G>Kpbo?r&;7@I%YNLx+zX zJ$C%W$x}a`Zas7M-1!S_KVAIi&%a!{ti8hJqqX|oXJ4I*wBl>mxn!MoAfj-%Z>^Di9jJ{Ro)(#jv|z&}D-cuZ^IR~+~W z#%+8B6vkTp4vZGZ`}b2GtN9h9J9WP`>@(NGGn`P~2}zM}E@ucG#SQOD5PB(zYL4#W z(x~Lbv~*!yavVnAM&FK{`+}$~Pn}s?!G=dBky?h^6iI z{66VnJ6@!sPFs%j-rvPWZ%Gf+_72OBk)Gij3QUeu$EWM4{34QELW+{!idG9ud+&$; zNYW40DJyUTsUR&TAuc6~p)ljy67PIC*veJgO}sGogBv03+H6LkGw#aDoZZ`NUcKC|M;usZ~DaezdX2q+mj?|mrFzn zk)*X3epAZiQhhcwbX3@2-Ge1vPdRhq#(J|XI~xCT62!7utam)?&$98XPdvZpjSWU} zD<75(py6EM0jWdcb*c0@y$%a10T2Wz&B0BZZapO3B1UKkP=b`KHOviiP=}}>BnuzdVO=LJV}Jc9Mk(%lmemh-|kEZ<3;6JMlmpD#Tk#Xke*;p^o5aeVB` zU5(BXq?X#K)EOCZ=}s|Pty5HLswT@aGcGkPQKPnWaqsWaUnx_`0p9#TJp~Yg|DpN| DjS{VmT~_Iw+{XIF61pDuM{kz$k~;?h64b@P{*we3R_%x8MEk z?k_2pN=*VQ#0qg%GC>qTq}gBo>h$ktooYUGY0Ds!{ak+mgeXCiAn-yY`SQASh*t%5 zL{(4=077Fi8F_>l-taEc39L2^_(-i7T4Q%Rs_X5(dI4*AO&ij@g$#KH)#E&A6KF01 z1~NpF34riMA<0U@(-cLPfly>-iGUIpn5ec|5bx(Hs)7cC3lRO zI54!c_mYo~aCmB)8l7%`wZrbWM>LQ{gQL3E<7#SjE9HENG+!aZCdP0Y>;6T^M;7^( zO4y^a3eRDs&JFC1Xq>aQ2)w%{<9+qJ+0{G#gx_|hqW2Rw^->Kfgf|Ai!Oo1?260t- zws`vu)RF%YT*z3*wpX9+H6r;m^L+krFrE7z8rhHVp^f{|tzjSI6wl?5`Dac;_p-0h zhp+sEQu@hwtH%MiI$mQ!<1@MT&1YcO;oE2kSn-$-jA%!G1S&b&35N_`bZqHaysU$P zHS?$8vt`-vbNY2G?#yMj9SMi@y%lKRYu)fo>qhu%wU-&3yabywMsp48`}3EN??I-9 zE_Ulmo-bQGE^L$BKg>1oYwqQv*Fc?!xNWx!aCjIhWDKoCF_?*<1@tF6D7i zOIsjee<$DfJVMuhC3nsKIvB@|Wnx;wq3rinNdICtTr50>{86FE(PieoOg5smmFW<= zF&en|Dm*N64_AAlk}K#GVf~&dXng_)!#|JVcAWYPz1{B=lkoNn%)$$~aKKxR4;JPk za?>wxLI;x|v~l+zrE=^dOAm<96&%}#Ow6C3A~b;erG>29I962-2s13lhp0lsdWA>y z?i1NJDq1aQ`ia_@{<;AJ2gME^GBi$a7#2T##K?riq-3Kh<*C%P^idh3$2@Hwn>lX0 zWXYP4os*lFUr=bBSTw1)q_k{u`IL&Ow#sSKt7cTswAa)+oU`ic8)naOHM%_@3$ki% zxR;YzpFdLub<;fJyIGJU=eH~vwa_91EaGH%MvxO069-F(gQd00oDniC-$G>iBr>cZ z3K<u@Y0PY}GstZu;B< zAD7cz=MUiLNu>D{8E&c#`_%++9`^p2n2#z@yE+X--d9`iY_tn3fmuC%0Il^p>(1!$ z1aKh@i#@w6@B!tXcu4i`_g}yQZ#7eER#eV{B044!f@h+SA^u~jeug(ysRI>}D$mGGn-Lbf1ZT3$~10y+drS5gn;Gl%S1{Pv18k8Ns^r*f)NI*C^j zsIOph&wNO;2tAMwZKXwaFn&qC(r?miLc!Kc(geE9b+|0Ko0p^!a`Vo5S5)e*T(vNQ+INlBJk5gkeCWWY!%GO(|dZfneG zB5{;DsXg&eOzF`CG(n7&?cTXfPJd#pEGOE%UQcuEJg3*!;Bm*A(vwU{My*u`aPI@1 LdYLcb{w};tD~p(Xb0!sq_I-!@+{FVLTDDLDfmvCSH$jHZhG!^-w zb5Y+ap|yDu{MYos_m}MPXrT{oxAa0*Z&$K@gAohHyTWH#3yk0S0D&%e(3wN!4|&Mg z{%>4ttAji@92-Wf@%eox%vd3jvLbsfUmcG9TZ3?PWh^}P0eCR1jht!dizmv9@ywx(99} zr|LDtt9cV#=f}aT`4kFw^n>A^5>1`QV#y>87X0!xqN{T-af>HTkCUUe^(%6{BSGWI zPPp2|1qb%`gs^G|f*K#9Jl+e*4Qrs@6pYoJA69j3#LQ<6xPd{q94m6yKX?lZhq*%i zWh~0?d{6R=yI`x}4Sl&Cs{CbW*}fhAGh>kQM2;Pmy`Zg>V@tIT$F4cz$ORcL)C7^C zb!*^lI)SG%V&HWv5EnD6aIMXNQga%r%h%%Bw8K~yyAicRBXE5MN1m45M4yUo@T?EP z{ReZf{)#WW4bJ4z>Ys>v_6=h8BPcX@A!boBM*K35d}^xTE9=_uxS=2UCZM{L!ZQKu zd#)EIy4S*O+_!wn+DD|d|6<{0!a-8%c#V7<-wVN)Z$K}c;F_22BxYwjqR;yqS9A`$|AmJU>SFJQkjrf)m47y@HP^hw^yhs zm&;{-<2c`lZL+K*8M2iJe#DNX6r#8}khAZ{@x|gE#CrXVVq(`HmdjR+2_}!Ryd@p}#L=sqL? 
zO=r6CK6AIjG5RUKo6rD$U=VK<2SZU@A?%vCME+#43IA-F12?^q`)utPuBI`V?D5^r z*ByIC#-6;%N9^s{!K-{4x7qC&|Ngizng8Zb_=2Ex{wPccU!H}s+i^1IS<~Uavq|3N z3Ly`F;sM<{1id|ocb{(0K;#}T-skfOgbpl@JM zaL60|Lc`wdKOp?AfrBF69xO#h4T+A4jf)?ekeHM_EM<7=h<8SgN*kS?F=p(z@e?NM zGPAOCCgo1f%b!x9Hy8`27MaY&(@ILGzuRfXdo$lJn>BmR+z-k>{HS8y`~{UCFI@D= z;;KI_(JuXTS?}uQD?VGfs-|}Jn$OpMv2Oi_jdhzgZ>ewia_d*ywm0th`p-Le?cURr z()`Wdeftj_Jk)ad$kAiRPn{-cyR@MS2y^QiU z36;Vo!%+U^Q>y$T%HQewI+S;Oj`;;ShFr6?VOeqhs=IcDJH`2zt-r?Ss}fXKhC(Gb zWlhe}>qLR}H6QMU_UNNY+~P8rHerNvs`gbDR`mj{1g+NB3QlunF!|9h%f+Uqlk-1WgB;`s`xt6m?Dv-ohQIZtTX1ce{bZ1$b zm<47?8uo6Vz`*`=5uF^z_E9BljAjEhElG)Nh)ic&3ttLi`ze_r$-$Xvql&X*224pG zojRHE^(i#I>G^c9*o(3lp+Qix+OsSfubsXrBQ4aB9h)oZn7+k5O@~stZJ#=FB-;;V zgEe(a=+=0q(6dOhE-loUK04W$G1`(k^*7v_-_Up3$3u%n>4kV$DI*KBV@H`8LyAvi z@)n^IoB!eQmU`1-Q?iv_l7_HlYyqo5VMfe={LGM2T9bZEh!(ko)t`oFy%c&DY0;#S z)9ffeU5YHR-jgL*rbT5WOC_yJmL#p`mekYrF~!u6B~PYL?Ne2DHC9QeJPLJ1Q;T!V q0a=BG0lK0hV~NMKoFY@6(clpr);G9skUCKVJpZ6c08;3`h5rB{LO@#p literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4748c07ab9f5c77e8fc8c141ac748fc7bb24c158 GIT binary patch literal 1354 zcmb_cU2fAr5MEp>P^lFXV9Q1p%1bN|>Q5?yDB^)}XwnpuCJ}AoyhKiHC28UxVh0x< zH~}X>Ja7OGz!^9R$6#hHm$(v@P+8gQot>F)zWF9f3p!;UJ!bC)Wuz~!2PxM7nZj#IL}>w9Itl$ zb1es`ey%X35RE3TXGj`fH73Z{La9DH*~53H0XyBSIh`IR5x}XXhCn z4w{E#R}Mq#*RFBU;keD5xzY2X*jPq)k4yQ--%cJZ(ck+By3i~a_-7eW*O literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9d247f8cae7a335a4ecb364c654ae9a66b3a8c00 GIT binary patch literal 3450 zcmc&%cUV-%7C&>#vP%)vz3VQnf&vRD$bv%DpmIS#UxQHe&M zMpOh_L@Y!FeM+$gD{907@g<5awy5~<8B3y>dsi%p{_(!=eJ}ID%$%9ierJaJ8^YNL zcZOnon1LEb%s@K;+iAzA?3oqz= z##q10ST1hek&DqA9T`UP*Hye}<|LpXNHQ+~R87=d7)}=P`nXJ8LRv;_yf#B?k&GwF z@v(9G)J$EP0PHs;YeuR%Qq@c-|BIL(h1+P!T({IPPe}psFD6g!lL7N4LW<`c*xtxO zW6gMQS=SwITr`I#iyYu?tuw@RlcG%}dRRC{3Jxo3L4VBwz^y$zZbLx+gJg&<{~a#8 zDg{wuUnm)=fc4E*Fm)A+a?;I-6h&V+v_lQY3j?96&J`Yqy+UWpdcabp9649p19jgG z#QG5MnMpx?nLS7xZK1u?2hOEf!QyByn6SVP1~;a_=xiI1EN>4Dq1$0Yn=I(;whNu! 
zu0(bNcZc*?1)^{A~~T7bL*gZC&9^j0lPwccSa22rBB# z;BwpcaQILcpw_L|+fu$}q)Rvcn%k%&Ue<6b1`Ocuq7s0jyEu6S&3CGUUaK1>5 zf=kzdz2OwRoEiXjciiB@xTSFQRT^YxL_$IC26#Q`D69zF48_6y;rc29z0A1{-Sav@ z*Xu8$V`3hZl_z_555ncQum@w}}keN&s1lm1= z6CO8B@7(>2mVyQJo^}A9Z3zO4D1S)p>`FWuQwj4=9w09c?GJf}#uF`7G&(={7a(3l>7tj&VNN+%`oDy0v1LGdt4TfM1LDj)YTwvpq8CdH z@N?~KSaozKy?VnaqNu_hRXJ9YJxhKkN7vmZ`yc3HVwXFK*y+|_G9$)|K4|igPE|*d z$H4%7SH?lk-9dD_nUld~Z?&jx141oNNdWCPpdQeVw4Q7ZLw}o(ZigH}8SNw}{(v`q z*-edTrx?Pq|1LW2STr46Er+GMMj+=4)^zEK*Jx{p6Qs-5ugIy5fv_)qI8hZ6h`u)D zksV@wAhRv6B3<%N#KebR5ckLDP`d|*Bk^r-g0tL*I_UHS{)ClDn7twwrb+znyDMA= zImQ2R;MPFdD)2zQ`W~SO8k!$qp-D`Ma{zyWDOzMAmYABETePvXYTK^8l#z9iE37*z zJ9W0P?b6lG-a+N)hi)#-zwYx{pUY=f`_3qR6bMJor2l%kQegpjj0tW>JhiC?e zhJ_D_7&>hDh{%yq(W6GkjENnq9Tyj$Fg`IUIb}ksE=|u(OgCg?PRh!jJZ0*%=`(U> z&YC@EZtfTJ^5!pCnE&OX#a}I1`j2JHzg|(Wa@Fd>HATg1*R9{M@taL0n@hKB-L}2# z+Z{W1l~?TE^UuBeDyyn%_8&NS= zx^k6k;^Wbrx!JU9GY}`eeuEFzg9GJF6%NvEb)f}kb?fKbC3k!|fV(tDg&%6msqbND z_p!5I&Of+#Ny5RyRd{zfHiJhr2b(K7FmJ{YB%JcG5zfC+6SUAx@Pux%|Ib?_ojCKS z7y->GEq{F`0O&Ww>TcEZn*E)w!Ha-vAjT=G>N_bJ!pMJvJ|qr_E;)_o!UO}Anrz50 zNk~oLNd+@VlUUZQkj?F&s+XzG2t@n-fq#@H)Qnn#S(#@=xwy(4Xyl#1^A~Q7Z$^BT zKHzUbPtcHyTXlIh#MXaW#o`-ICkS#Qp?YEaDKfNtOO2@sX^9y|p1+J_SCHzCjjGTF z)8Sv;ejRcU!bpd*mE+7-f5peSM z*grPqkH)A8a$8QDJ~1;P!!?fMT(#-x`YhW?3F(GpeVVPiR}c3dYK2A#@b(9)DS&YN GU)LYZD$;EL literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..78e853246b125d8723fcba4db0657d7509e2c93c GIT binary patch literal 2667 zcmc&$dr(wW82`?_cb8oSLEtV6ZHf<8G~Oc!3=wyE!-{|@EQGkrt~}hA>>_A}SlQqV z`5G-49H7x05mYj45KvG0D+o%dp<;bS0j^6~HBGL94j_aOJu=f|lF?R*;*>0_)03{IJ}G6|zEE82F0pGO5U} z4FF_YM+x#(G*JV`8IDZqmZ zo&X+Ck->X}gtmz&j!97}vP+~QyRHPM{p2oo@K4$lNxOj=P21N=Ybq!+=#18Ut=g(~ zD#lE)How56FEbeVLI#lJERwIM<)hy82Ei>9W+&M7;Y#a@jtJ)nL-X-Oe4f*f(0RyY3w9 z?Y)Plf|82-Gzc{(JE4rD-Ec@=ijGyCr`B}R@J(P4+K>|lH^N3JX=@_gc*GIJTiu|2 zM;{DVZ-oanrSz1*)l^h?grIoy3&Ly1_oLwA9;RWfP$=-tqh>_!7Zmj82(}%)2`SxD zXuOv~InJi&GPVm6r`>_UkRQigxjIMS4o(ohst4}uP6j7cJm@F+Q#TiN!nzYZ^q2Fa zq58-YYNShmKA+nUoA)n72PTg|yZdr#&)PVc6qgF!XI`W?`w773t&hPu{!jQJe=pEc zL9~gD0Z~~sb8zt%;k`{3__^;*nE8d5pniKU)zBV{x@L6J&4UlnYo{*L(LGZfrmbC0 zwRjBDm3d(Tzm|=(K1f9mfCVD!3SjlsB!O$y3h?Xb7Eb&cp^@Kdczq}lf?j!<_FCZx z?^&v8kGu zIL4lW~BMeK>&Q8xu96 z-Suf=4f9KzSmRjUZR`)xfP0;ZICIi4l_$eAAo{**Sblz@jHPUVkTmvXi4Da@0uDM0 zqc65t9dvpfd3E@S_0p0mN6GQ=^4baV8XkBa?Z#8zkqhgtWGVg;iDW|rJ1{*74|a)E zWakJx*!hRM*mx8k7?59de-=DAkAj6>XDqbZkgfwsdXfB;n|zxa28buUKO^ST3iGT9 z1*y-JeB`3=szbg{$FnDbGu2BCKm}K7fJ3ry90R2q>fx> z=De&(+r^SLF@c0aIR(dY8Td~qmE+1eMIo2KafMuLA-|`<3Udh;%W=tM#%ueGS3GMW z9*84u-pZE(1H&;BpO!?{vE^ipCj)lPaY`~MRAkA^xgfG$stHL8)?_U#(ELU-g za@#ra$r3B9!||Q88SzEh_W9CPhJ-R4hp&^^WB=Hc9F0vAq~2zA>9R7N)xW@O_E(pd jn##SG>q;%fCZl(7SU_+b<3x-7&A&Ddi)?Lfgg`!fe zxPrx^pimX;C`%C)_Z2tlI67*rTiuSHj@4G@zL%m>^^ZAc&h$IU-M;<%?zz7tmW}ga zD8`9#RpSZl0BnQ$Z7B=TMm;?6LyYz5Ne_3mGr-s}VlPI(*y681<_{jiPM$>ADGdPT zP6QqW2$2H$b|hmcsUHYzr4sXGjW#tYLu1Hb2(dt1j>KD#JVGw~kR_=X7YjoG9&GUh zfcHwKzZ$#eLX=Pl%ux`D86453iIWyFk~eWmgq<9T5d9$|#<9P9ikTL!eCz~4x`Mo# zdL6unRjMu}FDFfFOiI-lHCCB;k(rv5qRY<9(emkBLgLFvUP{VO60-lo<$IHSX3fmE zQ8QOaKF=nzr;TLz=TjleZvoVFu+Y&w8N4?3gB#~9;c2lO+-dWKl)fTVxmgEGCW^pq zRU7E8xdABb;juLVMfWoyvF>*`_p%BE>4C6$tQ5X}WD7IauxL)MC6Of!gx2j!IJ`~; z&N<%jc=$_nsnLt$7) z7K|_G4dyHD;bKe;R9NSOqhcpIQ6obx&D9`U904xPCqTbz0B9eGp~-qYte7l=CBJX`3^goFIU*jveT_1%mqH-QZGpd)U|72dMQ!L0SI<3L{-0rgkGpt9_uH@PPF_ 
z>tWWbTDS>=;DU-Fu7CCk6b~1H^jj4a-o1gc^18!z$`x`7&7s7LhPJvo@R}74vCjpt ztH=}NMFQAXs(~X{ZQ#&Z8qRJ|qNu8k;A%JqFJ^>;%WVamOD=({FSSr$jEB;~3V1c` zAgoetfwHJzxW0x!FXr5WehYhnbB!-NdORO0FS>)P)($-_{{=au-b6+Z1bVFtgfESO z+}cq@eZ~%|sOlyByLJHj-n+D$&^a0^`&3es9Lm6Tf*TcE@f0;g9)pg65t}badlAbB|!U zdi5Eo-TNFlgPI6DphQ)XR%pb&CTN!DqWyE*iFpkq6c>ym+DC@Leg7wfWSfdCYqbRb zx_DH-sSU0!TnVp=a>+i5IYdyvKsvLer{M0voyaG%h1yUk5R9BX*rHV5(<0mP3cYUM zLlAi*y6kxrVL5;x^O#0Zxjcg-zCT*rzVk6H0xJlZ*#gh1Bf%;m46+@)iH8##VA0VQ z^8CnPSlBw5c+yCtv%?NSP>>t7tM?PA7fmB-3WtK@&}e8n)r%Arzd&va>cA%K1zbz2 z1u{rU>X;!Q%v(t9nY2Rie7OOBYMT#h4(_1WR*WMy)cc@D_XhH#&A*f5kKZDLTl$!} z6iy>{C@z}Kp5RaKHG4p3D-+1WV1U5YDKO_w1Z_8WI(Y4F5_G?a(34;AyQ299l!FJ6 z4%01R(a>dH8R+X+^ zvv%G34Q1sUzpmKyP37h-Rn=R!)zp5weaFta`d#1sefOS*#-`@)TlTi@+kc?#;Gx4u zjvhOH;^e8*XWGx6JAdI~#}AkOarvKDuIm5A`J?rg-DX~!jr5}HH#ld$dXUJmkG^Zy z(R#^`6*qsnwRua39^f{ur(#>X>qp(e*6w0!_s;%|t=;FWJ-BjJzO1K=AxwA^(e=qT z&&kws*pp_UvNH`vv$X6q?m5SAcR4L?V8qWv@+L-J%H!JGt$FVQtz?w1QApTm2{Z~y z1aO*fiw6$8-~B(Llxza+yz z^sf02IDKE9(jq^Q2n;D1X*n8(!tm=xyoKU!GW9n5AsC3kkC1dNHX|?zcg0Cg?oAsQ zf)S{LIBiP8w}eA_0(U`*?c&6a95ThpEPnR$@Mwd92PaXRkVahLRRM&WLu`s|o%5+Dy8KYX5?KDLil zxz?B@LE@y>D?TwT7?w6t79rcUeRzf!7}0lfY|69o{9|J3{k D8Fh$h literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0d95d679984cbe1a72e4410ce8b15f3ba58a4945 GIT binary patch literal 2845 zcmc&$e{@sT9sl0@-b-FeN=wO0lM*lm5{kC;hqROuN?%G_+NLFKfs$e^X%pH&(maxs zw#uO3a77WdM>cdj#T^+N=r{v$M@qGbLq$DK*$>#xHgU(n#546AJR9Ps``#Av-a_{}#?~nWaar3!x%Z>RGFQrNbmqaB<0}w6qa)wu*X_0#+kkTY$wj`ohBWWx} zkR>K$rfVwz&{-*tA|WE!pC%TGHy45?)ojwWd%JyYUEXMyM2({HF=E_J?8SERcAm`H z7z#)fa7e`o0QoYh*o7gm(5yTMIs&{-TLuD38EN7ZGcib}>%p4Dgv2CrS9Oe-z|FfT zOq!`CKq>bj*!S|+GRR8yg*$pe{_a?t&l~e52XGPawRMDpJ)v$jr;EgRp4f*t`^(hw zUmSVMOfQSlR-H>4j`v}d|9k<#&c0RjbW;WVt>_XpJy*r;c|{LJ&o`4Jdr!jqgS+7C9T6@wcQbVq&ldti zQ#2P}J52HeWBl+ojizbSqNLr*l%(LaKM6lL@hKQ`h_L5M4b{)3T#s}Ns%Bh`Hf&r4Dc;gscK2QzGo(c#~%b}m%I|AF^9OHh|R0@Nytf7~V3FKFGuY<#3<6pSr z5*#sXpg-NV7^W?*h0$M5)k~>Bnm}^Dor7iFm<5m6W3-Jru@U zB%RPoKjE{b0qyl)uzEdAJZmJ<2b|S_6Zh4-gp>;KSwAuZT3%cagmS!~(TcjH3Hs!T zDXEjvCL1JUy2+d|#WHo;9oEe0GiKWCv$AI2c~^GMoZP(pg1hGy7R@W3U$Vfluyj$G z>?~hgQCYR5x~A4uSHINV(AczW`HJS1o|aYjwBFmc+S}gY^RMab3anih40VS=krZ*g zJ|ZZ5G*+sBp=Sg7?v<3(zW)2>-S1QYHVO)SOH#5op@YrnV4!1*KUsmTPoc~<2Zskx zp?E|A{kPGnPT{w4Ha9#-&cSyw-F$RVI0xGox+O*b5C$tK`iFOvEGTl5)Qz=|)YVTt z2Rkdzfrz2l_Zch_jYHUy@c^I;1-coW`J;R=5RGa5K|ccrM1ldWaqlGK6BF$}Pq7cF zP~LXUT$n$+FTO9Sf;1>Tt!(6ubP*x>XqR+YSRgl#{ zX{nsQw!NSaYciK9VUc@T{V~LNQ*h6zT4Lmw9m+kjvp3!+HCSeaIF7vcAV3%jr;*9B(8N?zL|4N1}mnw>7_LPX3%cv&#Z- N?E|AcfE)i9{~HhQtA+po literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..81083b2ee90f8208676313b318811207bed04c6d GIT binary patch literal 2537 zcmc&$drVVT82_D1DeVTxqqk{Ybxfe5A_XT#*=D#!TfhoPraa=*LMu?{ZKcIBjgxF6 zb4t`07dxjD=i+OCY~tdgPThi!I3H6t+5VAX%ydqpnOQd7aOd2vJQV(0<_Vnpo$vYk zzH`1WEf5ORco`qXt1LXnBL-0D-W9nQ@kizf9()Y1PNw`QUe3#PQ;?(x$~1ix0A(CU zca9Ja`evfzW!iMeqqG`jnN_qcth9P6c}~r#*Aum$=%?xx!+FvsQz+m$&@GC7fXAT` z%46$sXc|RO(od-=+$Xuz_+PjjgWDU@2RA9g<|?mq+C}d|o7HQLsG>xbZDF~qrp_r! 
z2^=KqV?=+4(|^h7`s4Id!$cf1t%fjub(n?kCQ{Llh1@?%Oz$j#BmFJ3b_wV@vKYzR zC*az5;kdsx5jQ)>p?tiG?A+zTnguE(Zt8^Vr$oRIi+hnAnr>I2bpPMD+`9+ziVW;3 z*5aePQCPlNAS-LbIfpg_$J`zn7eEOTh-6hKxI$0NE`A6s}pwY>?q% zTLP40;}EOL#J7$ptSg;@>eX48-Q~di`sbm1FBVtwTCpS2htY=5$k|pMi9fOzs@2(u zKXMlC1Ct@%QR8srd~95-!-v=RJVVg&&9nG1Iu>6Ye*vc)_uIKHSur^->?LIRaf4N2+b- zCBWIJ#D?Thbnf4eUmxHguD0N#Lg>Yks!v9 z`^~?Tk+$o^n?&Fisg#DiPJ zlehkS<5;shJ9*@~Qyz zbUYVm%k07LT61f4>Em3D_iOr<56X~30aCLckKzZ;qVS4>_rvp}Qd6`+u zG}0Zg3CPh1!MjKtN&Uw$|5D(`lZBeo>a>>sc=SG{(fGf-RQ=O>N$QIcYEG|EKZsBP zTAe0bC@jn`$_P9xO|#6vR=Qvm1Yr*SXKR)q_yl8xU=oB1A*({-8L7f-Mr8>?4*Sw` z@Jmk;>0ut2BO!m;1cPBJC1&PjvvZb@?K9a%T?>MRZN?IIl;lDxJ6D%Y%1bLNnpbBt zO{*>`F05p7X930cx`S?#zbwfK1wmK9pF}d*AbU?~QMza|R|r;SFNLS+&{YyV7tWo_ z&be$0q%J`Z;HjoOqt{xL?kXwHbCniLsnd9K0{P{>Iv5Wv7F`4Iuu>Y`HuF3$69_pL zrY|Y^g7N3%NcE=0rYl%_L72fva}BG3yVNwzQ8uZb)?_kMqDA(x`csI3OQAfYWJ!_J z?5Mm%F!}=bB!OjCR7a3188DKP4DhAYOPr=U8b`{L*;D`2lwD0g6Ph@;wRTCJ-8-?| n?Vf0@t#$e0mfCAQRW30uZAwa7N~+eP1N#1OSOyf(A9?=(tu<#2 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..28e2ab00c7a1b435bea1614c8f8f03dd7c991c90 GIT binary patch literal 2537 zcmc&0Yfw~W^nBmF`#{-5;%>XRB5JH;yq1KJA@nY9ToLdAizzPb%7V-8whusS6f0+_ zAs>y-U^+shnVOO&%^;wlhVOjPls3~OH9dT!HiJ`&ll6V~wmek+HF*ZUd%p8N=R4=j z=JNSEj^zTlAOnXv2n3L8mU_IeZw#3*g99#*Q$`Sa04L?7ns^A8`*^AW3;_H>F*z|p z7_?1D!?Ef(kOrt#ekCTmx!7uQTRE)6%JoRufV2r(`AC-35hN7gFpwjFd;lH|jUhA& zkA|j_`#6J`l7#y>l_>ukE~eo=WCp?wSC}28-b#zzQ*1VQOp0=Xl$(o79X4;JT}Yr6 zDZ7yNHr5`&nn6E<*a#k*icuz}rE+%tM=^m*pJtQSD{&CgKDp2deZ zwW9k|&f!?c4Oz{l^U%HeJ9Os5n<(}Gbei4Iq zXDLwLu?{$_b)n<6J$Q8+18Zv*;-0x_a4WtKtM+CvjYs_<{$K%W+0hMG>o&pP^)6;& z^eUW`FkMpq{y6E46Q3bnc_+JVjZ`{!<&04c&T*q`A>T{39=iiUkx0_mpN;*eVaCg~ zL&oHLa60y*EB*iceZYIdRb?l)f z8>Rgl-0(~HoAAMj1ClM<7vgO#I@CU;jd|(gznE9g{mLYFPLxevQ-u#iUzV+06ffzL z-Imy5jLa!;L*nLASam&95?H$&B3e77V=p7r_XlY&(?<|9;}s@&xj${nYlypzBIv6Q zjEwL^DwYI{kA4W&VhYwPT^3Y+u5;)SZ zju{wTo5fz{5%-NauBXOo8^)4xecbv8KA)7Bx;_q6Qg^A&8 zgP7)d13mOcdKLJ5481QaiOtcK6fE$X)2A;p7Uo+izcQD^x4eQJg1r>M2nm9KXiq>X zZ|3SipRS9G|(bi;!O)^d#uZs z1%aoaTP6{7YXd+pcLI+BM3Bot-i~B6B@G0=y;LGhSL-yBbJV6Bnh*=b^+?==nA>UeQr!9TeRz;sDxngUq9&aO~7fge^zJqCuI0igR; z3@x@3VR@De7C-wAqE=A#F9eN8d&9*(j?lh$AW-W@fTHmcl*YJ0 zeBDNnZuNr=geR=)+X%B?)WHoH4(F9LadrL&uw=9dq~9u`^zJp3SI`HxQ|_QG6+(qK z4>}qezxHu1b5RBcseT@+;02A+4Ksy z{9Fe`<|J5ES_Lnr?}wGjZ=fxYqi^gNJir^Myg+u5&<-H~fyA zH8+si6M;eJ2GL97VeGSos4-^;RbKrZ{!tfz_V}#oEpUy4%>y@6Q=BWoJ-Eyff*$Q}3ZX^f#789Yt!+5zBeff9y*CW5&Hfnt-pU?A5BL;=l^D+*k^47Nh z2C*#?sC*JjSO*Yf0o@Erw?E*J|MgzC@4U|wfei$I+y;MajRBjKD9G>cL;NM2~ATnA*R zg4EODASfuKc28N(f3nO3H#_En*I8%YmsJyp^^Jb0c~BEsedq<6c=Q$-);7?}t#mrE z&FK*NNm>xkd)wz^z9NM@2qp+wodLynM)Dj=W`Osu7Ji=#2tE3pgjt@i zONu5CyN%-rkq_cE$0^YlrZUng?MJf6?lRKm{!C2!^;6NA3iwIBT^OFdL@!-LI%#>JBf@y$XPm0I4 z5(sx4v@2#;>Y2eZ6_hH`QCW)zsE~yM0G}L*vfx{=RE>Q*%o~>z=mn_qOl*p=1An zgNF_uIeP5)iIb;JpE-N({DsaRFaBhSH`4%uOY#*`1N_W0z%NY(5MAaJkU`dE$>$2{ zf~&kP_iJ1-*Ksm7U2usuSl_@Y@C=HZw_Z!Yrb~323*M(obcc)U-vxrZ7k`qU>?@-Q z3k;3y`G^a(xjGh-GEG!|uE}hbnV-o%@c1<^u@cXB60fD@yJ&eMhj*_Z-dmq+7usFL zVipD``_N`<_J{#GS>(cmFN6aveg&WsLAQoP(5=Hl(wn>3Sm?rmDuGB_`Q85SLL|GF z_>`Zi%QjnZ_8=0UMDmOM;X)Ktj^9%QWAj+!Hccy;uFG37W48WvwX*cNQ%26F5TmnRqy3|1PY5F7SUA3u1v( zCRP29e{%dq`2T;ov47c365sAC7RUwSa%cSfNo5jiCNXhzQi$bZNj6ZvECn(V48x4U zUzXw+rjUuqW}+D;n~BWk_9C#tD0UUeFfr_m`R*BW+|tB)U>z}|XAbf84aSS8_>n9h 
zS;&r2?0{V}jEWs1QdpLgGYXcUn(iO(m!34fKodP|YD#iq4lCCt;QVIf;eqoP$vMGE zV6yn*maMFYy(u**P^VF5GiuhJ3y;gel+w*7jvL4FvFuQj>A z^{L5R={VjPOMS5)^~?u13)5@)u&s list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.0, + } + launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + basedir + "/expected/signature_calc/bands", + basedir + "/expected/cluster_analysis/docs_to_remove", + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py new file mode 100644 index 000000000..fca5485b4 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "output", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + } + launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input/data_1", basedir + "/expected/data_cleaning/cleaned")] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py new file mode 100644 index 000000000..07710b74d --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -0,0 +1,83 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py
new file mode 100644
index 000000000..07710b74d
--- /dev/null
+++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py
@@ -0,0 +1,83 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing.utils import ParamsUtils
+from signature_calc_transform_python import (
+    SignatureCalculationPythonTransformConfiguration,
+)
+
+
+class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    # # create parameters
+    # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
+    # output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+    # local_conf = {"input_folder": input_folder, "output_folder": output_folder}
+    # code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+    # params = {
+    #     # Data access. Only required parameters are specified
+    #     "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    #     # execution info
+    #     "runtime_pipeline_id": "pipeline_id",
+    #     "runtime_job_id": "job_id",
+    #     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
+    #     "minhash_num_permutations": 112,
+    #     "minhash_num_bands": 14,
+    #     "minhash_num_segments": 2,
+    # }
+    print("====")
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        config = {
+            "minhash_num_permutations": 112,
+            "minhash_num_bands": 14,
+            "minhash_num_segments": 2,
+            # # When running in ray, our Runtime's get_transform_config() method will load the domains using
+            # # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration.
+            # # columns used
+            # "fdedup_doc_column": "contents",
+            # "fdedup_id_column": "int_id_column",
+            # "fdedup_cluster_column": "cluster",
+            # # infrastructure
+            # "fdedup_bucket_cpu": 0.5,
+            # "fdedup_doc_cpu": 0.5,
+            # "fdedup_mhash_cpu": 0.5,
+            # "fdedup_num_doc_actors": 1,
+            # "fdedup_num_bucket_actors": 1,
+            # "fdedup_num_minhash_actors": 1,
+            # "fdedup_num_preprocessors": 1,
+            # # fuzzy parameters
+            # "fdedup_num_permutations": 64,
+            # "fdedup_threshold": 0.8,
+            # "fdedup_shingles_size": 5,
+            # "fdedup_delimiters": " ",
+            # # Random delay between reads
+            # "fdedup_random_delay_limit": 5,
+            # # snapshotting
+            # "fdedup_snapshot_delay": 1,
+            # "fdedup_use_doc_snapshot": False,
+            # "fdedup_use_bucket_snapshot": False,
+        }
+        launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration())
+        fixtures = [(launcher, config, basedir + "/input/data_1/", basedir + "/expected/signature_calc/")]
+        return fixtures
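[Editor's note: the three minhash settings in the fixture above are coupled by the usual LSH banding identity num_permutations = num_bands * rows_per_band, so 112 permutations over 14 bands means 8 minhashes per band; minhash_num_segments only shards each band's hash space into separate output files (the band=N/segment=M directories in the expected data). A consistency check one might keep next to such fixtures (illustrative helper, not part of this patch):

  def minhash_rows_per_band(num_permutations: int, num_bands: int) -> int:
      """Rows per band of an LSH banding scheme; fail fast if the signature cannot split evenly."""
      if num_permutations % num_bands != 0:
          raise ValueError("minhash_num_permutations must be a multiple of minhash_num_bands")
      return num_permutations // num_bands

  # The fixture above: 112 permutations over 14 bands -> 8 minhashes per band.
  assert minhash_rows_per_band(112, 14) == 8
]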
From 8fd9676f36d33e9c304309c956468a207a0eff52 Mon Sep 17 00:00:00 2001
From: nelson
Date: Fri, 18 Oct 2024 11:24:20 -0400
Subject: [PATCH 039/105] Added python tests and expected outputs for the tests

Signed-off-by: nelson
---
 .../src/cluster_analysis_local_python.py | 2 +-
 .../python/src/cluster_analysis_transform.py | 15 +++++
 .../src/get_duplicate_list_transform.py | 16 +++++
 ...t_duplicate_list_transform_local_python.py | 44 +++++++++++++
 .../cleaned => cleaned/data_1}/df1.parquet | Bin 14986 -> 14933 bytes
 .../expected/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes
 .../test-data/expected/cleaned/metadata.json | 59 ++++++++++++++++++
 .../docs_to_remove/band_0_segment_0.parquet | Bin 1497 -> 1513 bytes
 .../docs_to_remove/band_0_segment_1.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_10_segment_1.parquet | Bin 905 -> 1523 bytes
 .../docs_to_remove/band_11_segment_0.parquet | Bin 1497 -> 1523 bytes
 .../docs_to_remove/band_12_segment_1.parquet | Bin 1505 -> 1532 bytes
 .../docs_to_remove/band_13_segment_1.parquet | Bin 1497 -> 1526 bytes
 .../docs_to_remove/band_1_segment_0.parquet | Bin 1497 -> 1523 bytes
 .../docs_to_remove/band_1_segment_1.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_2_segment_1.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_3_segment_0.parquet | Bin 1505 -> 1510 bytes
 .../docs_to_remove/band_4_segment_1.parquet | Bin 1497 -> 1513 bytes
 .../docs_to_remove/band_5_segment_0.parquet | Bin 1497 -> 1513 bytes
 .../docs_to_remove/band_6_segment_1.parquet | Bin 1497 -> 1513 bytes
 .../docs_to_remove/band_7_segment_0.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_7_segment_1.parquet | Bin 1497 -> 1505 bytes
 .../docs_to_remove/band_8_segment_0.parquet | Bin 1510 -> 1530 bytes
 .../docs_to_remove/band_8_segment_1.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/band_9_segment_0.parquet | Bin 905 -> 1497 bytes
 .../docs_to_remove/metadata.json | 36 +++++------
 .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes
 .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes
 .../data_cleaning/cleaned/metadata.json | 46 +++++++------
 .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes
 .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes
 .../expected/get_list_transform/metadata.json | 48 ++++++++++++++
 .../python/test-data/expected/metadata.json | 49 +++++++++++++++
 .../bands/band=0/segment=0/data_2/df2.parquet | Bin 0 -> 3984 bytes
 .../bands/band=0/segment=0/df1.parquet | Bin 2753 -> 0 bytes
 .../bands/band=0/segment=1/data_2/df2.parquet | Bin 0 -> 4763 bytes
 .../bands/band=0/segment=1/df1.parquet | Bin 3122 -> 0 bytes
 .../bands/band=1/segment=0/data_2/df2.parquet | Bin 0 -> 3695 bytes
 .../bands/band=1/segment=0/df1.parquet | Bin 2862 -> 0 bytes
 .../bands/band=1/segment=1/data_2/df2.parquet | Bin 0 -> 3684 bytes
 .../bands/band=1/segment=1/df1.parquet | Bin 2537 -> 0 bytes
 .../{df1.parquet => data_2/df2.parquet} | Bin
 .../band=10/segment=1/data_2/df2.parquet | Bin 0 -> 4466 bytes
 .../bands/band=10/segment=1/df1.parquet | Bin 2537 -> 0 bytes
 .../band=11/segment=0/data_2/df2.parquet | Bin 0 -> 4906 bytes
 .../bands/band=11/segment=0/df1.parquet | Bin 3450 -> 0 bytes
 .../band=11/segment=1/data_2/df2.parquet | Bin 0 -> 3317 bytes
 .../bands/band=11/segment=1/df1.parquet | Bin 1354 -> 0 bytes
 .../band=12/segment=0/data_2/df2.parquet | Bin 0 -> 3138 bytes
 .../bands/band=12/segment=0/df1.parquet | Bin 1354 -> 0 bytes
 .../band=12/segment=1/data_2/df2.parquet | Bin 0 -> 5020 bytes
 .../bands/band=12/segment=1/df1.parquet | Bin 3442 -> 0 bytes
 .../band=13/segment=0/data_2/df2.parquet | Bin 0 -> 3138 bytes
 .../bands/band=13/segment=0/df1.parquet | Bin 2537 -> 0 bytes
 .../band=13/segment=1/data_2/df2.parquet | Bin 0 -> 5244 bytes
 .../bands/band=13/segment=1/df1.parquet | Bin 3413 -> 0 bytes
 .../bands/band=2/segment=0/data_2/df2.parquet | Bin 0 -> 4782 bytes
 .../bands/band=2/segment=0/df1.parquet | Bin 3177 -> 0 bytes
 .../bands/band=2/segment=1/data_2/df2.parquet | Bin 0 -> 3988 bytes
 .../bands/band=2/segment=1/df1.parquet | Bin 2758 -> 0 bytes
 .../bands/band=3/segment=0/data_2/df2.parquet | Bin 0 -> 4323 bytes
 .../bands/band=3/segment=0/df1.parquet | Bin 2745 -> 0 bytes
 .../bands/band=3/segment=1/data_2/df2.parquet | Bin 0 -> 4341 bytes
 .../bands/band=3/segment=1/df1.parquet | Bin 3122 -> 0 bytes
 .../bands/band=4/segment=0/data_2/df2.parquet | Bin 0 -> 4035 bytes
 .../bands/band=4/segment=0/df1.parquet | Bin 2537 -> 0 bytes
 .../bands/band=4/segment=1/data_2/df2.parquet | Bin 0 -> 4860 bytes
 .../bands/band=4/segment=1/df1.parquet | Bin 3413 -> 0 bytes
 .../bands/band=5/segment=0/data_2/df2.parquet | Bin 0 -> 3554 bytes
 .../bands/band=5/segment=0/df1.parquet | Bin 2753 -> 0 bytes
 .../bands/band=5/segment=1/data_2/df2.parquet | Bin 0 -> 4872 bytes
 .../bands/band=5/segment=1/df1.parquet | Bin 3122 -> 0 bytes
 .../bands/band=6/segment=0/data_2/df2.parquet | Bin 0 -> 3553 bytes
 .../bands/band=6/segment=0/df1.parquet | Bin 1354 -> 0 bytes
 .../bands/band=6/segment=1/data_2/df2.parquet | Bin 0 -> 4311 bytes
 .../bands/band=6/segment=1/df1.parquet | Bin 3450 -> 0 bytes
 .../bands/band=7/segment=0/data_2/df2.parquet | Bin 0 -> 3765 bytes
 .../bands/band=7/segment=0/df1.parquet | Bin 2667 -> 0 bytes
 .../bands/band=7/segment=1/data_2/df2.parquet | Bin 0 -> 4158 bytes
 .../bands/band=7/segment=1/df1.parquet | Bin 3289 -> 0 bytes
 .../bands/band=8/segment=0/data_2/df2.parquet | Bin 0 -> 3781 bytes
 .../bands/band=8/segment=0/df1.parquet | Bin 2845 -> 0 bytes
 .../bands/band=8/segment=1/data_2/df2.parquet | Bin 0 -> 3997 bytes
 .../bands/band=8/segment=1/df1.parquet | Bin 2537 -> 0 bytes
 .../bands/band=9/segment=0/data_2/df2.parquet | Bin 0 -> 4018 bytes
 .../bands/band=9/segment=0/df1.parquet | Bin 2537 -> 0 bytes
 .../bands/band=9/segment=1/data_2/df2.parquet | Bin 0 -> 4326 bytes
 .../bands/band=9/segment=1/df1.parquet | Bin 3314 -> 0 bytes
 .../expected/signature_calc/metadata.json | 54 ++++++----------
 .../test_cluster_analysis_transform_python.py | 4 +-
 .../test_data_cleaning_transform_python.py | 6 +-
 ...est_get_duplicate_list_transform_python.py | 45 +++++++++++++
 .../test_signature_calc_transform_python.py | 45 +------------
 93 files changed, 345 insertions(+), 124 deletions(-)
 create mode 100644 transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py
 rename transforms/universal/fdedup/python/test-data/expected/{data_cleaning/cleaned => cleaned/data_1}/df1.parquet (79%)
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/metadata.json
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet
 rename transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/{df1.parquet => data_2/df2.parquet} (100%)
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet
 delete mode 100644 transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py

diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
index 7c162b1b1..915cdcd1e 100644
--- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py
@@ -37,7 +37,7 @@
     "runtime_code_location": ParamsUtils.convert_to_ast(code_location),
     "cluster_num_bands": 14,
     "cluster_num_segments": 2,
-    "cluster_jaccard_similarity_threshold": 0.0,
+    "cluster_jaccard_similarity_threshold": 0.7,
 }
 if __name__ == "__main__":
     # Set the simulated command line args
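[Editor's note: raising cluster_jaccard_similarity_threshold from 0.0 to 0.7 means the local example now reports as duplicates only pairs whose shingle sets overlap strongly, in line with the FineWeb-derived default cited in cluster_analysis_transform.py below. For intuition, a plain Jaccard computation on raw shingle sets (a standalone sketch; the transform itself estimates this from minhash signatures):

  def jaccard(a: set, b: set) -> float:
      """Jaccard similarity |a & b| / |a | b| of two shingle sets."""
      return len(a & b) / len(a | b) if (a or b) else 1.0

  # With a 0.7 threshold the first pair is no longer clustered, the second still is.
  assert jaccard({"a", "b", "c", "d"}, {"a", "b", "c", "e"}) == 0.6  # below threshold
  assert jaccard({"a", "b", "c", "d"}, {"a", "b", "c", "d", "e"}) == 0.8  # above threshold
]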
diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
index 2a5ec3e6b..412fc1fa8 100644
--- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
+++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py
@@ -33,6 +33,8 @@
 """ This key holds the number of segments dividing the hashing space for each band"""
 jaccard_similarity_threshold_key = "jaccard_similarity_threshold"
 """ This key holds the Jaccard similarity threshold above which two documents are duplicates"""
+sort_output_key = "sort_output"
+""" This key holds the flag that enables sorting of the output"""
 
 # command line arguments
 num_bands_cli_param = f"{cli_prefix}{num_bands_key}"
@@ -41,11 +43,14 @@
 """ Jaccard similarity threshold above which two documents are duplicates"""
 num_segments_cli_param = f"{cli_prefix}{num_segments_key}"
 """ The number of segments dividing the hashing space for each band"""
+sort_output_cli_param = f"{cli_prefix}{sort_output_key}"
+""" Sort the output"""
 
 captured_arg_keys = [
     num_bands_key,
     num_segments_key,
     jaccard_similarity_threshold_key,
+    sort_output_key,
 ]
 
 # defaults
@@ -55,6 +60,7 @@
 """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)"""
 num_segments_default = 1
 """ Default number of segments dividing the hashing space for each band"""
+sort_output_default = False
 
 
 class ClusterAnalysisTransform(AbstractFolderTransform):
@@ -98,6 +104,7 @@ def __init__(self, config: dict[str, Any]):
         self.jaccard_similarity_threshold = config.get(
             jaccard_similarity_threshold_key, jaccard_similarity_threshold_default
         )
+        self.sort_output = config.get(sort_output_key, sort_output_default)
         self.data_access = config.get("data_access")
         self.logger = get_logger(__name__)
 
@@ -225,6 +232,8 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An
             "jaccard_clusters": num_clusters,
             "jaccard_duplicate_docs": sum_cdocs,
         }
+        if self.sort_output:
+            filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc")
         return filtered_jaccard_dataframe, jaccard_stats
 
     def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]:
@@ -308,6 +317,12 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=num_segments_default,
             help="The number of segments dividing the hashing space for each band",
         )
+        parser.add_argument(
+            f"--{sort_output_cli_param}",
+            type=bool,
+            default=sort_output_default,
+            help="Sort the output by the first_doc column",
+        )
 
     def apply_input_params(self, args: Namespace) -> bool:
         """
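[Editor's note: the sort_output flag added above (and mirrored in get_duplicate_list_transform.py below) exists for reproducibility: the analysis does not otherwise guarantee a stable row order, so sorting by first_doc gives the golden-file tests a deterministic output to compare against. A standalone polars illustration of the behavior the flag switches on (made-up data; column names other than first_doc are assumptions):

  import polars as pl

  df = pl.DataFrame({"first_doc": [7, 2, 5], "docs_to_remove_count": [3, 1, 2]})
  sort_output = True  # mirrors the effect of the flag inside analyze_clusters()
  if sort_output:
      df = df.sort(by="first_doc")  # deterministic order for golden-file comparison
  assert df["first_doc"].to_list() == [2, 5, 7]
]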
"consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, "consolidated_rows": len(consolidated_dataframe), } + if self.sort_output: + consolidated_dataframe = consolidated_dataframe.sort(by="docs_to_remove") + return consolidated_dataframe, consolidation_stats @@ -155,6 +165,12 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=consolidated_filename_default, help="The name of the file with the consolidated list of duplicates", ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) def apply_input_params(self, args: Namespace) -> bool: """ diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py new file mode 100644 index 000000000..be90b3073 --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected/cluster_analysis")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} + +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py
new file mode 100644
index 000000000..be90b3073
--- /dev/null
+++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py
@@ -0,0 +1,44 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.utils import ParamsUtils
+from get_duplicate_list_transform_python import (
+    GetDuplicateListPythonTransformConfiguration,
+)
+
+
+# create parameters
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected/cluster_analysis"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected"))
+local_conf = {
+    "input_folder": input_folder,
+    "output_folder": output_folder,
+}
+
+code_location = {"github": "github", "commit_hash": "12345", "path": "path"}
+params = {
+    # Data access. Only required parameters are specified
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+}
+
+if __name__ == "__main__":
+    # Set the simulated command line args
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    print(sys.argv)
+    # create launcher
+    launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration())
+    # Launch the pure python runtime to process the input
+    launcher.launch()
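[Editor's note: running the launcher above consolidates every docs_to_remove band file into a single parquet holding one docs_to_remove column of document ids. A quick way to inspect the result (a sketch; the relative path matches the transform's defaults and the 663-byte expected file added in this commit):

  import polars as pl

  dupes = pl.read_parquet(
      "test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
  )
  print(dupes.height, "document ids marked for removal")
  print(dupes.head())
]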
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
index 57642d19925240e1a86b323222b420617a75e7d4..0e089dee3ccc91c9decf65449e3ae930f1d1717b 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet
index 06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9..bf131f43cbf10180944b4906799b7d6288c54724 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet
index 57642d19925240e1a86b323222b420617a75e7d4..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet
index e303e5ea14abd679e479c2fe54ba994402d60350..2838dd9727770220dd6b3f3ae9f0d4bdaefd8ec2 100644
[GIT binary patch data omitted]
[GIT binary patch data for additional docs_to_remove band segment files omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet
index a4ad5fbf82bf959f34c9a25974d34cb91aeff037..8e1fe121e25cb51ec8c26b1aea7ee00463f9400b 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet
index 57642d19925240e1a86b323222b420617a75e7d4..3d1f158e9e79bac193f88f94d2b548b79827778b 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet
index 57642d19925240e1a86b323222b420617a75e7d4..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 100644
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
index 26d0c0905..c08326355 100644
--- a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
+++ b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
@@ -5,8 +5,8 @@
         "job name": "cluster",
         "job type": "pure python",
         "job id": "job_id",
-        "start_time": "2024-10-18 08:17:19",
-        "end_time": "2024-10-18 08:17:19",
+        "start_time": "2024-10-18 10:32:15",
+        "end_time": "2024-10-18 10:32:15",
         "status": "success"
     },
     "code": {
@@ -15,7 +15,7 @@
         "path": "path"
     },
     "job_input_params": {
-        "jaccard_similarity_threshold": 0.0,
+        "jaccard_similarity_threshold": 0.7,
         "num_bands": 14,
         "num_segments": 2,
         "checkpointing": false,
@@ -25,34 +25,34 @@
         "num_processors": 0
     },
     "execution_stats": {
-        "cpus": 71.6,
+        "cpus": 91.7,
         "gpus": 0,
-        "memory": 24.71,
+        "memory": 24.01,
         "object_store": 0,
         "execution time, min": 0.001
     },
     "job_output_stats": {
         "result_files": 28,
-        "result_size": 33665,
-        "processing_time": 0.052,
+        "result_size": 38040,
+        "processing_time": 0.061,
         "input_files": 28,
-        "input_bytes": 78286,
-        "input_rows": 70,
+        "input_bytes": 115324,
+        "input_rows": 168,
         "consolidated_files": 28,
-        "consolidated_bytes": 33600,
-        "consolidated_rows": 70,
-        "groupby_clusters": 14,
-        "cluster_duplicate_docs": 33,
-        "jaccard_clusters": 14,
-        "jaccard_duplicate_docs": 19,
-        "num_duplicate_documents": 19
+        "consolidated_bytes": 80640,
+        "consolidated_rows": 168,
+        "groupby_clusters": 35,
+        "cluster_duplicate_docs": 79,
+        "jaccard_clusters": 35,
+        "jaccard_duplicate_docs": 44,
+        "num_duplicate_documents": 44
     },
     "source": {
-        "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_sec2/bands",
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands",
         "type": "path"
     },
     "target": {
-        "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_sec2/docs_to_remove",
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove",
         "type": "path"
     }
 }
diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..d67b5bcf87dc622ba80e30d210bc1b9b4429fbbd
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..267e78385aa2df9afa5ad324b6090abc65a6912d
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..34b15a76c75091485eac702e1c9f8c6b28c3b30c
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json
new file mode 100644
index 000000000..d4cd3e362
--- /dev/null
+++ b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json
@@ -0,0 +1,48 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdlist",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-18 10:49:10",
+        "end_time": "2024-10-18 10:49:10",
+        "status": "success"
+    },
+    "code": null,
+    "job_input_params": {
+        "docs_to_remove": "docs_to_remove",
+        "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"],
+        "num_processors": 0
+    },
+    "execution_stats": {
+        "cpus": 101.1,
+        "gpus": 0,
+        "memory": 24.02,
+        "object_store": 0,
+        "execution time, min": 0.0
+    },
+    "job_output_stats": {
+        "result_files": 1,
+        "result_size": 663,
+        "processing_time": 0.007,
+        "input_files": 28,
+        "input_bytes": 38040,
+        "input_rows": 44,
+        "consolidated_files": 1,
+        "consolidated_bytes": 64,
+        "consolidated_rows": 8
+    },
+    "source": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis",
+        "type": "path"
+    },
+    "target": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2",
+        "type": "path"
+    }
+}
diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json
new file mode 100644
index 000000000..bf26b5228
--- /dev/null
+++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json
@@ -0,0 +1,49 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdlist",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-18 11:20:38",
+        "end_time": "2024-10-18 11:20:38",
+        "status": "success"
+    },
+    "code": null,
+    "job_input_params": {
+        "docs_to_remove": "docs_to_remove",
+        "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "sort_output": false,
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"],
+        "num_processors": 0
+    },
+    "execution_stats": {
+        "cpus": 136.2,
+        "gpus": 0,
+        "memory": 23.89,
+        "object_store": 0,
+        "execution time, min": 0.0
+    },
+    "job_output_stats": {
+        "result_files": 1,
+        "result_size": 663,
+        "processing_time": 0.021,
+        "input_files": 28,
+        "input_bytes": 38040,
+        "input_rows": 44,
+        "consolidated_files": 1,
+        "consolidated_bytes": 64,
+        "consolidated_rows": 8
+    },
+    "source": {
+        "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis",
+        "type": "path"
+    },
+    "target": {
+        "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected",
+        "type": "path"
+    }
+}
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet
deleted file mode 100644
index a9ea0990f6152fddc195c2471bf0d3bfdbfc49ec..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet
deleted file mode 100644
index dd4f930793d5e38efb9536223e4aa1ec1aefe431..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet
deleted file mode 100644
index 26c4d1bf6cf98bc32a1045d139db4a38b76b8904..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet
similarity index 100%
rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet
rename to transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..1a46cb40fef1eae7a5d0207565c796ee63a4a4fe
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
deleted file mode 100644
index 1a9169d9f5dab747b86df507d1750efb57844174..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..f82d9dacab54b6721a1079f37ecf1cbdf3e4d8dc
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet
deleted file mode 100644
index 4748c07ab9f5c77e8fc8c141ac748fc7bb24c158..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..842ce2caacab4c58384e71071de8b40ef03e2e4b
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..fcb03c17aa1da69311db0ff5f9542aafcde37756
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet
deleted file mode 100644
index 3c53b83a00add2b8dc0dbafc83e88f7ca6fca968..0000000000000000000000000000000000000000
[GIT binary patch data omitted]
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..84c399e67f3a1f307e88c5325c849ac627b14610
GIT binary patch
literal 3138
zbmVAr%dz7pPPU#pedcW2r{~UJxOl1k@|COCu7B3idE@ghZhm=7`%tOjv9-RB?^-Bj zxmkI%)*H=iPL|nXXUs9uB1$S|3!~_8R@`>h9(2`Kihbn8WWKU2c4`2H;jV3o%Kaz*Jmw|3AgawO#k-&_ z6%!A(JMmHz`ySN3==j&cf`V0PRJ#9hFaM2t?EmG7)&5pblA_#A!D^+7J#MZ5Dvi=n zNJxlH9BqBDlv`O}N&!NsAPDi~OKE~26bqplLWCe>2w@rGStt>Rp{p=Kh@vyu+h(#8 zmu6~#8WLjX4fgdNNfzM~$J2dSF&)F{KuimQjt-%Ex)u3?AKhoC4xQ+qnmDyEJz`Y0 zep*5%<>$r`d~;`!L9`bp8X+JES?!5S%Il(UPD%{OOOMPD3{+oqPr@Ojx9td-kPC&Q@_NpdmGKY?hy6mk|SY7*om zI>Ofrp~cpF62Vd}A|r^Iv?_@~TF=GM^|=v+#E%#!)hG6eD!m%3Bq%*hhJx9JMvG6H p$>d`wD9A7NC^8n9v-0yi`~!#h5Ajp!Gyu;(&_sS$$B};z{{p%vL_q)m literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet deleted file mode 100644 index dde573d07cc0c2130eb43113befd5c673b2f0ef0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b@-s716qhMB#il$H&g_iLfI4wt+ZI$t`9)BD0`UT z#sV76FdAXe81XgdSb~6pI;QC6YcIAekuBTQIdutUQ?}^t{krl{`M1oI_TF>P`<#2v z$(P3$#BeMZ#D(ZN%t0`KMD90sxkkqzL%&{#sF z@L*^PiJvoy$w{~mQ;G7w;bI!@J!T}_2${)I?yWN0JtZci$0)NB#A+%jci6mDb|HZd zB=1G)n^=7et495dViS0LWLk9r+}i!zjku!|qrw7KkjnC{`ADvYtM;7Kn4k~^}wCISs*i{fo*yezO|wgHk|2UzFLwD^~YD@p)L{nV$mts zba)y1_?aPS52?ln)~CYs)EwwOKaJTGDT44fKY~eVf58tW2Z2e_G7fGbNWAszkzT+s>Epj`kSTHK&tl>%Wg){_F~q+%qF!*7|C^Z)!iYc6q!ga$gf; z(;Ar5;D*Gl<*@ErrYN{>HAHrFizk1B(9j$}PhG#ewzt)p85!DmdY7x;me_J(k0<*;@dpF@ ztIyt8(=&LeRv*3X{MGg~pWC}GtIthRaoE?End9HNw90CyEoyeNHmlncV78fQ50D2x zh~$@~@*kz@W|{go5pCMBxjgb!kDDU{w1Wve+^l|%LgL40>-h0eDp4NWtOLOlHS462 z*0q`K6&~Nb@wMye$?E3Gq+K7jK7r3iC8lhM1BKXKZZTIGIhKs2a6D$Zg{$=Yrz|A1 z6XLL9e04doPk8(^p>m^jCj(ZjPomnaLbwAu0WtC5zl-otVn2-97aTvBEXc7^rPTk& zqxUh5#{cD|svp-&l3$X^v05S@mW2Q)Rf<5qpkQ%fqVHiTT3HQ+I9|u|d_MV8$l>`K zURS}V^Lz!LQX$ZEL|_rcQg}X#?&R#>$w@%n)B-icFJAMkMw390wA@U3PN|`58eNEK zp4ZbwXP`%c&ui&9#?DCLdQ zcNZ7N*-aT0ypie)?nycX4E}S$(xvpAO&4G4Lg>DDRuP)cV=Rnw7>aTo#YIBu#9zKI zzw8&s$3u!mU`#x;lscy=W0{9C_$)n@7ldm3{`0ehdXr)k@Ry$FQz@EZqcw0Ar!S~1 zi>)O!nNNjCk!xuENr=8nAvB$!Ng^lN5x#-f)%fm72$pIQ8D6NQPf1A9M;B5zRHb`~ zA0bbwPwW#_dNn>tP=q;+u2;NfPgJ?n8D(_295rFpW|!OQu!qION5@2Km3kGx$Pc>7 KCr}>w$omJ>jBkek diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79a6f24b35cf797574a2db8d32609d38de32a0f9 GIT binary patch literal 5244 zcmc&&c~n!^*5Bt2$xR@TKyFMR%!8mPgMgwq1epazL=hAv3^|k%uTkEa&_Trv#@BQ0*pPNH|2~mVP z(*&JPtAc1f8VmqN+?wBe_9GwOz$1;jeli?I8EGnaxGl>J?%laV^I5}L&hX1)>xTCv zXuRrGH#R6z$X6SzH>Vdh%XJ>pK+9=!2U<)EX%5YiyMrC|eosoh*9HJRSBz=J2q*K7 zmg@o7uEpduA#($cP$uQYC{yERCn_@&X-p!P{E8$iG1~%UOZ}5SnFEsm&=@d{km&%J zc@d%$gP2Izj6w(mJOnt8O8_aLfWtgG%%jUZ0_M>ZP@Y1@j1^IT$NDm-t^deHO1>x4P`(+w@I!g>T+ZrJ4o(+JbAw1W^K>aKUB9Fa+t8e!M zCt)OfIb8-NKMG;d=L)neLl>JX8woW>oZ!NCKd?+00?#MCMVAi`h0Ss)vOjGN#8XG0 z(k8&TViDj@`F9%E=kD^Ou za%6Sp0EpN6gVmW!kY3>msozWBwBAg}pCgA)Umpjbf_R8NWC^WN9N5)Sj=J~=G%Y%C zQ)CFWH5NeZ@CGN%3&;(yf{BNVL3Y3yiZDCaVXT2A?+(L57!B9`Xsm11C|Ey946?8M zAot0AG&f5GM+j?3$>qUj2ND{N9Rr6Yz7X;&2P)L|V5{c9p#mi|-O-1}D4XCJ z7lUdjx~?Ou(|t z1%7WEA%hsq7G>r&@vUhKZ1II1JMymJ{mzZ{g)(S9EqrfevLk z0rpz#C8CXXfpwG(5wg1noeF4%juD%P1GC$8206Ck>jLb-`T9dhA)2u>TTY_Bfi0Lz z+7s@AwpM6ZaSL56eu`WMq+>hN;-K$TDbBMSLmnt>gTvLoB1;IuM%Fu_{Q&|rx%M=i zvCTkr%iFOPr*K%mU%F?m?f^=hkvgvJdd+iS(9)ieSzXoEut{9$0SAn0^FVN)j 
zfd9CAI4K4Jj97dQemM{Tf(Rc-HXVZfFzXboxp)r0Hf0Rt)y%#TN!PI*KoW5*=i`TzGHXk1YeVJ&KIhj>nec7H zO8ESIIr+uz8Q4yZGpZVR3Lp068+>NVBYey`3$9geHdgL;1?hh zMi$1xvTlFUVEICDIC+{Qx`EJ(pK$=?9&j2v8aG;~3sYXNL60WXA(a6S#hvpcD>J5I zq+Jv?aLiFMwlR|2d~_PJziLG8Z+eFg4rsz14sOJ&CitO69e%JiGY=mS)rl|Azk^be zzQN{yzZ!cwXBklu9D=Avo>;nmB^r>Dh83TfL%gP2$rHzdNo9Gmu;gHwc*4Bt=*#R| zn5nHLcW06vdfh6+6~h)22hyUjoR@y+%B$O=A67Q~joxI)OQ znu2K_4o1J6O(A5LZ$ZfDTnIgWm2AF$3biR;!t}HGc=aFws%f8&wd=hnsl;)gC6 zzxNWv_qC#5y5`s1yyPYOp? ztQYQ*j3euO<2c4{dFZ;&R5I-ge_2@fHasB_(A`g?^~M{l$FR>*upzfF@KyQijvD62 zdvL0RA7NO?pDmlkZyM!KlioAU9@YrSP4)G{%uESCMdu{mt@fe^ovjrLhj~-?(AS(u zn+e#9WD<0~>Lv58B;t8Nk=%HnwOj4dkne5vG@}C3{S<-@E>~!R0;w|L3KpW!FWdeWYL5ezyNAuRCuSQBQE^pAJ z@6nQ{H(^JXS0=oULZ0&m2EYnXM#-RuzbEziZ_1-rV3_g`!cPkn<*$ z;|cYEzgkiU=N|c?onI8_A0LoG9ju7LBa(U0uD>5GF;wH@4kL{Joh@m6;V3q#?VuRK z>-cUxVovPQAXH`l27Ow)hFTl856iBb$GJlqVKICw{8Fw8uJ1U2+LB^m&zlh4o8Jz? z?zq>ZE3%c?#g#IhwML#KzbKDuzBx*-_Q4#aBR+*z3XqCdXQ5p3zD z1p7~wkxCtN7pBACevQy^5_d1+j}^1LjuwD`PH zvgY1-1uYYCahYV>cz1w&KInAx45*p-ftQ9wYSwJ*Q;T%m)ttyc3*P8rMiDQ5yjtxZ7G$>ehv20K*OjgPs3BTZo;v8WRt9o+D+76~ z)%80T@>v{G+nv-RzG!8#w~$&_?`~uO4KJH5M{4Vz`te*YTe?V*Oil4d7gHEp0uw&uRvc%#w?nX&_3bT#o?%#jhXkJ<6&qm4RP431i6`k3} zsg>QwjnmHb{cN0G!y7K2-yqA8XEfQLkY`@o;bm7Co9Mg}w_n$Cb^sMrw>Eh>2!_AiTcIKEZ?dv{aw(L#cOXjuVk1$_B8Z9ti zX`=m1B?Zdgf1s3sZ)1q|7{YeDhn&XxiJbq3G>4axl**F2_)H=>DO1IbPmX6P5W{z> zMUv7HlDp2fN9=4*b6J-6=WFI~NzHOZ&kAC$C6?WdqpEa)L_|DBGW@PXKoLsB6GALQ z>|z*VH~TlX_L#k~7-2594@jz%HP?!idauJ0yT7h7O1}|Tj3KVB*Yz1%_uBKHXr2D= zBriEWH9^%+W;K{(D`wj@%J$ol46*ww#6JN4T@7SvH;{5NV-w?3lr+ImUK{LB)Y(QJ zbjWZVQyp-_q#qsx&FNSEkax0re;g5w(VQSp)|!xNkBep37=!Ts*ldg#|K_ZJE%3i5 z3lgzRE(`ik^ZNf0i~g^?)&H|sNpfJUL~Kh*wvP}4$mCL8MQG@x@R9v*mb8d)WW`P4 zrBEm)GY=~h6p9>$SAxPqs%_qrr}j zBbb5DM1OWZK8J06*oJYfPz14!R|GrN?kk+w`MemHiOwd z7S3fFt-tYFCrl8G^!w8e*}V_!Gb6*@Qsev*6iU`!3(us(NW}Yj=+vp~JdkbrQ`d&x zAJ05y%}b>WcT0nTPy__A`dX!& z_xO_owDo3+&B*)GD-_=B&@Y*-L3*UG$J`i~1x!s`StX{(Ic)uz5c>~>S@Y6bGLbXc zG5Zk;ublpK(gw>~F**uuCHsxEN%pU6Q;$gT&0=u0d9wD5f5wzOntn}?nx`u>=4ZvL thQy|)4^d`hq~(}r$7f_FrKOrXyAO39>Ld%21N{C6X9(suIp%lA{{vSbUhDt> literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet deleted file mode 100644 index cd2748f7d0eb609ee249493c7051b096c5ef985f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3413 zcmc&%cUV-%7C&>#vLFb^-gOsOMaqH-vJfGPh+Gg>U`512QN)!+1lb}Bf+d0)5>%p* zrx6=yFm{Oo<|#!5gS|v-#FyABK6{DLL^JoUs66$L_kHhsIUmf-nK|Wm&Rp*A62(S) zF%+Xw!!k6~@3}b)XQKBOuMq0)fV-oCr)4=h6nPMA&*F5u(>NVqC%d zyZG>R1q4A}1^GVeO$BaN8huihHd&XMs8MICty1wKRg;*cPtVfo_?lXf_&SnTk@Aa# z?7v9)!AcKUGfmbi<{`-!d)4gm6*Bb8Ns#6<8@4pF&|Ehj+*kL6E2k~t@q8D!)z}@9 zdWg{4b$XaLMg%TP8bN=_1;En|9<(Q*@J=ekSN;j7UTy$EiXW_tmBP38ZD86m7UgDG z5@}LDsNbr911pr^sP%vcLtdie<-MRtCPCe6oq@XT2||4Y1m;oDRPGGst`1-)3WO7B zHn1Sx7beaPhQZBgFgm9zm@l@2vyoe%tbI0g@vJ~cx5$uF-9`}29RyBwN5QbYH|Xw) zp|<^KSTtS+^Zxh&LW+}N>}E$emLP!A=56S*1%j$0?ckiP9W>N;18U_!P*gpF{4gho zEME=Mjb5;daD$bds^GI%iWt%pW2G>Gw*=Z@GffvTR{1G2sOJLMUf3Pvg#y@ItcHUZtYQC28cvodQ22(`;A}h$FQ$cp(@jq}HLeIQ zywpKXW(*YPm%*#a`(TN3J(PwAz~yBGdXakrdKPpB$1UD)|G_L+d)5`4bsf;-RZo$< z<{HX$LtxN3LFj@=$S5C9R881M6>fM5zm@k!yF7~92^}L~ZMU`5So>0NPH>^3${wTY zu*1;YXCbvQv8i1T&tv3QVco&&%r($bhl#qyJJ9d0M+k3yi&@TzW6(JB7j$U#ZRFk2 zK$Pe;@O$+d(%db8-njGxl<#?t96?3+{ir}2!mQBHhFYkTXP~`#O~lM<66WWOBASMU zz#ZR5gk-alEUmW$-^v(NwWbj+6fA~Ug&AZw&s@U4PhUE&Uj_0?-A$F`3k1Vv 
z46rCRbh1eAa-Lq%a1TTth%S8|PFVIP$SkG?lupm!p!Zdao3}otMPLPertgMl8^gdV zE(FrMco6r-RKuJ@yUEkT0-&INJn^W8MkfdF2Y-JTYJ1m5P$im7Y{?%8T?R%#?eWf} zX#NZ2GP@G2Lten8#Bw106{MaC1YuSIwR7wu!SjVixZXGmmhIa{FE1NKlvH`48rN#_ zlXZWRqmSGm19o>abIPAgZ1X&8HY358-eY!`PFKW{2fzq^OOqh?)*!k=-V|`(Q7f=L zi_oK|B!K!mPz>l#+E1~BVSmg)HzM|;%ns&Av)iBEnK6Q(-4Y1bfC@Tke>`1OF%oq@ zWlwK7_zG?6c#w48^fg%%p+wV~mGF(Rfb5v?Gnr$30clfzB_`edg19|Cm)br!3JGuc z69(&@sH0X-;5S!;8B;89XY$P6zH0f$KtgeU6j&~i4kA2}FMmQPfQI{bu##s+2`2%5 z>r%A9OlWS=&eE#AwT*2DI}s!9D3RKCl6CIV)uEfCle3H5)vbGvp6(vKJQZHvpY-kX$XhcKyb>^+5)Jo3w$7s<$<$ZsBMxINI%#|HjeoaMA7}gY2FO{C#AA2ebhm z;=qqEZrh6>w>4Nk#%OW8Cr^2-oiEzltof~OS7!q}!wF@rkmLoOWej1WxIt|R!d#oG zdCC zVrhOozfXGDju(lr)s`c@_jj?;Thdzt@~sXZBR%Uk6quf@OUX1*`8gy$kK`p?A{k$qC7eAVS`wf9g2TBiD21mHYkM+W!V%qIECK} z!Ulu6RWQqjaWmFiXRPr{BNu@S#15I-)6=sLUW7yr;`HEbZVcfD9GYcS+z=GUsr)&s z;Pi>(yd%BF#f;3-g!Y{n7aKi+JJ&{Gd{ff!z{d;bgJ2L?O!4?7ch<(=7$4)K(i1&gr)iFN$xY$U2d@Nr&&NtLlU-BBXme$s@>_Bd*Oy_!Fh!5?XHqJW-_oO#x zgqxhr^&dktT?*a{;yp2PTpd1-V}r6y_r!zcys!<+chcm<7irq(OOMlrX5l=1ot!_8 zk6pQ|FfG(=;Iv7oW*J7_uT{nyN{fBXJ@&YJy=lgQbGNASFO zevUjY55@o(H#@%FRrYIjmd%K#?Xi>L&tew-bhaqo+Du9a#WgNb6yAD~oX4)=XLV1rqWIfjO|` z1$>T=cbMPGK<}tP9;Pg-dLjA*{SNeHJd&>mTgCM_^zTqJ;pOm0(F`z(^6E$zrhWaS*s4`2d$cRsbttCLDAXK*>{iShr7v5+k`-uwW*%)*8dv zJTYj78p5+hAJN5{$&fDOqlwMhKs+)6D%=4qHW1KXqYa!18lb3T0e!*pu+x1itWL9n z1%tuhu~r>8nTl|AVKtP>#(|nqJ-SdWL|QGCpp<3{S}hkKqR|Y(eo=;ISr5oqC4{Z- zkAii!KX@I|gkDb;6b{y*VJQR+=cVDgf+DoFjt3%tJ{UKQK(d_{EUYO8L8S>4VS121 zt^wjd)xcet0|R0nHvH`@*s(|n1c$|t{P0H<9HRiWgf@gGb0FP-gpMOez#!fRoZhma zaoa@D-Nu4L+1}87OAfj&k#MQN7}-}8gLc$8c(=|5wC)?hWuJ7o^)U?AM!P_Eaw&X@ z?Sw3GITYH@h2eb|dY5<)CZ(!^X0<6ieU=0Vu1)~$urcUG(XU9w?=FhgLl6p&jWP_2S#@ zCW6VpT?i%4VJ(^8q3AIjDUOYAlwh$oY;?Z^yZS4Iek!b z`YqA~M{MR9V^m=$gBG_nLyK-CYTwY0Z9IX)bt7Z+cC(sK&GvSco#wywfPnn-kM zK^LUgFGG#$Bha7}i&ZDj2etVQ(0ox9*V%Lg$l3?Ox6*9MUIabhMf|@3rz$>~xA&>L-@VTePv$c|Av8u7%c!K9t(x7S=9%Af< zpM@xxnd1wI4{XUX8@>jE@0wW(R}mWd6^C_$VlbXJ2Uq!;3p>^>$BsuX#gq&Y+2kNb z#m80=?|HrCu_Fse@468AlB#MYht;m=VC)S{O;?j$5Tu9R_X=>)l=VbqxF;6(N{lYO z8&Y_Z)Wu2Z^Wq#vk7Y8(b+GLdhpdDt|v3_)A}-~wci!%m-AJWvxN$picFGg5 z&D#L_$yF@JXb&o4B!T1faEA05%|uyp6BlH00`IOGV83($(#vut{M-=E?%Q<*nx=JQ zExY^h+PNBXI@_}d)5X{Ek_onW@B8oMXEyGTFI2W5+im<<0v21_n0Kw%l1$#GE@PN~C^3|6;l8IA9|oBuF2ANv2*3#8VyJ z**zP}fjh60@S8st8xrfGhJ7}2hKn~6)W`r%T~pz(hmYV_t9vo!x^POhybOfTWx&1U zJl=E7h!usOCoSxCh(i|%{M`&v;ac$=&X$N2eT}5T(#g=#ABTCJxlS#xi;>d({wCZ}i=7a$qydZGT+Lp! 
zSDmz8CC0pCy@|<9*-As7x>%eKS20`I1o^bH*sSV=h$|$eT;Gt`;9$^o=9>RhlE`F}9j*u;Q2I@;J zv1^{X#5YO{SfxtVD8M;^q^}yCvb?v_2D~EfA2%g}DLtQHe#B@gWou~B9scoO2z^7s z3o~Z5h)Zx10i*a9MH@Ea)EGV2(?uIMvE?;wvn`5~lBM~kfrF=uH*J>FuuSEPO1|Bq zsOMbn+FFvlb*!<^*=$j1%Jy;7BkvEkmTumuZjnR@Ecd7G*0N34b8FkbWzPhs65AZh zvb4-euJwV}+RCNMVJv%HRTL+WBJ@lFlvbh9n#TkQ8vt~i}xcS(0){_J>+XMb7Gxy&)&GSmKw z{<3P1mL#6TfPs5s$j!D)hpUrUIDPZxEoqQl>wP9oFe}9%r{4ekz8ULShWi?VE?2rW zr5NTO4gJ4I&mD~}lVA{oOCk?3k=qi0x>f1MNq8E6yKT!jaZTRwsQdjD%e!jwo8q1h zcjt-^7qqN<{rtiAU55)#B?2xbw5=^_-9#zsdz`H;Zr?1gWuIqTSJJVSKPC9a*}BrM zof=kYLc99?-Fx(wRC;vRmz~Qp_U*~DJ96Mc?)0dKH@c4;yjWpdX`Qc-z~9qDN2>soAly|+2di@TR7fD2?NI`V1&P{qX~LYw{dU#M zkDDffc9q2Mi=L4xGLa%38m1-!m8R$*_<$0%J8kmCM+;i&SB=OdY6sBCr|u2DzA0EW zk??RpiDZIscQu`G-BH!a3BYwYX6%bTP*oR>(Hn|27Yo;HlFAe$Jlz;wv*bj~Hp4-T(-uyXMP!_eXg5rD&%3KjKKEG8#E_0D2!nT{!v6oP*gPAKg6H8 zUDH?TBChhFq4M5Ix&u18NoW-oFOjnU z$dmkEdz6&#Oi`BArIa_AC;Mt*F0vGfM2qP!BMu@_oX9dj zWFryM0LCdH= z$&lIm!amB~#XQVU93b*$>?Pr8IJCHZo;xpH%FOMVW)!+4_0fD*(`%N|-Y(|hZmtW% z-CZTn>3EAr@!5a%Wj=JVX!%n{-L)Fy_GBf-ho)Bb5w=4eJWfv*wa9l0jPKibeYBErx+GBP|)Bi27M dDkwZm!({4YlgY*cM0k%vy)U9N>K}8?oasHu+wOkvJMWjo z@^QW#!?|$oT0DUxfJ2bBbIn&zYd3^~Ef8W^Z>Pc8a3i zL0QAR4BpdDZ%oO}PBmC&>U9>KT?Src=x3%FvvRWyLOQpQ^fpqK(aH;y;=geD)+7t< zxfS+W?l3LnS=j4I0UiG7T*&la3bpM#w71L#@2$h(`|GywWR(XzX!C-UVG>le!w4&9 zNWf!58yLUy08lx>V|xlp9%ew|zJI~Bu4)ja1;UQ0a`^I*11#Lcqk%7-3TaE$dFgFJ1{?~Kx~YLv4sp=sdERLp)TMg84H&)9bk200L)nt z3gg=|VS4^xuvzZ}H)CpHn|&S(Qq`mLwF>0cQUj6|5#ZKx9!&d3g5mE{XttjY>t-up z<*(mBSXnB}+U*J#l0;C^z8Br?jiBLNAGp=e3635a0?d|ipl;}ZqDVK0soM(j8egcS zJYma#23YjG4(`DixS`>wyUX5(RTCs2|5^h@KYougbNj&_#vQVYY@pPeg|>bBz>3SKL#mir&K2OE-H5xD&a8mI^$sM%9saXyVakXi?^%V})0!#f>!FQmN6yfDTHwTSHeIv4w{1ZE&Y} zJv=YTp@*mnsGw1!*^JTwq90GxBj1cxro2cbVm*_np~3ZR%E?4_^U{3cu7uf?YxwWDWA6exA_?%g?mZ*Cz!-@sZh7 zM-z*#jz0;d^;6LP!5z>bnMc(Yje|ksqM`ZXK-y#JKIj|v48EIL2Xv5{HgaP@oLkHs zoV8BWxz-GSZCe6f*PPiewoRkT8+=jI&_=rY^m8=*+H%a(3uO$su$cZvx%9;o-E|3}f$XD)cwNad;>mz0nqft~ z#5TSA*xK3mb?E2lB;lm}Wpd{Mih+X$y9{x4bN5gV_4FDx+}me_O6}|SyOI6@?~NKA z_2x zE8Ac+&CM}ea_8mc&tK4I;YW);E+|~QWa+Y^Pd+VPzG7v`XRB7PSzG$~I_3J`Zx~j# zanl!@x0F{@ZvArG_OGgT?5wWYwY#?N>pgqx_ciSQ=Gy}Y8=IPATMo4zK63Qf@wO8u zPn|w<_T2dk7cX7Da`oEv8#mkkaO;m&e+x_?zO7s@H^HA+6a2Z+1d2N^1eAy53XZbE zKt#`%)FwN_Kp-L2%w%PlExl5+QpqcepVUfLdQ>8FNCB|SA;$1xZVJ*d}E)kC4(?KBnxBluIYo>k^VR^gtw zMdV)9pQAXDHi&35GNBg)c`59nyC)vb*uD#~FF5|sWI-yDE9Bb$G35V7ApC#1vA@|( zk}i`=MM|->v%dsDu8`UCad8vk1FaWJR>`Oc`ST$>&rif(LeV^*$A_fx;XI$lho%W< zAy{BMS%va^B$@HPd&WCqX(kqkA%4Pw;VRWAyao91~f8A1}sR^an$ zvY(Ue7vq~8KP6WmK6*~V)VOrQ&yL0M&CkSxU@ufKf`h0nCe-p5UzO!iS^u%<4AZjEOSJ`1tv;{A;YQ)7&YQ-##AzlqlTqCV^y4=xs_ z*Ww|i3^D07Q!IqRM{0?@Ae7hbe`2IiZ(M9l-KFRGab&5Ay}e?Wq6^IRwW@x>$#A6LUwpA_9Ns;^s#-c zN?K!;1euFTmoqmv)#8(4GWqCoa*TN{^HOun8AgMPZ@>uO5o)P%$Vdir5fE zrHC{^nu%hgs$d0IBUaX+f^MSG>{_C6-y4F7o8*uEzWp}$o4NO%dr$kFdp_oehuqVW zAsIWy!JVNQ7z|*za(T^**yrnePfWR&bgVbyQOb~p1?LXQf}d`xo03}~*;L8^qs>Un z7$Kv{@ED$+4VcgaIud%o3IK#7F*hSj^nk1r=`o~qJn%H75z>n14`(;w z!Tv!pkqJ@JDqwGs_!*L2AY{+5-hUDE58*b5nRc-|bA?hN&J+E(pK{8a0dbYHaa~w7 znNj*PysR9DPL0_mqK8j|gToGz3w8Fx$oWR3N8yjC-t|1(n3zSD`FDy;%`Xv~Tt|WB z^#>3|p2sJ597V5^$PrUqGtpCIx%=-lcMzo`UumMD2g$ zhSkPlB7r#$&Ldtyn^ljnw{HTa4eBs){r=5r^QXOEZ!!$T?S)b6}{1Y--TMHVFKf&MqD}k`JAYvFh zpcB%_V+*(Q-ef7^VQUKPK2uHYDV&S<)mftB#`T1C@jGJPg-68H)5G{9QWJ5t`8WLJ z`8HGo|9dLZ!k0J)N|>Az1grX7sKKk3f!Wa$yg}a}^x|J^vjr4^#b;9pon>m^^)?;t zRHhL_=64ZB*{$$sS~FUzaSz207NDr`J9x?W>u{!@4G4Bno%$$Dh8Lfy+`Fk+6c4qErhRDj_@sWiTb*BIu%eIsaa57 zp*=0a2NfsY#zSSp`To)c!nc>Cgxq>1Sr#)NC%tq=SAV)YsKD5TX#3QKp&aHJY}ak& zyCrL=?S9{l2ASj_OYJP4(EAA;b(a>byndW6^BRU8G-^_Fb47xl(8Ws4~T_e0- 
zY)8e4s&H0Ii>CJ$1r;eN03)-d{6s1ds^9O#b5}I-*J)5Z-SKHCsd^c@lr~OX^N}ee zP4eU|x$lI0_U%CaU+#g;T?U|3iiHLq!)WuYOrfaxI9}}c1L}60K##oh2!#pFgcoXT z_;a?b(ToxuCF&lGMz2mqk4UpBF8suP z@MfIn73OWzJI) z7brV7Bl6H?lCXHOU8gI5iXc699cm@&`KB|!M_)X-346CY5&8x{126h1f6eeSP!SP_ zt1>@D^*c4yZ99%ogFYV%t({3YW|lvi?wTN!-MRxkLlcc-z3OoM`U?J{od%TSB4-?s z7(iBj$*5f(F@^6C;!0i<6!BwUccJUYR%5;F25psB z5fW+FqZIXUxSD^T*vT_cPu12zJL0zyD-Ya3?aFN4=p&wBxA0qDzFM8=z=IqxE1e)} zu+;-=;T+WW!*d+Ge2YQzs`W08LM9%SI9z+LlYh0I&N8PiSw`ddkIrI(OgBe7#IWhiAA;^!*}c~bCfBP(>%q+k_b zT4I;CdJRDjHt}i5TeFt0IoxHJU4BZcNMaRsqalCodJRKGnnYgkWxAG$XNgZ^LF&dK z7J=t>$qUzQ(w!LBd!w;%eWrn33N2L}NXr`Gvem@5=|K86V~+xt97WNF9ix3}!n&G@ zHfEbIY)X^bAI!+H4(lxOZ9ceZ&jjdhk=o^u6Xsl@7CGF`%kX|La66bmDjkI)-s)Ut}4HI zz2*qFT$k#C){PSD#kbE@7oN>DwBMlTT63Unn~7K1ytbO6^Vt?b?YXYC2QTiKsC<08 zt+x2`e!I0oeYd)jjzX7hrt{nDO0O1s6uRwoJ5qM7+^2T&o%SQ;-yB}pyg`53SBJW4 z0Y86*)HK+qQLI2+( z878hWkk#wUE~)6(Z(HgQ$%7p;@dLl3LoiSXZm>Kl5OFOC@U(y7c+yAAPD0;MH7S6E z9?;=<((g}M2fwgAtYUkWqyCI0X?!3}ks;Bc@%;?&F%mySvRo6{Ib)V6{oehb1pOnd z(v0yS;VFZ{LZSi~k|l~p_&0phNYDL{mK{qOVLX<6TyTchulzBOSzuNpeM~+(|AEmD`7^&J?V|bZ%!a zm%DN+D-W!!q}nPu51b?U%;lra%_p)O#~Ci1Y@fuSq-mHFnZnHnmhb8q5I2I9|hRn?m9tT)RmOqDU#XCl8*Ik1Duc6%$8uJ_3CTY}>P)rtWk^cu3wR8q z@h2U?^C2>1jf%wGQNwN_Dx@BbLh7-VldyZP5`ZURV;Xid`!wuuU!%RE#%i*;%8Uk= z!PiU%W@A}}-BM$^lt?KiyYE}Vm7OTI&=KfH-zgB-ZoZiM@7TNqh*GPixh z1=!W`01X5kPW&PoZOigS`A6E}pt>3zU3?KQX=7kz-Bf(BFb#gwbYjKM3})kDAJFV6 zMy*?p!Ij2!@VKd(35{BWlj0LN^V)ur8^>EvjQJ4T+#rz@E*$5*-qFw768r=A!I9e# z7>T%z4<=xrQJAUW4?xDCdvGfDC+}<5M{|MT3-JpM!M*KS;HysqOK>ES zU6>4whiBr>104D)=L94rg|T}FbV6(3T>NQ6Dg>uyL;IKgnZT6~QP|Qw;FtCgek|Js zOj0ys=f{Jzrjgx0<88@3H3l6R(#DMZ_-|&~*{e+Qp-`_u z4Ri7Cs7qc8OEp}F*DcNxt!GYx6B0L6z@qD!T)^Ua5V5aa()$uZoqv+6X7~uA$BkwB z&hvr7zgD1koQ+IS>9}iTuK*7?(B61gnc$1EKbRHoY_xmz3?qFDH0wBz=oWx5i^X}#2`$5n$I?A%(; z0d@Ty{_}{F`N$FdI`y&OjfJBZElwPxae!h;c0EB}$~%hQB$y2Zb6LgmE|}_j0V*E5 zIVQ9FW0hIWHp(|9Cu=b~U0x=Oi9TfVYC9PDdSCgkKI(g~shdPD>sqiEKFx0AXrgpP zyq zXF+Kl6Azv<3F}MnLnyxJ`1xc(&MH+(-GBUh@``x=f4Q;hS9O!*Cwt3TwN&2PI}kvr zQuqi31rv)C-4{!-iH)KXCujvh$R}Sa*@93jXe)(uL8uf`D#bM|G035#6hX+MJ(+uY zG82bRia>#ciSvg?Ma7drT5cwtr_|CmjWz^Y5OlQB>giO}3(<5wyF4~Gro4D^jWIo8 zwth-M71dkwNPP2Vkw(Nz5kW{0MBI4dkm|baca{{#*^C*Lf`RTA-IH{P=soj-Nt5V& z0&U!>i=n&YnN8NTE<>3)Jww$h_S}v!$Xrvk(`>he#ArsujEGk1Q~*yu(9Qznk$|AS5P$f(rqWT|uQ_Nl?~+2r5f!l}$j_1_Wz0EEbB2xD*7d zsGuk=c$MO<1*{70A_^*kMas2Ww_00q7xm5wm$rUw|M;Hgd%io#nVEOyeShyVlPA9< zP!Z%Hrp1BcnJTf47={8^ZgM&lWX{bGo_gdJ#2!z}nfWUJ;ZCpEnCX+;UitY5M9Oh* z&TsJPSsf|{u|#ZSE!HD1j+kTO43@mXJt=Qc8vq2BnDlyxqM%Y=eiNC9X_*sn^kh2PFhRvp4%?4vTLPQ`HTgJj$4pKpTmbs4xtwoE!Z*T9(Pe&E1bybMrSMTBgbK> zcymfLyr|z!@hzt?yEnAKzN0VEC{SUSc~fD;R6l6EFq|?h zdWFoFeFeJHUctZS?gPrzo=OqB18-pg{mr~$&darF@Z*VG`10fdX5+RG@a8%P)L>Ch zjoe8BE1S2M?%Im-Q+Nwg26PD2{FY>0xayWY&u>`%b@P$NhG zDnflv$z_YE0{i#fD5Fn=5Zu2UnU|h`+kVX`eJCGAA8}=_+1aDj;m*J>F#yJL4z`$5 z%S4?DXM7u_u&#DCT4&9`*vm%Dt}}1Zo?&Mw>pg3zPdmNf8}%&Q;OB)ZzMfC_i(8qm zzw&1y4kYST?x`{KO9(|fGrI8z`6%w@cuUmZDx(zRm(aUY=HSd{Ug*-R9{q>8r}+78 z^Z5Tj4@AYrC%C>EUE!B+dy&572IOG4mZKALpO@6b<1guG;O!0`i5kz@;}13o_-fm+ zoI2ea{9TVZUOLrKKUJ_Fum9*SFJz69Nt9NCx%DD$29p5?-mb?Ve0q$Vugh@ip1hzl z4}5}J3nqy4Zd*a-B#_6DqPl8l(Im zUhEw!xPSYYPV>-Ha3pXOw;&>NMD=?XF#hHP#$fzfRKaz`#>T5DZvI2?-joeid3!jj z^bdKdeHwn?y~%>(qq2rD%Y-n5qNy``I=D|lfSIRsfuHJ9xNU2$K*Pi{`0&~`YX1~7 zUGr6?wBw9xRHcPC)!KhZ&*fl|-WG{F)9e||F?K3I9n)qpDI2|IGw+sCv2lQUSI!&a zF|-KdFOslLHwMpiZ{d66z4$llD|P;bE^@-n{cv9*1A>1%VG1tAQ3a}S?%AAb5Wats zj-EOU_jp;Nx-UI-ZD!=qyuJ>K*JZ@#-uQ~TTGNUp2U2*$tE<4|kqE*oTd1>LcAPY- zg>m;ar}tiu~z#HQf2@O_^!)y>LWE z1ikMwvGBrZH}2FJANmTvikteP3w3;xjZI3A8aiKI1s}KXH}f?gkE9z;Qs2(3qQX|( 
zK{JQbyeY54m^F>}P=8q}on$Zq;yiYugy{#U*yBKz{$LB|7nkEuhfd)?hpBn*hZ~`= zL_W0Z?m|lq3Muz}2d>mbp<>IrsGQkQXS4L-R0oQEZoFPp& z=Ock(J<1iu!=;KF)OwDoD9_Lcm85?`E!ln@olPs{j5`s=EC zM_o-|yv}TN_t!@_>7@WNr-&p=ip8kw*?s)>qkO)9kC6KHR4N+YyAhcND{ zpR>(y8j20lFy!m6MNfQM$6A~{3;UFKeqnOGUfJywWGb)S%x)jU;c$fV5Rp4Wz~w3aiO(w zT}@RhOYP=0705hx6mA$F*IpggykphIiO|b>=WB!o&F)2f4ehbhQv=eXa^k5wOsp}< zu~@y?_*3;Ar3T;^qPCP^iZu4m3EsIa1P_c1s`WK`o@)qHCDHlJQ3JH_OGXpW5BaQ& z`jO_BrFp{P%3|-fQJ$+z+;R#z<#n0URZ#P6!@M~9cZ#u8PhedLM4;U4Q%U5@k5Z*p5E@xL}|q!i>vTy{0z%CZ#tmn*N>Z}FI|`-E8KmZ2<0V`6=Sg#6NHs;vfub<3~p zMXz0ZomY@Zjivi;Oj1kEmp8~XzvSrt&{$|RQT@}MUq_!_GsuY7_51 zGH)q!jBVc=-zRU~LMbJ-GkOuJ>pz*T6RM zx&4jfZ_=ozX_P#FoQW6@5K8ZN$rnEi)(y6JwHN(+7BTA(r$b`5_zmMQg$^|BU}P zUjHg>S)mh1IcZUGF-Z|(n$T@?{0G)+l;0)5tlsaZBF18l z%9XXIrP>o>nLQ>R22aY|h~S&C_*%#RGg*-6$xLLb|Kpz&|0Op6|8kH2Yd1+!Y%kH1 z^CUlyH3X2ENQH`^py?ql0~brWm9}G>lR~LbC}xm{ZGH+xra~F3@Kh*b6&|tLH6>B- zXJ;M?g%3Ltd2l3>cACZ_ut19Gi^tj7O(K(N0p6_ak;(Sc*p5Ie6e_k;hOwe{Ut!P6 z36YKg4v`_V7e;$dP6!JPievYa0!e(IEFhg0&qE7Bf*@@GPdjDz-my;$4{=J4_KHC3fD0FP literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet deleted file mode 100644 index ce4390a418031143882c26e15a5cf27824bf0a3c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2745 zcmc&$drVVT82`?>4+?@H^wwLfib@MQ9t8!_RrHpZKt&!R7G+py1+=u+wuq?nITu{q zreNYjTxGdSGbI(Mx`Pg*l+**{We=J$HFYR|8-*>*} zIls18DoqhsAwp1F1S~)#fJ&Eks8k>Hd|9>tLZqNc5O^Zy1zwj9@v4AK?41EXa117# zM~DYQJJJcPHVya)tr%QmbJ?rwZQgnTYk18%q}hZF83xspENK%+C?H@UEP@OGWNj3Z zEW|wxO=KAeMkbaBC~^LQYO4kDL7bv0AQ4r}eH{&n{n1?_42e}@$C!x)LmS&0e0-SQ zUE9>?aQUk3HlHo5flM0g)wOPCQ=>~s=OWTvLWWkx@E+Fv1DB6Ya;Hkzud)a)_*?J_ zdm{?xzFY*Jj%oN{{XTZ>uDkGi#~Acp;--G8;f1hc*ba7f^j3(i(zC_eZlKQmPvJtw zdUkj9*?yyvPcbj%kA#$S-$En%DSmh3A#^M5Bu;f-3tD*Q6!fgPgg$xW2b4Nc##`KW zxYfCx369U?cE5H8I^MsHhJyu<|GjrJd0g+K^K{HU) zMi0^r9x^m$*zggtdP7|N$Wc!vBqk*rQ&LBdNlPD_F>d_RrU{u7CrRe4?3~=Z$@v9^ zmMKM3i%Uw&rj<{xm|?A)IjicK>e;rMTDxOTU46scdCo?c8)QLN%@6f(GVAqa%Ajsq zKx{V)a>T-xMPnD6Wq>7|49^O3!ct;j88NWDc7-EMhLvqZrcWYl6;a6OltK3#HM-K} zHe8O>3D^%Wll#8V;Kl|Qg&K#KbvAf?K@O*bzFzWF+d0j({+e^4hD-eotqP1iT(QT# z#~2~jLa~T|PK3!p5rWW#`y=T>A6yhw0g0#z=u*6Sbe9;pkChlPyy2$sB)AFh1s|uw zRp;~L=f_BM8X2yt4Lj8Ya2~b(1fP#fP`fq_MBZCl?`X6MEP+`)eh97gI?LXuNhEhj z!(!hm3%p;sFCJ382lXdEB)r8$PqU)Z3ySENcnI`FA5HAXQ2UDGzb6YCUaQku{^g&N z{|VgxlN#vBY*ZnNpf$mL#c!d?^)5QnO^P zlX4}gPRgoN_RK_K3Y}$1Qa&BYIxv!zGWAjm)Q~iF(NoFE8Dx@EG?}imnkmnrjF^@r z3uR_2T`F?PNZ0dfQj1b*DyBEtbH~lI&M2*?^2TBkU&~y=6nj~U5fTI;zddD2WqrKJQc99hYtaGpexQr| K`zt1YTK@tgIEX(0 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909 GIT binary patch literal 4341 zcmc&&X;f3!7T)I$$xQ%>l3)TT$Pkek1XMsuf-RpYYQp@b*jap*jBCfX$9ro8;Y&3?T@$CTd#X1XWGNr-##a~U*ak8G8Yg6 z2f+*v0VRMj09whpi^mVWOwzBOKJL|&T>m{MUw@Ije0LtNw8Ca?-;#A#&j>)k7YL1L zd!B$LU};!`J~g5fQX|#?z}3a*mO>cb3*ru>As|Fnz~YI7oB&y5;F1toY={6;rBqvx z>SZJj5mSE`N@PUm2m%b~hDUb*@|>rB$eBRELBQg$9YBfJGQu0~7?+Q-6;dM_A%%Xk zQKb?5d6icFh5(Zz=p;TM&{qnKS{5G>6dAW9P!=as3Z)03flHP}g~vxk$}#mL)$ftG z6&F9pH2#Y%{}Sl~bwR(nhu{rMen8KaUOi(uWvql1RvV$JO9EX@%fYC`5Wc&m2(Py2 z!IQR0u*`su4jqnyoW*?5+t~(D-|7LFs=$j;807vK3VtWv!Oi#OzzVj3!#*M?ea?e5 
zyCi5`j3Tx|WCP8|%;4f4XV8u?h8J_*qpzw>AYVg>CN=5;@zfNkD0gs3C7`2P7dR8O zK!xuB?JIb&)z2EjvK?Vo*9usaqzRk?6}UaS3J#1)gz=^|=vtKq(rKyyezptfG+l$} zI!lQBNfjDLErRXKH6Um3B+M)ff~7~b;krKy_H`Xc54i|xuPQ*#XccH_o&ZGg3^1#G z32Cl6FuS@0L>1<+AJd0o^;$?7s)k1}9Xg!_*uzb>uw@P(M4vlDTK{)wMf_+uM(9FB z8VB-?NN7880*q3e!1FB&>T)N6IF|)S3uSQWzA{|6LBfr_X2`9)1axDsz`z|jI4C0YE|%1O9op94L6=LOB8#!n z*xsl>Sou>r#?iMYD~j5o`s`bz4IY?Hs~IYHRYG%H8lg!XgU+RPVC(B~*pjpm`_cL( zCOqnl?`u|s+JkLyKeGUaa%1obrt7e$%hwU1`Rc6x^EJpk^bD~#jm0AM{jrJmHRQ4j ze&n8(XHYlG6H$G3n4%?y#|s+3S?3L0vgqYLelmsR)1S+!$!FkAg)1oe&V=ys#@MsP z^^kS>41Q~#J!Cd7$6hv&=*FxIkYBR^)oH$jXysF=Ds2XgpWzOTUyZ}{Hl6_GnFH|c zl4`*1%%P$R~MOXW9`!f^R zI%x@5<=9L3T7PTOsB#+~Zsv<$gjldCS_bQ$xR7H~SAo%&jjYkP5qkMA96(kEW}i&Q z)mABjuSyYjP;j06;>0Xcc08O{T3N++5A#8X6YgN+#oFv8qQ%_7>mpn-Ihm-4^2ZWi zJEI!|_ePgabips#x(LV_i!`^Zx3S&UDl6_9dVogj7a?Yr z3g;TI6@D+!1Gg!Z?%h5V!ZqSvt+8e=*tVV*q3|VM`^XTzIUPZWzPbaR)6-!7lb!gL z@9I&z>@@_306J@+gqk~ium?FyM>&q!f|We7z_`C$gP>opqc;!32&`<4=$`#GVwC>YoCXzuo&#vR$ElRgQsNIUPq4p)CAf(60xPNJ=84Mc&_-H zyU;s6VPf>WS}ZQ3ioJY?COLDtGbT%r5he|V{Ch(eSe$paIkz}v?C4*+QRk^tOk?L+ zzV*$`u(IQrR(P)}CM-IS-;c{DeX}2<`Qr%6{+%zmt+5{s7Df{hD&t|lZ3Y=O=QtkR z3iv?EIXKhjjP~v>R6aR2hN`Ud$9=;&(4kz8QdDwrhiZgze-M-E7i+LN?Uj7+KF76s z#b+(6@jwle-l3e9EGm2c5iH?c80$W%h6QkS@M~#t&{*>( z6cJi{y2IapjgUPFuZEb_R5rm`2^b}G*q^!{r^e{}oZY{EBb%q~Qs_{!Ax%MO5z=+G zWMhW1mNZi+DczK%qVILYr@1sOXRKMk#X`w}^v&v1V;*-kAIR9M>9BzkNe^b`>A2+U z`?eg+`fQ?SsY{WxEW5zarzWJkr7UNs>C%==k>jDJ@wyOszi-Q;QblrE^ zWm%M6E-+eHpUjzj=xUK=SWjctS=q;osD!C`_rob1Lp>PgBm6&+j!Q54?S+ zF!k7j=1mIf_PJA!KWxiYw_U3^t)}+>SRy<$HjW(>9>mZ& zn!Ba4RR?ud-{^|J))u$Q+3g>%*^m@`7;@% zt>q%5Mr;@|`sY>J=p!=v+~n`bsPYGD8XgoG95+m{X*$}E#P^KF>Bcl2{d4%g+4Qf_ zigK+$$ckMS5)>g55Hu6l!~Q_%^)x(=syNWUBftt1eq3?^YuNZ>w#?Z5ei#Bw!1AzT zya}QFU>MSm(L0PBke(VHUyF$^-}pblf+{7_5PAHMe{G2W8S4LkxxV6m>L#h?8>muZ zitgxdZIOmhQR3w_$J=K3VhQ&Xri@rgq!Ni_F8yZ2T_Q=8NP{I#5=pScF<8DPr442= zBS(qEmFemENKen?!&oK)6G$>=wV|o$RC+LTwhJ?NOk~=bOhbp3NIaNE>dQ>!dWjh` z4-2rEZ64sgAU@D(T9~hoR|unz@TBuwwSsQs@f_tr=p<+vjwc^7x{v&0{k*Lr1D%5< zGR9xNJq?Ez-;uf3r=K!&H>MeeE>C?ppD=n&8YlC%it_cD9p&dEhfeP|cNm}B{EzdY zi$%*H@?lCTjSh5P5XUGat{#lN+$eEm|GBR6deg?8*{ye^0C8{rgB z?<2>__|x%eSLSGjEkURiEsI$h9~5W2EIQg)784VdsFe^D6B`;8sby|$Vs2t4^3VYI N@PbAHz?1&N>Rji%PXJ zTG1+MZBcP|YXh-MXq_I-X@yFY6y^KBT??a_d$%#}Muh|Z$YkMf;FxhU6!!dGGAsT4M8 z0CpZ69R)&oPP>(8L_r+_xszIDm!&i0Oqi@QO%^$&pj<&k^lHUVDbq{1+ZynchDf#Jdhj;x68D2@m9tjC{<8Q&A8y z7xk?YTAL@qZ*5ut z+zWuhn23m30xmRo!>*SHTwEh?zQ75KGD0zBeiVka7GPXi57;er!PVq?tm|Bg?g87# z>3R+EYTg9b`El@SK8>Os{b9JTL{sN+STae21;6e@bWJWMZt=vKOgUD!enoC{BxpR< z3D>%~;NXFt5LOLEU}GC9;=PdEuomh~L0H52VpZ2h%zV~>n;3#Cu_AZl{Wr02xGU6O z#-ifx_oSet3$_a0&{x=@+E0d-?c3ouGX^P-<=9cx8`>&4w$$iw{JJBKUXrV|RGpC`ZVjZ4MJA!4g8?ibe95+^QI3#@{JaI%E0^L~RWaW)pq#rmsa(jf?kc}~WE%;}-zU^o z$mKHMOs-e>Hd*%34B5(qKVrwQ6r#8_n6vNC@g?G3#CrXLB%(nJXMH@ZEj* zWm7yH(xXw>-Jkn${2t6Zv5&tzG8~l$CUI?hW#rD%d=5_Cr;)(YdZXPHOaeN zC8X^q9?*S(z_*6*?$hlVh}`GJ2fUxSpZ^`|1a@UAfjv-4KNF9RbmE_$Bg9Vj;1*r) z=LN+e5RMmQ@(v2Sj-BisIy*XbadvSPm2N7vdsj`j?mawudU|>LXnXng?$g)Lzh6LL zQ1Bc5LqZ1(92EBE;345}MM#lRL!)D2Y0N+NGZ?>r=CQ#iuJ*)vjK%_Oo@Lf3beU#=1?Lx70U$x%I1U+Z%W6{Cd~!J$su{ zn!nk%|G>dRhg*&uJ$C%W$y2A#oIQ8`!o^FMuUu{Y_S#>y=TsVzv*LMNdtkKF=NlNd z=bD7Ve3Q9DZecD1O^T5FdF3u=cNn6HGvc8zGSJu5lT%31)Wmr>p( zp;Fjn7|OqVN|j$k`MX_Thw_fkF~2a^kY~0wEGy1mbJwnLr#Szz_1E}(Rf6iO5UAv) z?8&)$ohZ=0=EJ?v9(^=PTU;V&6GjN9YF}khZWV6dhM3&%^%pr&o-~+g3o6T9uolWu z54Jn;ai{h@n0<@mKPC&Mpw_69{>Q!hPwJ`vmlv!3vtE+&wo)l*70N0%S3s>%*-NRZ z!_&g74@Zs5nJ2|aQl1o*XE}?c0>juUN|NH)O!u~#?kr0a zv%m~V!{6x}5HOG~qLbs;KB|;%=}=0y?NdjMV*3O( zSW~xzZjEOOJ&QE!(n5^sW0H*-V=SptelTizbbn 
zW=HwyQe>(1o-DyKEh-~fDrr@+Bxyajq@J#iDWQHWc`|)!pQ^H}u}VVaQKT!LT9Rw_ p&n_zR*A*8VOFgFL7Mt>o29Kc7enI^L)kzxQ`3Fq`kV5}0{0G#3JWBuo diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1cc7b2c26517de57e1a2b4fe464578f4f87a314a GIT binary patch literal 4035 zcmc&%d011&7N41WZ*oJ3AQDIb1<`~>kVV0zh$QTbia~{75!r;u7Fo3_1r&`tAd0wy z8>neokpnB z0fkXQ^6y;nEvOs;7XfO~q&5JQIZytOGmZlm1Z)=52@Ip8G-;?d+T^2bdE}6UN8-Qh z=uj2>^_CL;UI9{8(1U-dz?2gnAC?>&5tkGk9+DKokx>U(cyL&JOmb|TvZev3a}Nre zG2v5W@Lz=dWTn}<+?~2U+&hGFKD9CRCy_0kB``%5^+^Q4X(cbuQ@TTS<#xivzN(;Kc(fYU082F&6&DOBQlLKcr)`4$? z39u=>8)ejB;Hs51+!AOH9@`HhUb#EAuR$HKwv7PQ2V208if!Qi#sqA%)tAU&@)Sar ztIrr{t%5ePv-sXD27@pSLMDz?L|9uOvAgjJ@T%j1eQ&*x`b2~!b8CV7nAf1)_Kw&6(US!=U|q*q>^nb4plDcx25Sj;c}5$^t(pf<8V-VL z?G#jzYsmy= zSM-Qo2j-)_)i$u!qy{T#e-A%De;;!^JDNErD+QJ5wqt2Q_Jl=QHWp(o#m<35Fts2I zeA(|oXr(U$7N_eN!@q}c@D~g$?{x>()2Cs2%hW;PsYUoZZYOc-#0(;&GDfqgtU}v6 zIshI?xsFB&jhTC8rtn=S9}`VjftSVyq10FI@ba%WhCj({W3B00$T|iebMo|Cm|kfb z>O0@xgu_h>ppAA8L(Tso8GD0dt>~#GOZ`T}x^Jz~{M~F;!gw=AwMGTHc0+(xx@ZrV zvyY?g3;Id_Y%vkTD*^(GrOXr}1ysJ@j^;0KV6M?17}YP|;Hj0%V5edtNAtcpNSz{K zEV<fpfwSlv~ zyP1lR&{4&onScp*9up%b4fq7OL!zbx;Qa+f*@*SAMZZ`gM z7_mwn48w4&y{w1%(jO2D#Z$qf>Ud^X_IIFmQak!4rwcpoXrv+7l#koazKZ<>FEIiI z-l)jL0~?GXKt!{?3%E#0Xoes(aN;hC%^ zy!HdQGb+VI?pKYH)>beVZ8s!b7rCR5lo0&rYOealG1HhX5uW&WtV2xsn?Bg{bviQ0 zuh+J}vI#8dK5pbCm;iYNt=O44hcM~-2XM|voOJw`l*q0dfbVw5@z@cgK$Ps^(qGS@g-oXG&P@4z)pFKyhZ`mN#k)yLBj|=-=Jw)#p ztYOW#p^iOklf#iWcR@oxG3pBv;$LXbU>wkPg^`jp>Jyz?MRK!A3vN2|kFQglZaojD zhfh%fZlq0Z*l(90bR@uwVUXg|nK)|+KqSv;e|iQ+YMBPq@6TAp)HL?k;Z&HJ#pc;Y z_0|`zTB~6sR`5hctJjS%l@tdw6lHDDwGKVELv&!xCjH3?4|*F8tleVhlu7c%2NgME zJaSE?jR)6lHSsO-C=ef7zinJVRa9T&p$+*~3mX-D=ffKdCPZ}?OPdaF+BHdj>s*2J zkLE%Pg?DIOl+Swz20*S74uIg^j~c$%Rd%epUKj9E8kHWb}rX!ertKb ziO5Sup1a&CcAkvxt&&}9t=M%cUS8kMa#uws($Y^)PjAi9pK~;ObmDz2GjSUgJ86K5 zUA;M|*8Arh4cCUJsjqi0&Eup!J>|8Rig~YV*W6>xY2H&Fcq-xXz{_#H$3xq4Eu(+2xLUpV`>NQ7zgS*BE$kHLQ6`mj6`M!{Y77t!T0niA5sOZ0esTZ;>cf zB#sohiA0ei=Sbz9m{OQQpPWS^PkN{Lp`GHC$3)r#?MO81i*Z&~lc|I2Ob?oOPNi2@ zdZAp4L_YK)meQL%d|}T$=Zy7gg!X)K&RJQ+MK|5P!RPX~4|* zzyM|GRJ_@$`n=|RoDbD3ivE-j-Ab`M+vZJ(c~YnoZGA>qR0L z`sf}*_dp)#Hf3?B?NX{I6KN%?$*FYzsT5U1q3*;=ODc1!I!Z1TiBnZ`QYK4VQ92@J zCsjtuB31Xw(xtI($y6L=owPmWpE9LKqtXOCBY8-|lH`aa%P_gzG9)1(KGi5CA|X*0 cA7^A^Ki+1%HQ&bofDeD5j{5cLOZ|NRH{sIBfdBvi literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet deleted file mode 100644 index d0c68b087a629133e6561c3090078ae686a5e13e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2537 zcmc&0YfO_@^xXS>E$v1pB42G;b*Q7FE6@^QNEq`gZ>R`3g|aCYT4}MgtPeo9DQ+{t zg+(-)VKl;`G2&~^u>=7v>X@RNuf5o^M7C^G=hP*bP1&Nm_v^|-<=-+-+I!DE?{n@w zCtp5asN+~JgbOopn1fINiOSfGOTsm~-{61?<>VSd58=d|Sd{=#k^oOJi~)c&0+SUZ zghAbkR2-{}2XTl}AuTgG%%xV7+sa`%mNy}JGg2q2CF5CAYDg%+VIWHg*#I64jU_Y+ z4~C|Y1UMs@oP_%@l_>ukE~eq$V}`?xl9`?5-YSd3Q))JOOfnlmZ06E(r`=oS5EAG> z@*bqViPgujYQ)b7Hjc+frd1!ry%S{M29Qwn9_F?WWo*cUg@3O{?bZ+3#Bscx@*G-^FUthLQxhkuZoA`!kuAG&-FT+LKcD{^vwChmU%uc4Q?>>6vvnx!BZ(7idbv1b3(2!3i!U(YD}(8yyo$J+NJDRRxMf&+ zg$kAylYr-Uw}@=ay$c9Y!T@4}rqtIjn-`^cs^RUjQY*p ztZB2i=Iz;B*;{#J>7izO#=dVI9orqJuHRkV(^{RS3Eyyfm#fd7)N)~uC+9%P2mSkN 
z&fZw%8#q*Fh}m}jYWu3s9bK2z=O(B)>~G7gvF}`3WpmILwYXWk&Fu-Y*e$dN$b%n1 z@=H?rk5YBBO#PdPHtpzK9{H-r%#nWD!2}*|)&NH#31GB!0{94(D35N|{?PH7b;5A# z+AWR>kAL3y+x7HBb@N2ht`A!u$LFIGQ#QtfLhLTLTB=MOOU6MPxX!A3^O4jvq`G|F9ITY%GO%UeEJ<0r^wN<@s7( zU%_Yad~i#^ORNU5ULIMFUS_^O^Qt*Pq3H!p5;@6^@Qu8_)_+ezuvCl4@Iod1NO7+s83X;1+S|Dc3yDK>YaWw^T+S=c|Y%aInTXb_w~KM*ZrK2`+Ho(t_A`^;3Eid z5=aYR6o7t1MnIyG^~8rC6&vGskz)kU);Sbfn--)Ve;z&LH#i0zHP_R@#J^rV`m*)S z^ZY9d*9btMAW+m5@TfnAfT3&*+N>`zgsd-K03fA}Q6&%Ic-mBB&38yyK!}Wh!4nDD ze!dX_OM`u5f(4iYPoW1XJV%;oNR#zX!9=>$u0enSRq&_^fa<3pfrk^-rD<3Y2-pa4 zHj@C7%K`@VNK+4&%QE9p?zG8`RCJ7Q2nbnUW`r#ChmQh9>aU~J_+AvXq5Lr(rx0&-8#5=?E4n1awx62ey_P$n!0rV>u7Dc^O0KW-FM} zwHy{DssekbJlt|FheI+6prTieu9Pby^~Mt5Z?*^Z#w!q2I~5{cDnNtGBG|D^8MX}l z2sYV);8QjZ+7~n6Kvxxd%t26dSqg5;$wPC~SRnRUgMQ6hNOe#L=ZZWKl^DQ5OdIwo z)xhe{70?Sa;ijDcd%WHhGUoC@bkq(~2cDqiadJ>ejEC@4He~6N&~oe;=&rT}myZmn zP1gaL#_sfC@!x4}e>kiqehv0L3 zE9|x_f&;V7;qe{}4JJQ@iD_eCT)83q@_H>CzNH1@BSxWj2Y*G10lg?z8$ncrI@oS= zhUkj_`R+GXKS02 zB{>~Xapof$2TqvDIek>@z(w<#8=z4$8l6q)#Mac|kdf$(b-LNW3*)z#u*?oW&?F7U z$J|j(K?^)c+XK~xZu!qyvKaJ!A7pAW|4IfGzfa_@kVJefbS4-+()5X*H z4L5U0Yg&fAttZj7Ip@L5OoOOZeG4`Gc&t3t8dR(uq2cNnoS!j>G}avhS(`z4w6p?n zGkrW#U;(VSG~%Ss4#vmK82GtmE$nHnB6AOUWBY3iP`y?iKIO!(4RFESNu*I z+=e!xr;cY)>?k%0IBiCrj9!3|+KVwQ^J>!nycd~Oy%6bKS0szuKBLmnZMbgfcD&xv z4z296gIzIc_~^ymc%tkB6drO9Tk&!O_H0=)Q9H*4v7VY?QL-n|=ZD43(mxm<0`T)2G=k6rJE)$~q8A5Mi6qN{hoWo9bO@86AIdQyive1C(WU_fUk za8XmI2ljA_kBrr*3=I1r9OJyd0)fA`qYsZm39N9X=)U+J>7>v18lsO@Rrir3|Oo zMXac|No<#a8B#rU`5bbcG|1ot-d1{(`Q9B!A4?N>UK7de*nR`*r?p{?nH_kgxtgrT z*4>2RyxX{^d=6)+XbGpy)LuYZFH)VIBrDz9#3Q};O0hG~o#Vxm@sLpU9g2xj;HbKc zWra(f!1H2qj~2Z^r*yZMY+{ru^Y>o%XX=fHp}cV-*{hQa(y=c{^#=y_NQCy8|*@6 z)bHKs=E)RHdG{H<@%62+qO(#htVaP8=CtAuVzWrk&3$P87=mT~$&=jPFo1@#qlj>M z6*y?Rkqn(%g$JDjd~o$yIQ`TP?a$4Y{c&_OtF(48?it30PT69#T0R}Os6ZI!xhAP} zp&FapQOXDRvmB#$e1?Cu6ROwwgtjzqVr`yZgvFl?Wjr7iF+YwvekC;)Ztg2V9U*>D z_{l~3_RkZr2gc{nHPKq^;#!f^W<@iSb1;pmmbF->xn~)Y;@6?I+z_~y_ZZ*BQ01ob z717SvAMjNL_t3?d-HeH~u23Z4vGb&BqzZa-fK~QcuGY1PHHEVf^}l|DIZbSa1aq#! 
zsvQE<{o6Ba;QRp0xi5`hYOW$=_P+*IH%qL0u_p07e-7gi-v$M_CXv)NPu35+A zjkC|T$XlB#B{U4~I+M3Tr6_l;F-H z&$EZO=1z-xd?Cl`NX9z3DD#lr_Hhn-tc$Y> z9kUHXZl5du;mF*BmYWo9N_G^x6}c9BwUq2Eb+7bm+hbF@tK9oUbpP#^(%nawo>|LQ zvMtN5^1qO!s-amIv0wV6HaT35I3KP0e(k~lm z0>8g|@#NOk2eVQTDKp;Vr-$P8Y{BG!uAd%RrO9bH7kG3$wn-NmMf7xaJh9t8)+S@T zXJ?PY4xI%R0o|Rwj@kYOf$asJ*M6R}cShXHp6+W;=N=TN%4m39@0(ZPuyb?H6ia`JRy@|IXju(R2wGvF_4=qUANl1 z110kxQJ@AdQ8IB+gPY81FT8`2$t1XCND+A@YRx)J0^OcN}qA;=PP?nR}M>?)bw=}IOdvfHK`rA zQRZEr#5Mi#d3R0dol{#)kH5UvuySyiYj)z7N9VRE&B`!4`KI@Bu4%H&%({2|*Gs)m zXUwes^~K%#m7itI8$P`5xs*F=n|b5M_ff;|u{ZBA&6+970&GNI*nfK!XNQMG&{`uf zh6oFZiDd?c1=81a>Q0^_qp(X~p-5LVTT?TgNsEs^fBpXU;_V}F{wkeV_0m8hA`l}f z)zQUe5&nqHGxPgHL-f&KV9Nd~H>hP26SBVe&~oFiqm}_W%3r3 z3h8btCjZY~C50Me1)e5LVXq+{K%^{`7Q4F6bvGG#vxEl;J=%=KmSVAZ z9`(@XC>AG(ErZ0iVsVhzDo8SBNjc1+TUKJR16@=7FE!PZG-K!pbRhBE?)l8_u3mr(Z%w~eE4Bs}T$RD5bFeKaGUAXJO;jb0HK7(2;7 pDr%B%baZ5bT6|!1Oh{yent}0TgUR|LCuM-IA7~^1T&Ul@{sjY`8d(4U literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet deleted file mode 100644 index d10d236054191cfdbad484e674d3f07a75c5e0ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3413 zcmc&%cUV-%7C&>#?xG+ddzW2Y5Ge~N$U=lz5V=&56%h-?N|Pe&u5@DoH6*A6BhP{j zG}u5PikK)x1cSXqY{Zw?D?VG~8Ka42?p;xN>L2g>-uH4on3*$k%I}=H+}|aH4Rv8C zM#&6NF#-m90!Swo#%8 zr>#`fX$1h?YzRDx5J7GPMKh8!l*|nTMlz{hbd)-FNNWMT&5R=wqQ| zju;y-1mM94PXJi=Vg{-(1`ewwg+LDhsVDRT8lQ3^&`q367qn7Q=ZRE=UfW1;1@G_T z!`BrM1bG7#2dOs|*c-%ZVlqxPTW$guf-9k{M;2H(SECb^a%A050pdl&!MfoDXm<|)^#cji z_m~LFrpaOPAK!sraU4wDZUrYN381uXC%V=RLCx{*aM9QVnw$CnwayomHBTTf&>DiP zHh`?c1=bVxu&!4P%z0G>H(@B84`7IEpAUh;QDTsN8vuFj*HKc2G3=mhASF)^iX3TZ z*|iHC=lDa&a{=rwZ~#St0JaxL!I8^`aOeyTXG)Z4MEM4=NjnBFX8D8l9cMTjT?Ch3 zsv$c)42tv0;8o^9SQ)StN=JCYwbcZAk#igR<(q?5r7JvoJRde+umc-)Pt>vgDKd+_ ziPG&6Xw}x>za$t^tHu&F2|KBR@|W;i)c~~5skpnyDhM|B*-T9}D+QZLwp2)22dWJ` z25keEQWaBLyZ3cINq!aR050cmLJD<^Xjr}n{cd-haMiR6vrnIdmU%y;qZ{raS5qxf zqKSpyYd4a5_MUXbs?$)l|2eV(6*2e+B`OazK%<-Mp+S*~4&=5H^J+;b%$`8Bj`4&0 z?oSBm_5iZ9Ngv#Ig`t{_EpR!1IlL-JCHpw%5FP^u(TPR91nmc_kxODDRgxzVjF~&E zTd}rRw`7Y;^xEc!Aa+7@>GKhU{s4l^VCo>i`WYN?z0vK?-A`#T7{I{Ujqt1@5DX&x zAlbr+csRKh79MRR&yDef{HAHdlR6rm8F>giJZ!1my`MmhIFqQ%^92jvAgDiOPKpa( zAln7Iz|ijnT%A${q=%B!FkT?a$fx#BT_$+GG!1UG%!kznchYOhCJ-exE~w6~mi%PP zpX9{jw@J^&K0@ofOk$_=1>xLD?(}})13Fn5K^}%Q7`!S5a_$bNd*;pr$367|;|mBq zc}fCAZ35-6p`_VNeHin{LUcRm07~zvhhiH&=)I}q2-k*XhqGpM z`H@#>o9PkKaog8qT~Gj;)fNCN((*~uNk5Um@h0=xy1@%G$n8);+&>*S zXz&n^p`OFMSZ^O+KmUN?fg^%cBZEhUj1C<$cHH=|@QBC>6DLicGBqkXCN^$bd_rQ< z^yCz^Mmr-lEj=SMD|_Z=vu4kko0B_l{({f*zW6eK;iAO_UllI-dTG%=mMvegvUt_% zHEY+El&;_KP1(jxo40H&uh_P|vg+F%JF9oq?EdciJ$q~G>Kpbo?r&;7@I%YNLx+zX zJ$C%W$x}a`Zas7M-1!S_KVAIi&%a!{ti8hJqqX|oXJ4I*wBl>mxn!MoAfj-%Z>^Di9jJ{Ro)(#jv|z&}D-cuZ^IR~+~W z#%+8B6vkTp4vZGZ`}b2GtN9h9J9WP`>@(NGGn`P~2}zM}E@ucG#SQOD5PB(zYL4#W z(x~Lbv~*!yavVnAM&FK{`+}$~Pn}s?!G=dBky?h^6iI z{66VnJ6@!sPFs%j-rvPWZ%Gf+_72OBk)Gij3QUeu$EWM4{34QELW+{!idG9ud+&$; zNYW40DJyUTsUR&TAuc6~p)ljy67PIC*veJgO}sGogBv03+H6LkGw#aDoZZ`NUcKC|M;usZ~DaezdX2q+mj?|mrFzn zk)*X3epAZiQhhcwbX3@2-Ge1vPdRhq#(J|XI~xCT62!7utam)?&$98XPdvZpjSWU} zD<75(py6EM0jWdcb*c0@y$%a10T2Wz&B0BZZapO3B1UKkP=b`KHOviiP=}}>BnuzdVO=LJV}Jc9Mk(%lmemh-|kEZ<3;6JMlmpD#Tk#Xke*;p^o5aeVB` zU5(BXq?X#K)EOCZ=}s|Pty5HLswT@aGcGkPQKPnWaqsWaUnx_`0p9#TJp~Yg|DpN| DjS{cLo_o*sJNJA! 
zC*gdA8%J|~+!!s#a?l6BwjgJ6)wJs~Pv+Az>f*J1r#!^yP;%e0z;n$_KNtRC5M zz*%w%7f!~Rb0SV;;|We|uZ)82l>z|iU`%Ei!qgf{)sAd9TIm5IbEQHOr%O+mk*v#3 z=CE8QZ$|PnNR_N&-=s?ELVN@c1DVXp0zl|kE=Ws!5PVkC6i5)z5-|;oDGMg1tdERv z3B$s;2%>VX-|t>4*xnljOZ~^eU(K&uBMs zc5S7W2fqz&9oHd^K8qVFj-khn=dgRmZSkD8RyeV+3!T}12f15i;w>2o@VIs-C2{g) z_OEY)Lyb?6J!rA_=W4Vs*c44@u7?Iy7HV10ju+NaP&Q{OZl5?7?s`7JijM-RtxYoU zJQ|5=cAkLmN-E%aX%=PYT8MqTMlmU?tVOp^RwK8R*JyDRuI!?!CY`TRxh&n2B8N(lVsEF0`9398#j-;Ab9XMzi zIy(yO$?L)cRQBR6DNd-TRY~!~3+VkB({cWzK=k#~D;D<_osukRiD=FxxWA{(AL-l9XcVhCP12S= zr{H+_CUJ=_ZouAm9AWsiyG-BVTsE zK~!tcF>~*a%gndR{g{@31d+8z3F;U-naS7~q?~lCl1fSjbah$mTmF5@FkYR8hjw8I z$O)3YGdxgot+q-^e(oA&+%~Zb*C_q{cbXR5SI`188QVm*1IR)>`#VB28MwcU5RFn= zQV5t;_-$LTkYf8dMKx|)SR^*L4_fE9eNnNA!acdOaeL8+X11CV1z)vzX6DuzT= zCtqydwXD)Lwz)*_>Wb#{ zHVYkBY_Xn~byK4UxKG&T6P>1dbvMy>b}P{b{rc{;(3=OWB&wXW&Jg4g4*l@mfMV8WEtE(IHba5dh?p#uT^22edc2>%DEaE~tTw*>>z1&<8`pvNS8VY{Fa&4zN?nG&S|FznS?lZsKG`dJW?$U@^7e1Qa zvUc3Jp6QqC=TlZ;U0zw&&u#Xv47==|ceksd$ZGr*pAWnVq=?bPXy`t8NSS#3t z^fu1l5n~VU7Fp=&?<8Ry0Bq| z@GKwyoR3u-2HJ4gSYv};E=-cNlysrqCT7#ADcL#V#MDHgXpsulD3xF8FaKPjsfBHljOB!$dB`S(Jiov7#IYdq~hw3kW>KizpW$%U+NPSY^NdZ02FL5uTXJcJtazh z>m9J6cQ|AJrh;qsqWq>NrYGeX`-$5~{sU4Svr}EOB_;P&^=~lxI;_f44^W7*+z|+Tzdr?sIg!>!kfs%eXAwFaOf2+1 zk3&DA-&W9XF#MljK`v9;D7F9N*6L4cZ~iaItNODtNxo{3T&7~>KlZl-P}(SDd_=_f zNN;0eDYntB!tmfVJkL)cUtxsud_J#9;sba-iT6)3oM{NdIAQ0{^TEPO=-!o3hHbW> zK~Thx|6qu#s~6df4GR)z|9oK{D@;T+&ufKA6D^R2dtNQjv*X;u+~Ojq1P&R|y>!In$pRfB zOh)Ji*Ny(nCTE%)U8F}wbW~VIOq2mSu{XhpFJ#io{*Yji@dtl|P--$00;l8%7ksc* z;5RVk_u8KjY=}1rHW|I4=lLB^I|iJBJkd`?g{Lucg2Faam-T*b sV$RU`%*>&>tgMWD+q}fA?39djTQ|>PZo||{tqs7757g5D;pE@luYn)u8UO$Q literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet deleted file mode 100644 index d9326808f106cc0c8551356f9e11397011237c29..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2753 zcmc&$d2mxj82@(P(X_Ns+Pv^m0u*RMfgVY1OTkchDZNAKK`AMek~X0&q{$;mDFx(| znc^S_10$dVRlKOk$l(l>VmT~_Iw+{XIF61pDuM{kz$k~;?h64b@P{*we3R_%x8MEk z?k_2pN=*VQ#0qg%GC>qTq}gBo>h$ktooYUGY0Ds!{ak+mgeXCiAn-yY`SQASh*t%5 zL{(4=077Fi8F_>l-taEc39L2^_(-i7T4Q%Rs_X5(dI4*AO&ij@g$#KH)#E&A6KF01 z1~NpF34riMA<0U@(-cLPfly>-iGUIpn5ec|5bx(Hs)7cC3lRO zI54!c_mYo~aCmB)8l7%`wZrbWM>LQ{gQL3E<7#SjE9HENG+!aZCdP0Y>;6T^M;7^( zO4y^a3eRDs&JFC1Xq>aQ2)w%{<9+qJ+0{G#gx_|hqW2Rw^->Kfgf|Ai!Oo1?260t- zws`vu)RF%YT*z3*wpX9+H6r;m^L+krFrE7z8rhHVp^f{|tzjSI6wl?5`Dac;_p-0h zhp+sEQu@hwtH%MiI$mQ!<1@MT&1YcO;oE2kSn-$-jA%!G1S&b&35N_`bZqHaysU$P zHS?$8vt`-vbNY2G?#yMj9SMi@y%lKRYu)fo>qhu%wU-&3yabywMsp48`}3EN??I-9 zE_Ulmo-bQGE^L$BKg>1oYwqQv*Fc?!xNWx!aCjIhWDKoCF_?*<1@tF6D7i zOIsjee<$DfJVMuhC3nsKIvB@|Wnx;wq3rinNdICtTr50>{86FE(PieoOg5smmFW<= zF&en|Dm*N64_AAlk}K#GVf~&dXng_)!#|JVcAWYPz1{B=lkoNn%)$$~aKKxR4;JPk za?>wxLI;x|v~l+zrE=^dOAm<96&%}#Ow6C3A~b;erG>29I962-2s13lhp0lsdWA>y z?i1NJDq1aQ`ia_@{<;AJ2gME^GBi$a7#2T##K?riq-3Kh<*C%P^idh3$2@Hwn>lX0 zWXYP4os*lFUr=bBSTw1)q_k{u`IL&Ow#sSKt7cTswAa)+oU`ic8)naOHM%_@3$ki% zxR;YzpFdLub<;fJyIGJU=eH~vwa_91EaGH%MvxO069-F(gQd00oDniC-$G>iBr>cZ z3K<u@Y0PY}GstZu;B< zAD7cz=MUiLNu>D{8E&c#`_%++9`^p2n2#z@yE+X--d9`iY_tn3fmuC%0Il^p>(1!$ z1aKh@i#@w6@B!tXcu4i`_g}yQZ#7eER#eV{B044!f@h+SA^u~jeug(ysRI>}D$mGGn-Lbf1ZT3$~10y+drS5gn;Gl%S1{Pv18k8Ns^r*f)NI*C^j zsIOph&wNO;2tAMwZKXwaFn&qC(r?miLc!Kc(geE9b+|0Ko0p^!a`Vo5S5)e*T(vNQ+INlBJk5gkeCWWY!%GO(|dZfneG zB5{;DsXg&eOzF`CG(n7&?cTXfPJd#pEGOE%UQcuEJg3*!;Bm*A(vwU{My*u`aPI@1 Libh}`HL9r20P(g|)7Kl=! 
zQlyCm5Cd4iC>SiL#2Cdq6;R*3Gr2Ozgg&OlOp5Jrz)9m1N)uYmgYUFOYN`5U7^|RV|ul$X>sWNr$?SF zyRp0UY-uka_=pIt zvHg6*{Z|J0MhEdRMV{g#r1%RGijk1@PtF9oRMf!7fNJEZ4nUHH-RCR_1Z)HhHj@Au z85WF_VVTQPTC~VXJKFS%xRqFAJ4!6{hmInZ%U@Ti%|CL%q`9P%pL5Zc@sC&)8x{~A zv(n!;##bho8U*{VTon--8x}6fE*(>RhlKBN;XYjTe}Vb8Dc7s>ThyKSzcD3e_dxbd zAJZmvEvzxy1|{7h=&oH2y7@Zr@D>l=?9zf~&C_Ak3~ zcr^}#^cTV4UH$=Xemnt;z&UW-Qve0O$-#z$BD5unhpiFJfrc}NaP_b)ObOG2SFRt? zjZ%HcP*pm{m4v-Tt?Y_|i=+UpQmX$s-LDniY;Ww39xD(o2j0TyHhfY0eE z(7v1jN4n3Vehz|)HZI&9FAt3klYz*w2E&RWNVeAmm(qL?oHT+wOdE1kDqz#6Qg{sW z;I=Iv>;HBx>~d8A!S}Y1{QMzW6FVNx5K|#6nGG4bBs7iElZ7i?By{B(B8L>6Zb0AHyLb#sjjF3*l4TWjJ743`ZR1 zL;pbxjcgeJ9q}ZXQeq6hy-I>(cc#JA@CoQm-oKEt|6>%RjUX~y6Kr<7z@pJKv_D#m zr=R!;Z%a+lMZGL8YlO*zS9^&8$0n2@+lO}APY0vhk0Fd` z#cDIpqnHV76c&6ROXz5a=EQrbC4UeZPmIKJBm80QuPGQ?dp>zGy8}w=htU*p!sc8u zL?`TJ(89(Vs1-(`rmdY=VigX%5|&~wW)5LWr)}{g4Lqng+6;Z-O!$-@g-!JqN@q938ZgIrrWhW}uLfJqcW}-4 z5$EZ%S)>BUz*igV;N3}kknvgop=x^A%N13S)>4PxS~MTT4Xd%CY7%uhUxAFWC8$z; z2qI-Kpps;3P_uS~nj4dFt!?EXyI=%`>uSEVsf4RZpwJPh9(imbDGw^Xn!vHIw78QxmV@o0rcdbx-ZVLk+#~s}K!yvRA>D zXLjU-t?NMdd<|p#9fXGdg#-8&f#KKlaOHJ8@RHBwjPl#bbLGyY@7Ykff>R|5jv=1t zc-%cqO*n5K~566~j5E$oP*1sTab zgYCU^NzP-BB^jzz09v{U%s4U*&VJgDE#1(-OpzrSDrRC7e|8;e7n{n+4a|V}+3t+B z{Wi!mHxsS=_7LpqQHSVgMUJ}LWR}6+-5hRHHMYk46?)(>i#7HB01D#ha@xvfGMDU0 zlndjY$15J|pm!I;2*Hhe;5IKA7C$?Hw?3>w9lmcNFbGin3>nnW>4`nq;WN%^!Y(ZT zu`$N^{W=7^Z%6O?LkO&BgP?c*bz+a3CdoaZiu;ZFV<^R)75H=p3=TAK^zLC$=XRJW z_VrUMo;wXp9=#ytP4=RErZJ|XvKwcnyac<$TVY1>DTY(bGFIf!X13e&+1y7}1srl4 z560mHe(lt4=IWOP!ifl&&zSY|D6s^RZ)Q!aAvULSmt0F?G3;OQjK%x< zGSOt?&+xaNi1pfPquUF3SB`OZu$g-;!VR{P_3D(eU9Q!T|WA!Mrnf>LsO^hAy zPDwkacs7DHskjJ!sfb{zzLo*4od*c-f;POR$AA%yw~-bOTEyua1a3IAPq`<59y={E z1vTSU%o(n~qD_N$A!nZrK1ux}Fv4Fl6Q^8;l8`9u% z_n_MswqmLW>J?_*+zD$t&uBPknINU?%lO5`MYz}Yr)cpcf;Inx7rCeAIU3E1B*Ns? zAjs-C3UNJ)2VMgFz(akwzCH)@H0tL4FfoetwYM_z3S~p5>&Y>2JLe?o2=)WB z8&1UCCo{0VnU_$PAPH+p5^%RGo0FV8F;gRB`MAbMs}WbB3MI(|Lsx!3zMr8kldPbO zGGo5OHy*u@TA~jybSmA!!sj6)pI5;>`ZybOPt4-ho2!BeX9;@t>NOTN%!T;*GKw4b z@lns)LG06t6t;6O4}Wzf5>0w=2&pf!#CnzsiS-K3j6#J4DA0W~IWn6Z?`z3q`&_!q z(yz`^=>2qs!TxZE9r{QS>x;=C^X8_KBb%C_Zon4h9?FtERk<8@?Kq_X9!(sxHwuiR z4{p3-z8sr%`qTJI_hOclP9r?O@*G+l&l2NuU(wf=dLBQQr^~PB{O86+&_}C8&_9Tj zAl5tRX zeg3voSq)3ElBnR@GSyyP+_3x4K0VOJI3ayjRmM zL))wIXxewv+zRZnEsM5i>Ufp~^)wdkIAGw@C>B^9OV2h5>MZtZI=1uBtjPYW*;dDQ z<;+cZ{nYDBs;m0fTY#KkDt4P}l`5y@ULjtnX}eovL568sMw-KZ{S)>3?7M_6Ip&)% zzQavz_adhp%S?w`GnSMrt4-oN-q!W5487l&>3Bzfh1>e~!=!H38Q)9W1#?q$v&#b7 z4$j`d&^uTWeDkDNb&B4hAHwccts7zJA3neKQA_sRRQ;T)*kA5m*uXT%ty%Zxc~5n! z!I6ubMu*>zFb(tS6A7-ex!5qjF`1__XCup~;8L1`j(3gNsPOVm)w%0OS;j}N?w#s* z&^*n!s3lW(Y1Kxy$+5O<(~!G0X(q?pbIdmkkFrgRf6TYup*(N9>50xFr$g(bIW)R! 
zs|0;c@xmgPO}1vaH7A!0E^ppWqkB~jkDSX4xs$kpyLsLp=t(l%F?8*?(=>X<2Pt~1 z1l^&0485E}x87k=(w?(s!|*b$1ignsuPFKHT;#)+9LIB|_p1|zp5Jdh_kCa8R-CKm zbpFiGO=5YS6>aCwKD@F=(=pfSLfPZiER&GlwhQG0H}b4@s5w_vJnPaaUUX{3^{UFj zJ7-q4<~mpZ@ZzWQ(SyC$tIxgctKY^|cd0r5>fzPzbXK<4To@YovC#2|OKsKfgWcsJ zeeJc?Z(q{z+>536fBOPn_u?3Qg8BQ`f55+Q9gxBYzR9yX?ooHqbfim)UOV#kPv}j3 ze??cTyAb_;sL=7aD}_GX{i08bE^ISVJUW4F{pjB%;I`)1f)YUsoxmFFDA*S^vaAWIr>K(17s37%9i1Sja~t+&WXZ-;81* zfA#+lsDGEYAl(d<7}2YO0>XUx1jWR(us<<6Emfz}@)i`c1T#$O%i{AHQspm660P^2 zW8q_bhLbsMO(;o@h$ej)g)nwhTFR7v4cfm1_*ZVLZ9l<)uLNf!RM z9wo&m#)>>4OR>UO0YIRt#1px@yL!x#K3Ph6gaIvPB1@4-w2=DJ;wTcui!1|0HX>1= z$SP2>W=SbH(<3X9$e!-0`B+cQB*SRh1MNuUx?ab?;45mdz{QT9TgB7u0=l7Gi$qRz zW9datCHo>ndLH6u>|*5Su_V^tW_E~|r+W~+ALd5Iw{8v9Nc>qzoKQhfBK0R3(tBUn zM|*pih5OqEihOB%2|Se!C0=85_iw(T=MHouO&Q<6G` zw@{j&!{RUFp^8PxpW>lQX&LEnyCjC*5ZOD?`Vyu1G5m%0l6q6crev)2B9S#cv<;Lt5&&-0e_;OxMuIRh literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet deleted file mode 100644 index c7bee26c70d25045c9d71709e8ed0d492f391026..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3122 zcmc&%c~nzZ8vot*USd!X1YSrmDzXGrkflPED)oVEvWQqka1DzHk_1T*thKn57L{sc zw4zneS{GEPRqVJxU2)%6>dYLcb{w};tD~p(Xb0!sq_I-!@+{FVLTDDLDfmvCSH$jHZhG!^-w zb5Y+ap|yDu{MYos_m}MPXrT{oxAa0*Z&$K@gAohHyTWH#3yk0S0D&%e(3wN!4|&Mg z{%>4ttAji@92-Wf@%eox%vd3jvLbsfUmcG9TZ3?PWh^}P0eCR1jht!dizmv9@ywx(99} zr|LDtt9cV#=f}aT`4kFw^n>A^5>1`QV#y>87X0!xqN{T-af>HTkCUUe^(%6{BSGWI zPPp2|1qb%`gs^G|f*K#9Jl+e*4Qrs@6pYoJA69j3#LQ<6xPd{q94m6yKX?lZhq*%i zWh~0?d{6R=yI`x}4Sl&Cs{CbW*}fhAGh>kQM2;Pmy`Zg>V@tIT$F4cz$ORcL)C7^C zb!*^lI)SG%V&HWv5EnD6aIMXNQga%r%h%%Bw8K~yyAicRBXE5MN1m45M4yUo@T?EP z{ReZf{)#WW4bJ4z>Ys>v_6=h8BPcX@A!boBM*K35d}^xTE9=_uxS=2UCZM{L!ZQKu zd#)EIy4S*O+_!wn+DD|d|6<{0!a-8%c#V7<-wVN)Z$K}c;F_22BxYwjqR;yqS9A`$|AmJU>SFJQkjrf)m47y@HP^hw^yhs zm&;{-<2c`lZL+K*8M2iJe#DNX6r#8}khAZ{@x|gE#CrXVVq(`HmdjR+2_}!Ryd@p}#L=sqL? 
zO=r6CK6AIjG5RUKo6rD$U=VK<2SZU@A?%vCME+#43IA-F12?^q`)utPuBI`V?D5^r z*ByIC#-6;%N9^s{!K-{4x7qC&|Ngizng8Zb_=2Ex{wPccU!H}s+i^1IS<~Uavq|3N z3Ly`F;sM<{1id|ocb{(0K;#}T-skfOgbpl@JM zaL60|Lc`wdKOp?AfrBF69xO#h4T+A4jf)?ekeHM_EM<7=h<8SgN*kS?F=p(z@e?NM zGPAOCCgo1f%b!x9Hy8`27MaY&(@ILGzuRfXdo$lJn>BmR+z-k>{HS8y`~{UCFI@D= z;;KI_(JuXTS?}uQD?VGfs-|}Jn$OpMv2Oi_jdhzgZ>ewia_d*ywm0th`p-Le?cURr z()`Wdeftj_Jk)ad$kAiRPn{-cyR@MS2y^QiU z36;Vo!%+U^Q>y$T%HQewI+S;Oj`;;ShFr6?VOeqhs=IcDJH`2zt-r?Ss}fXKhC(Gb zWlhe}>qLR}H6QMU_UNNY+~P8rHerNvs`gbDR`mj{1g+NB3QlunF!|9h%f+Uqlk-1WgB;`s`xt6m?Dv-ohQIZtTX1ce{bZ1$b zm<47?8uo6Vz`*`=5uF^z_E9BljAjEhElG)Nh)ic&3ttLi`ze_r$-$Xvql&X*224pG zojRHE^(i#I>G^c9*o(3lp+Qix+OsSfubsXrBQ4aB9h)oZn7+k5O@~stZJ#=FB-;;V zgEe(a=+=0q(6dOhE-loUK04W$G1`(k^*7v_-_Up3$3u%n>4kV$DI*KBV@H`8LyAvi z@)n^IoB!eQmU`1-Q?iv_l7_HlYyqo5VMfe={LGM2T9bZEh!(ko)t`oFy%c&DY0;#S z)9ffeU5YHR-jgL*rbT5WOC_yJmL#p`mekYrF~!u6B~PYL?Ne2DHC9QeJPLJ1Q;T!V q0a=BG0lK0hV~NMKoFY@6(clpr);G9skUCKVJpZ6c08;3`h5rB{LO@#p diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..151008dc482639c3df72873480ca796f9ef31d42 GIT binary patch literal 3553 zcmc&%c~leE8o&3>OlA^-1kJ<*QK*)P>`GBVOPQd68WnJX@Ser62*@e~sE9eBfN>W$ zK-7l_O5LgyZ3Ul2#HB7M+M-xXU1(LbuUhM~ty+6$Qmj1nk9W>Huiqhe`+nbk&wQ6) zE_5hMvVQCsEz7W=Hvsmzt*PGE9rjKoHH$vr_uF{cbxzIf$k#s3By=9 zt8~G587pE%w!?uV(~VU!-6#Nn ziYBNy1=aCt=1r%(ID%JCG4qcIkPtx|`62>G%b3*Y%;eY^pL{%#((g->Rqos^pO6?1 zw`W~N1JncQ%?Xb1X^V>BJafq%sWVXa<3M=vNr!di;2@%?b*A_u_`T)Ye#hvRr=Q9; zJii959n0WQd9g?mc9%)+V8pp?4a|-yZ^1*SJ&@1-6|!`~qz(yxac;rGjy0&y?FaO>v z5L%OrB#+L5*k4-U&o>fDR6S4C;d_=`6+DQtD6u7?p2i?p6w1Wib^>>A9g(Bu04#ii0-h0jI*9BCX-?Oxi=EIJnE(qO-nI zLM@bnUIa;ezO#*f6b7i78Xs`4E|qRw^(AN+@j3cuaVxRc*WSuuc?mfzeo)<+p zOh6BlC}8o?Bg%9!o-k=6=u`PMK>FTsGGox>fxrXGE%)>THy}#+fzBUz9PCa^M|)R}g7xcVmZMwuQ`Xbmz_AP2C^aM!P8^ph zQGfL{=p2wWIBiNDGA!Cn&suLsjhz*U^jUgxPXQ}EH|Sk@Ozb%FOK~-w_Def#JD7)T zOB&_FzgQ0DT-a+r-oXIc)CHfGrjf~g27q|~?J#lj zJ|eCe5G6O=z}cKlD17K;>4CoK%zF_wP?sbI7py*n^ZP6#{PsX3xu>T3ovcOkRd;%u zT)z{y*2bdfS}klC@(eC*TFfk&T7$BVCW^kHY*3VB5OKE90NOU~fUOBp!0Viryz-qB z`et}Dyr^1$PAyPbEU_6)Nh(aV{puOLnmT7e3wb?UV3`0eZn;6M7ui`B%5C5}!x|#D z>T7r^qeSGqKNR@QyeZlut+S~5whXv@IKrZFv@P(IOow+K{D_i&wE)?^mWtf9ENp*# z7d?N!NIbDaN<27`2K!$thjvpmsC|Z-%#lwNZIzFOaiK=)i8p1f*U;jb%~u$=hEjRQ z^AjTRvrFQnP6hhLu$rnEwTOB$|0wwERv_G5UTU>-{|sVs>rOccJ1X(|NiM2>7A&n{ zWnyO6bL;(~HH_A|33Q$40;ikyk-auQ;J<>k44X&0;NPF0ZIUmolH@NwrZUoAjwj-` z^AP${;Qms)$)ic}9Do?5UqxO%!Sr?vZ>-2)NXrHWmHKU2P-vkX7Js>M%fdxg_8OCt zt1MXD$1$`f{77Zt(!L&1Cri1lMa%n*Ouv2k$ks(G?fe!nDovHCcu>%4M_p6Z;x&VV zD}%~3)l1enhu6lpH&rh!ai7^_Qu%LNR^}Ohp+ zYbbIpKahO2e(n>I+s04lbe<|3wa9H#edfI@2j|i5n-9%}M)?@`;>w(5Zbm@HxLX@# zT>}p1R*8+$!_%HbP(~LQW8LgW42h(GXA&L|QVVf8g&BrrYM}HH{Zz1yVBI7{2kodWu0k8t=)0 zZ)&_&FWPWYY)YKL{G2ei-UDx`Z`kA3`>Ogk7=2x=s%2h4DaweBk4@IIBz|=`pqCym z4z}7|ef)4s23|%IJDQg!l|otg2M)p(y36BWgZ10<`US)P z87xpRDqEHIfBdpi|25B?|I6i7|5Z0hvD!|-s2N39fBc12*(#-6Xz1iHAM?dhR*>#I zd2t$!<3jL{rwJUF&1vGe0FH~}{Nsc(4F*i)SN*buoZa0= z;>FlWLHyo7n;*yW16Iv(T7J;z_^rU_JotTL)UZiIqr#?V#sqjL>cT_gd46&*_HXWN zJP7*y1tr)COlEz;lIQi1&xi=~N{I=K=8qN!Ps(oi>f%kLL$-=z{BJe-iPT z#-I=LO4Wr=N{t8?LdW((%<+w%+S4DLEKD!?!>3Y{785w#z%#gUS{^Tevb*hvj1%&W zlZ~l6^&B^bUj`=eIY^5L@SYtt>;s&WcX$v^ayFlT?4r3S_)H^6V$X4On6KkB+2%S4 z&hkfF>8aWF hS+VIE38^XeLx;N!b@NbZZ2@@k2M&<{1ml11eg_+(_4xn* literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet differ
diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet
deleted file mode 100644
Binary files a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet and /dev/null differ
diff --git a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py
--- a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py
+++ b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py
@@ -35,7 +35,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
         config = {
             "cluster_num_bands": 14,
             "cluster_num_segments": 2,
-            "cluster_jaccard_similarity_threshold": 0.0,
+            "cluster_jaccard_similarity_threshold": 0.7,
+            sort_output_cli_param: True,
         }
         launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration())
         fixtures = [
diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py
index fca5485b4..8c4debed9 100644
--- a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py
+++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py
@@ -35,8 +35,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
             os.path.join(
                 os.path.dirname(__file__),
                 "..",
-                "output",
-                "docs_to_remove_consolidated",
+                "test-data",
+                "expected/get_list_transform/docs_to_remove_consolidated",
                 "docs_to_remove_consolidated.parquet",
             )
         )
@@ -45,5 +45,5 @@
             duplicate_list_location_cli_param: duplicate_location,
         }
         launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration())
"/input/data_1", basedir + "/expected/data_cleaning/cleaned")] + fixtures = [(launcher, config, basedir + "/input", basedir + "/expected/data_cleaning/cleaned")] return fixtures diff --git a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py index 07710b74d..9ad8a32d7 100644 --- a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -28,56 +28,13 @@ class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest): The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. """ - # # create parameters - # input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) - # output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) - # local_conf = {"input_folder": input_folder, "output_folder": output_folder} - # code_location = {"github": "github", "commit_hash": "12345", "path": "path"} - # params = { - # # Data access. 
Only required parameters are specified - # "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # # execution info - # "runtime_pipeline_id": "pipeline_id", - # "runtime_job_id": "job_id", - # "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # "minhash_num_permutations": 112, - # "minhash_num_bands": 14, - # "minhash_num_segments": 2, - # } - print("====") - def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) config = { "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, - # # When running in ray, our Runtime's get_transform_config() method will load the domains using - # # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration. - # # columns used - # "fdedup_doc_column": "contents", - # "fdedup_id_column": "int_id_column", - # "fdedup_cluster_column": "cluster", - # # infrastructure - # "fdedup_bucket_cpu": 0.5, - # "fdedup_doc_cpu": 0.5, - # "fdedup_mhash_cpu": 0.5, - # "fdedup_num_doc_actors": 1, - # "fdedup_num_bucket_actors": 1, - # "fdedup_num_minhash_actors": 1, - # "fdedup_num_preprocessors": 1, - # # fuzzy parameters - # "fdedup_num_permutations": 64, - # "fdedup_threshold": 0.8, - # "fdedup_shingles_size": 5, - # "fdedup_delimiters": " ", - # # Random delay between reads - # "fdedup_random_delay_limit": 5, - # # snapshotting - # "fdedup_snapshot_delay": 1, - # "fdedup_use_doc_snapshot": False, - # "fdedup_use_bucket_snapshot": False, } launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration()) - fixtures = [(launcher, config, basedir + "/input/data_1/", basedir + "/expected/signature_calc/")] + fixtures = [(launcher, config, basedir + "/input/", basedir + "/expected/signature_calc/")] return fixtures From d07a23a47d3faf0e5bce744cd375b1c5ec1d5966 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:24:41 -0400 Subject: [PATCH 040/105] Update versions in pyproject.toml Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/pyproject.toml | 4 ++-- transforms/universal/fdedup/spark/pyproject.toml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index f2b9d8268..fa815441c 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.3.0.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev0", + "data-prep-toolkit==0.2.2.dev1", "pyarrow==16.1.0", "pyyaml>=6.0.2", "boto3>=1.34.69", diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index dcf1f48e2..548f350c0 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.3.0.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - 
"dpk_fdedup_transform_python==0.3.0.dev0", - "data-prep-toolkit-spark==0.2.2.dev0", + "dpk_fdedup_transform_python==0.2.2.dev1", + "data-prep-toolkit-spark==0.2.2.dev1", ] [project.optional-dependencies] From ec2168c2d8f9b1bf9575689b05b08650bf91510d Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:27:39 -0400 Subject: [PATCH 041/105] Updated ray test data Signed-off-by: Constantin M Adam --- .../docs_to_remove/band_0_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_0_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_10_segment_0.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_10_segment_1.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_1.parquet | Bin 0 -> 1532 bytes .../docs_to_remove/band_13_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_13_segment_1.parquet | Bin 0 -> 1526 bytes .../docs_to_remove/band_1_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_1_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_3_segment_0.parquet | Bin 0 -> 1510 bytes .../docs_to_remove/band_3_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_7_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_7_segment_1.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_8_segment_0.parquet | Bin 0 -> 1530 bytes .../docs_to_remove/band_8_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/metadata.json | 58 ++++++++++++ .../data_cleaning/annotated/df1.parquet | Bin 0 -> 6923 bytes .../data_cleaning/annotated/metadata.json | 56 ++++++++++++ .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes .../data_cleaning/cleaned/metadata.json | 59 ++++++++++++ .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../expected/get_list_transform/metadata.json | 48 ++++++++++ .../ray/test-data/expected/metadata.json | 84 +++++------------- .../ray/test-data/expected/sample1.parquet | Bin 36941 -> 0 bytes .../bands/band=0/segment=0/df1.parquet | Bin 0 -> 3984 bytes .../bands/band=0/segment=1/df1.parquet | Bin 0 -> 4763 bytes .../bands/band=1/segment=0/df1.parquet | Bin 0 -> 3695 bytes .../bands/band=1/segment=1/df1.parquet | Bin 0 -> 3684 bytes .../bands/band=10/segment=0/df1.parquet | Bin 0 -> 3305 bytes .../bands/band=10/segment=1/df1.parquet | Bin 0 -> 4466 bytes .../bands/band=11/segment=0/df1.parquet | Bin 0 -> 4906 bytes .../bands/band=11/segment=1/df1.parquet | Bin 0 -> 3317 bytes .../bands/band=12/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=12/segment=1/df1.parquet | Bin 0 
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json
 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/sample1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet
 create mode 100644 transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json
 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0
 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0
 delete mode 100644 transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0

diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ
diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet
new file mode 100644
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ
zv3kCEctK4L8t%(ud$ei|2CEI%btcKN?G6u}e$u$JW~`;PtptGgzz*JJCA{_j0OaiY Ae*gdg literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3d1f158e9e79bac193f88f94d2b548b79827778b GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJS(ac$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1h@mC2oUSy{w9@^A|VnfO^GT~Plb&+S=t|TTP*^}6v&b!gYiR3iMYNCiKMQrw$A*) zu-5b(v8KxE7fJml8ApclKST6QA^<`LQb{rZ3kW4cKD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t uf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEVGN)|KlR_NHuSUr literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJRuAc$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A2STKw42<95IbXmGq9LDE<4uke-BpVv@1u5v%nlfGtK{}wpq0xsL^P|%? 
diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json
new file mode 100644
index 000000000..047921334
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json
@@ -0,0 +1,56 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdclean",
+        "job type": "spark",
+        "job id": "job_id",
+        "start_time": "2024-10-14 10:43:38",
+        "end_time": "2024-10-14 10:43:55",
+        "status": "success"
+    },
+    "code": null,
+    "job_input_params": {
+        "document_id_column": "int_id_column",
+        "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "operation_mode": "annotate",
+        "RDD parallelization": -1,
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"]
+    },
+    "execution_stats": {
+        "num partitions": 20,
+        "execution time, min": 0.284,
+        "cpus": 20,
+        "gpus": 0,
+        "memory": 0.36,
+        "object_store": 0
+    },
+    "job_output_stats": {
+        "source_size": 4111,
+        "output_bytes": 8856,
+        "processing_time": 0.46729254722595215,
+        "input_bytes": 8753,
+        "result_size": 6923,
+        "input_files": 1,
+        "source_files": 1,
+        "input_docs": 12,
+        "output_docs": 12,
+        "filtered_docs": 0,
+        "output_files": 1,
+        "result_files": 1,
+        "source_doc_count": 12,
+        "filtered_bytes": -103,
+        "result_doc_count": 12
+    },
+    "source": {
+        "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input",
+        "type": "path"
+    },
+    "target": {
+        "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated",
+        "type": "path"
+    }
+}
diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet
new file mode 100644
GIT binary patch
literal 14933
(binary data omitted)
diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet
new file mode 100644
GIT binary patch
literal 3068
(binary data omitted)
diff --git a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json
new file mode 100644
index 000000000..d4cd3e362
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json
@@ -0,0 +1,48 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "fdlist",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-10-18 10:49:10",
+        "end_time": "2024-10-18 10:49:10",
+        "status": "success"
+    },
+    "code": null,
+    "job_input_params": {
+        "docs_to_remove": "docs_to_remove",
+        "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"],
+        "num_processors": 0
+    },
+    "execution_stats": {
+        "cpus": 101.1,
+        "gpus": 0,
+        "memory": 24.02,
+        "object_store": 0,
+        "execution time, min": 0.0
+    },
+    "job_output_stats": {
+        "result_files": 1,
+        "result_size": 663,
+        "processing_time": 0.007,
+        "input_files": 28,
+        "input_bytes": 38040,
+        "input_rows": 44,
+        "consolidated_files": 1,
+        "consolidated_bytes": 64,
+        "consolidated_rows": 8
+    },
+    "source": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis",
+        "type": "path"
+    },
+    "target": {
+        "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2",
+        "type": "path"
+    }
+}
diff --git a/transforms/universal/fdedup/ray/test-data/expected/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/metadata.json
index 4a1b54395..a0b26f931 100644
--- a/transforms/universal/fdedup/ray/test-data/expected/metadata.json
+++ b/transforms/universal/fdedup/ray/test-data/expected/metadata.json
@@ -2,86 +2,48 @@
     "pipeline": "pipeline_id",
     "job details": {
         "job category": "preprocessing",
-        "job name": "fdedup",
-        "job type": "ray",
+        "job name": "fdlist",
+        "job type": "pure python",
         "job id": "job_id",
-        "start_time": "2024-06-24 19:39:44",
-        "end_time": "2024-06-24 19:39:57",
+        "start_time": "2024-10-18 11:36:37",
+        "end_time": "2024-10-18 11:36:37",
         "status": "success"
     },
-    "code": {
-        "github": "github",
-        "commit_hash": "12345",
-        "path": "path"
-    },
+    "code": null,
     "job_input_params": {
-        "doc_column": "contents",
-        "id_column": "int_id_column",
-        "cluster_column": "cluster",
-        "bucket_cpu": 0.5,
-        "mhash_cpu": 0.5,
-        "doc_cpu": 0.5,
-        "num_doc_actors": 1,
-        "num_minhash_actors": 1,
-        "num_bucket_actors": 1,
-        "num_preprocessors": 2,
-        "num_permutations": 64,
-        "threshold": 0.8,
-        "shingles_size": 5,
-        "delimiters": " ",
-        "snapshot_delay": 1,
-        "use_bucket_snapshot": false,
-        "use_doc_snapshot": false,
-        "random_delay_limit": 5,
-        "worker_options": {
-            "num_cpus": 0.8,
-            "max_restarts": -1
-        },
+        "docs_to_remove": "docs_to_remove",
+        "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet",
+        "sort_output": false,
         "checkpointing": false,
         "max_files": -1,
         "random_samples": -1,
         "files_to_use": [".parquet"],
-        "number of workers": 1,
-        "worker options": {
-            "num_cpus": 0.8,
-            "max_restarts": -1
-        },
-        "actor creation delay": 0
+        "num_processors": 0
     },
     "execution_stats": {
-        "cpus": 16,
+        "cpus": 4.5,
         "gpus": 0,
-        "memory": 14.396823502145708,
-        "object_store": 2.0,
-        "execution time, min": 0.22008283535639445
+        "memory": 15.91,
+        "object_store": 0,
+        "execution time, min": 0.0
     },
     "job_output_stats": {
-        "number of buckets": 15,
-        "number of docs": 3,
-        "number of removed docs": 2,
-        "number of min hashes": 5,
-        "overall hash memory GB": 7.152557373046875e-6,
-        "de duplication %": 40.0,
-        "source_files": 2,
-        "source_size": 73126,
-        "generated buckets": 15,
-        "generated minhashes": 5,
-        "source_doc_count": 10,
-        "generated doc_ids": 3,
-        "bucket processing time": 0.04204988479614258,
         "result_files": 1,
-        "result_size": 36941,
-        "processing_time": 2.286285161972046,
-        "source_documents": 5,
-        "result_documents": 3,
-        "result_doc_count": 3
+        "result_size": 663,
+        "processing_time": 0.024,
+        "input_files": 28,
+        "input_bytes": 38040,
+        "input_rows": 44,
+        "consolidated_files": 1,
+        "consolidated_bytes": 64,
+        "consolidated_rows": 8
     },
     "source": {
-        "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/test-data/input",
+        "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis",
         "type": "path"
     },
     "target": {
-        "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/output",
+        "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected",
         "type": "path"
     }
 }
diff --git a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet b/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001
literal 36941
(binary data omitted)
z=&DHXj%1!MXN6qpL^>(X^?$VY-r7N)?j*n8(f=MnUsW|8!wnDNK0}+R?^k=`KxEN8 zGCZ8fl!d`UIwSOT1_u|ptWQOse%$AHvvQFs6aEG$FYGqIzqBvcn3zl zw5KKjT3GVRW2SsZn*uBEEg{wIP-I$azdP8f+NE6920JFj)`!&P{^DhT!IGB$n6WbnFveNBo2>YMV!Vx2UB3!ju2q3g+<*1J??SFKSmE@?JHf& zb3%@O5YvKa>Nt#V|J$QOH}_ze|2R8C3=)37J51nRD`4L&F(T~syIEp_nGS~{gf~pY z*-+pJ0n*w2@|E!D9rFi5tcP&T5Rq|~2+xEg>_j)?p(vK{`llG5*F%g3LK93x--pqC zeMFE6F%xWr4e9$ab6>xP@!`fjA$Bs#h7FftmOi2fIz`!07pAeY-52fm2O>nw8yWZd zaeWsI7CJ!a{^d%_#u8CKz=Lz3SWXsdkFo9u>p4P%1McJD zSdefte*Au)3z!k`ct^cJ^k^tBLHGmyfqDE>?OGx>&iaY4FA&468gE!nU_;%fHK5#A zVciF>z6xs(X(9nV+kI8{sllHZ;`_~B!X1nPNh{XZ0gKbb5(mqX{v(*hzaPUs-o0P@ z%54wp4`NpTK}_%Ki7{a!N5?P|kRJAXVt!&G8jcL?&-mDI7?{ua0q1apfobvqFMNQw zKNKZ2v>mtOJ$ifBScqkq@%EAa@OU)h3B>$;{~1plxW9{{DR$ytJurPCmSEt&KR~$M zt}wxLYy5!&j}mMoOnBH)uYcg1-1{^h&*d5R5e@6nhcQ|oHi2kBYa-~2h6!4?9luNq z{L($Rhn;q_tOwuIHSUfa>DKO@?uiCHOr&p`BliR+BL3(E^S?AvcHmJb(Z%?Dfs-Q# zJw!MX2r*;q5kf!26h^@2TGFrJx1~eC&m|wGfAkm>Sx;K9xb9;;OlXV^9a`4L#+mkK z2mZ0k#kh~(Lhqr72^TSX68oMPXcft>%O|bq5%NpA6Ir-M!$M)v+)@iRl7>yAAiGUvvpuL2P z9rOA@R`vaa#~Tj%m>K=W8jF{?yZaLa;|qhnMExH9#h5Lp#k(kzm8P^y3z*jLb{#4n z^!mLKFQ`?Vi41h@0YZX4c3@n$53@b`nSPK=0vPMM??<}5n7warmp|ZP5A=1v4|9AT z?n?d}DtyomyaXwv@<_ZflQWkaDrhzPjd(GYW-4r6;VWwa^U8-HT=s0DlWr}&?~{MS8z zf;%*rJ^ALdlp&6>_`a?%DA*G}?Fq4g&=_~y#DeZKfBKR($hx0-3=8SM{nY2M2!4ac z7u6re%>Jjb(Nh@aF`Uv^*pHmTwC;c(U$8-u?eK1`CQS7GMGsgdn1e6>^$S?!SQT zfdE)l|IE%eb;b~g5gQ8YufV7Sk2}Ac{t}40?Vm7>nbuu$;3W;?zve~PC9rAHUt)y* zZZFKf9|OuJ9^8FKd&D!&gvZ@5m^&2Ee-g9mywQ}-d*`vPBZMpZ)Zu|^fn{Mf(*CPG zp@5HYc?Yf;a7Dj7ZZ~;hP2o$c1%Uv)WL!-6q(I-V*?m}xvDB*?J9QI{>2Eh!j5JG( zGVXBqi&)@^+jsv$*A6LY8k`@lEp(QTN6c>xQ&Y|w;e(p?}6Ucb<#@w-prH(UgD_oVQX zkFd9nq;Rdn;&p{_d}&}U>J|P3BMyBMb6Dr`u;D-L3By9-9(UahAgo3I5Ozfr*EsA? zY{w7(jpHI-`}wyp$5RKoFJ`yj_jq^zfjQ6^cOW_7E z*ckq~BVaK-URZ&^-p#OI*MvOy|L7Wx`h0p5W}}z3{ms?heS5$^$d$AD!_mh^{|xK< z((W5|&$bpWONPJFuAkT8!G+q0=5dd>DV@?>f2P z051DFJKNunKd7b5rbBmXy#C(8uATSmm>=%R38i#$sCT~>j8S;!(H&&8bLVe&?TdDT z38kKTh_z_Fks0nUj3Dx{K(NsQ@$uEPSK0>g$VtG+$?=`LsM}nh7QO9c1McXW<9U|DB59-?w**=c# z=VrNdZiom8nVs8q8;Nu-m0!&2zM~BbLibcxpdc)^@7Z4@pp?kw7X|t-)&uC2OQTm| z*w9E8KZuA!zE&g@RPo_|+9nZCS?U}S(YZoSn6yrzr9oP!hZ$xR(K|{JY59FD8K*%kuR!(QjML+!>zDWnrz@GA&rW- zDwfswTGvu!aP#0pgeYmq_ZB5h!Fl7)@pJUuCAa>3dhcjQP(=Pda_$u#Jjf>OFhnO_ z>Fi6hi5&X6O$K=O()K{wYz{T<#cqw%_F`H-57Uw>6~*n?H=@0#%;f#Y$TtYI9lIcj zpx75AaT6s`bz9wK=#j&79eYshNl*)@wFx;R{D*2~fa(HaPP=T#hvE;gHrLnK2t${J>Vy;go zY82@pz;vP(ExEg`_ue-0D;)X2hY(S3Z+KOb3B!JweAZRkR`N;REbY;s#|ub{cjo zVy+!fxZNnmE-{S!{%pUYvl6=2B2$WX86pyq-Z**pEcyFXzpVpP1#?fE;Q;_7MT|d& zzjnuMFti*CgGlkvY`Qd8PE^ByTzW8@PYRjhpco@YKL$DensU0BMl0zsq90;~%ehRR zOM)#Qt^{`F-@qcNZm6&d(X<62cR9njW?0(icTM>GUNdamW}=fbZu6+e8W^WlCg?W& zJ%pd}J%vq!fZkN738+D4JcQYk!igneu?X`eO3B}D@BP|#@{R@a%p4Nc_Y_6%G5)IR zP|7ebYmoWOFw8-bUlhPm8BAu01Xr9R26^I=!fI&_eENzyDAnK+6&W8s$03a8+Mz*G zUREd8vF`pcyY>>iqLN=7P(3Bcq_paN)xPEm5&%SY8jc1+dTvAv$Q8m1Gwe{Z9M&5KS zdH;tIyVf7~Y+zMcF=DtvX2sqDTFD^*YaTQ?25ARjvW}Qc@b6Obr;2#>wQOu`UDZJT z<0m2CFK0|E^rJ|=C@%kj_ukY-p1vL@zxrPg16q%|9za#X8*}CEqkvcv64;3ZpR&W$ z^K`;YLnhjk608&rx$cCGvYD-%)nYcAD9Byg1kO&|tQ1O# zq$oRYNkIZVMW>ue7$TK|Bs?upDVx&-xo_TXNu_wR!(uYiyw$-|f`z9X4jYB=u$u&d zvzYB%GR0Y)iKJknIl#lhrR-+TVWzE?gvny#k~SyG-8-Bnn21!0GC|5-NH{nr%vIfK z3}D^SjnZa3x>3r2usXl(%eN?|zO61t{;QjC@}ZC7XwKgYh^tD_g`B6jz2D57=ZkkL zBw|~6F(X<5DdlD~?@rvjH-6^TYZ_w}E^$yXp+ z*~;ra2S9gxQUPd3C4_;3*;QcQy9*~DxvMp>rakA{n#z*Uwzg)3|Dfv1EzYfwQuVgf|OCb zyWa>)7&S;aLjBMN@4t+;p?dM3*h0lV8l|-i^{Q`cL^TR1wNV=F+4G-KP}z~>VOJxu z_g8JbPqvW{{W(sa`m{*qI{~K3k{5OvzR)G}lUK9*Sh|uQnDAkF<`XDC@PYG?AJ9Ueo0bv; z%-@OmfuL$Yd)@p1N>`L2NIfS&*3noAT62GOS!2vvMbj_DXfMQQ&ug~eXOAe>fgMQ! 
zHsR?43TugL?TdZA5BHMS{}oPtbj_$p(_cR5e5gOlnl(yCCMej$&s8)97$bD*PEbh#k;E3ZiI#4e>p-`M|A(fJj zfL8se*V}r(*+%~5 zqe|L2Y@tqn8!(giYFaqbiB6bj!o=Rgm!lqKyFoi#RPBDeu5K-kpP=sVdRPU?b z#mSexQx#R%v8jx_9#GZ!yvT5|EUMimlx-|SL^TCxWs)iqs;gESYhN3Px&HeKVmcOu zS$Q452k(6VCr|$qPQLg(k&4<7%F$G@>@{2@W4Yttc7OJk*n#^^FM>_$j@5q zk($9m_jy!^M|K8PiFO8kX<)!-f1zXm%KR4vX=Rs8*4JLe$(vsl8PGhGDie%`etBNH zj-cD7@`W@N?6h*7vHGn6zgPaZ0>2K$QH;Cs-g|NK_rJu+Z~Rh3uR4NaEL8;8U6J2r zl1*A1>l)WDj&)THG~N0e1x;PZXFS2fsf-M)is(|UlxF+!ufe7IwMbdTuRceZ=R&)4 zkhZ?<1fxo0nRb<`2HJ1@%X`N@9+A^sf>myMf8YyqbQWr{1|Z^4ytWD(IvBmGh#Om1P+zE$}kfX$^aKpkN> z|6W1Zc0N;vBg%_XFh#r${`wm@`S2Tvq~@_&Ku{y8@6AJ2%-B)n=Y%Y02*M@t18wzJ z(4`{96>iMX1#OfUeGgzhNIFR6)%%~yLg_A_vZQ*Vw60J-nx`zRoo-W>Di4jx<(+_5 z^$?eYjr!3YxdWo3@yLd=jrN8 zvu8wZ_{soz{*UOB=kFN(J3kpuUN}0v$p`sWt{;$p2&az8ck!4n`L3L?CI66*canbu zClSdF94;g`cwUg)gcE<{R-dOMFC5%}$l!%*$8Y1ML0jj0y=!M8$am+s0lAe20LYEx z^vMmR>B)C3M^0{jLYv&w95ne()5_#K&lHn?BuPqg6SIrtJ5Lyro1Eh#H#;XszFP?~ z@?FeEkz1G_BDW-E(|hI&-rGTHdw+W$z6XP?p|Jz*&m_5`@w*2;yidAIDdoS4`$mWl zes~|eS~3aW_AR1UO71&C>_ay#p&$ph?%@gE?y^{dN0azHUx2Sx@vDHoQrBGt57Kz> z&QrLkDO857A0?F*GD-0cXKiE&trO_N4HWJx=hEohNi*ZNLn&hBiwxUe#g+T8L@a z9RchrVCWXP45d#Hd_)HJ|M!K*k_ zZG)m>J?kJJx~O|`L^B30n>C=bK}Ax4K`$p*7Z16?=3-F0>fa2!zV3X$MG5;>aOJv9 zhO1(qC7ZpuaNwvxGhvm&&4pCCuGhRRI` zE#BHyqqbVn0CJ-`D&C-N1}NREMdh0euXa;Q1!#Tc=0dK!>e39msd|&4S8tN2fUc|C zT)6d@JDOoP*KaO-`D9NsWMxeQOogJpBMq+H=4ghgsMutXbvH1Y0XJ1QfZw1NO9!Ev z!Pd$*7oKIjz?WQBlzVa)V5j4z#NP6we5v@=b=PIBc#0 zwW0Prz?MCILB;xmY)@EtFv<1y=ef({iDSza7(YAQ75 zi_E4|4Y>2G(0;T8{FCuw(o-?3h=1Gzw7M-T62BVpZ_zI20v-b&DJ8C%a?&$ns_>mk zOuMW~T48=n{`tu?;%Oo_WtxeF=8|J~IDKxUAGP_d!It2v@NOYN`ze@tSaWr#7&2a6`9X+v8iHw)LEPNT&R#7 zTanZMKSUab%N6 zodBPj<6@Rp_9Khdg=sppl!%QM5^2%SZWaGP_E$0$y+HQASr=eENr23W>kL}Y)bt@6 zFDB^Gnr&^?hiqcZIajmIZ%aoP66RzX^#k1$^?%wN&+s{aAwjp~ z#37$(Yw>Hj{%-Y}o18vYY$a#ax-F9{SktD#Ps=T|nlsRA3fVeq%lSG#wguo?O~`ku z_!%v1Uv*8E{NYwIb-r*kqqY8+#G59+UfdT@;4iESlJCA%YsSJ1jr``8dRSM#b@p~@ zZN_}8z>kBCSZ95RPy6TjY-SmJhNUfDYcug|9DG9XEwfX|Kdtz)&HJ5r3|fLe%T32K zXs@-!K2=5-cNO@v;W$0F%uOGAH}EUWWX1iA=%Xw44l2G21R&7fLi7)%y;4g%b6(ah zcze@ya*5Bj_USo#%9^B|tMPSw`Z)33r#(Y_kV(N?V&5NAoY+|$BdJqNL&R`@jR8z9cm5YuVUjr#ahi?~E`%()Ev z_jPbm)T{YQ=WD&M_#_b1TI0sp)mDC7Ee_yM@$}r>#MbT^>G<@qH7@2Xx55wBw^g3r z`t?;953SBXY_x?vgLSP|ucnU;vZN9VeJ6Gn`AVZ}@#)qvFchl-8z#lC)a32FtW9q5u70a*h3FSSY#rt?S%kQ2tGQHdr#7b>if^~DAEHk-1#ulYJ~1}ca{t$|?fIJL zDTsBtkAaUn1#;(HMKMr_X&1TJG2kieCB*!M%L?CD6J}oIuR>oT-s)YM2wRqY^Gq7W z!xizKn%<)^=DlTTx8}{6P~HXpUUVfM4`~!g#y|#Tb{u6+T=48+c!u#Wk1>pU8MU2d zW*ErSz%w(8o*hh-VSKZYyKy;Okc$bUXBT?*l^h1Qne}ffV!u5h8z3TUSL-LQ6^7iehvN zpi|7H!5j>Aw*J7#YX7P}ih~~yERbvB?PxI$z;BRe%5IIo)m1_TW@Es3<@12=P_)agaRq*vJ z%lI-Y2qsb#;3E1VeZ=_{eI`U7{2lY*pvX@*<65&z@==k8rujeu=#u0Ei{i7(Oc~K<@tmbS5+zq54nNJPy)DwvzLEcVv3(^*F78{Qz>DxRq5cZ~i$o~w ziFjUNtS$x*suDq>>X7--LSBOagHg*LFk`5{qgdgOEUySjr9GKa>Aw)4a#q)RRG$mN z{Ltcf@Wgu5*Bhitu3A0`dB-2@FerE5ns{X)Z`h-7?24zdDt|M z`oq(@_E^e81Fx9lsJ(L;ew-DaBCn3XF%*Jl1szwybBk39`IqcNHQzq!M6wO_laf44 zRQtQdPEGt3<-6{z1E`&)f^WsYL4!>qR|HH94}KqfVDYLVG3iB=O8%ij{>65qb!RQg z(#nA3XBmwLDkGX|aYS(t^o?OI$BaaQTfC+$lm_`iVNg6|L{ttD5tI#%5SifJZ17&f QcjX`U!AT4YG9l{!0VnkT$p8QV diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50 GIT binary patch literal 3984 zcmc&%XH-r&>{ns3qCWSl>YEfiY;u0EYF)g*Cg|rUM zp*e<5V8I)dNqK|X03aBJnN^4|Q9#Jsks(dW9D$=Flk%gZl4It?N2SNpSR#}(B1tQf 
z$IE$tK9>Ya!~>Ct=d8nMv7|e&v8EDQ z|F^8Ayg`YSNBpWIVJi9KEhE&j&{Tp6BJV7g_mJ;Ou@J?i%*#xQP0pAT6O|DqN?;BN zF>~gnBxWWhYl_=|B`=Zu8X-gszsT=vGQob%EB8+bGY?0g!fVI+c`f1aNk1{`(j&s<%y90=+$>yWbCau% zbfW6H-%*LS;lz1Jhe;*#;N!>M)X?RN!RoV<9Nn7;J^z^*wy6DJ`@s}Kf3XmP`_>|} zq851M*Mu^L@=?qgXX>_%E!rI61pGohpe!P>xl1)Q?_31sdr}Jd)gdU~ih|MC^r?dv z-lD_9E)Z6Sw-8HidBNvtGjN@s7Yf%a5%kelsDB>!r=qG7bt(=Y75gm=L!V@IVq^IT z?i|@{LEjY_p>SAA9!iPC*)P1%)z{s+73SW=g~{GDzqmFkdcsP2{phL~vrNvJhJDO9YeH4G$ZHD(qT&Yy8Px02~W}VP2 zN-9xW0cKWhOYy@6lb~@w`!8k5Ih8N^q&#i5s$Ig-(*zXGG0| zG3e##BvST8Cj?H(h3Surh>Ks>qPD0P5El=q-cp1b+QabO^>c@~4c&yxAJ}8TPnRL~ z)fM#e-a-=ZUn1*vxlC>e97$;v84}TbF&M25;>Gn?!jnf00_#o;X99O|^P-}SD<_+S z!~Jh5J%_ERoNJE_3^o(o)lb2D*K)ATJ$B+qPVXy73qAsjY2<<>^Kp9S3)w{j}ReYljUdow!7~`kIkPz95O$ z6Di$}@+thasjE?xlFPT8`W;&KZ=ZPw3_1Ba-LRA5LvK z*^By$Q^_PfVQL1-r*CGM0g?}+O-d&WqKP3 z*JB6^zL8S~=d1D5w!>lwZ4x;CDCW$oR-wAl{b+sTTHd=k>UxF9w2>rB@R z=~X-{x?b?x-$;^ey_94JiJ>x%sWGz=Z~leQddj#kvhX^wc!f$&7}rQ(t4C?fM<-_ zGRCJ!ZTph(cX1En5A^jab=5kaTNqarjLs^1HITv`lscdfRPFEqkIdkA{z^QuF3;;J1@hu?=_2{{i@x{Lf`9_AxCGO%Ax&kSVf#j>}o9Z zyE%6Dsipl`qPk@p-Lk>RZM&+&e*WcMQ&vj+Z;xAW{YYffcK1ryPddEfG-`7t_1Mj-#P$FaZ;=r$zt3^gGtEAKJ zxv!(GXEzs5``UAp(G)c)3%c)}Z{}a>UJ~@ccl#LY$kVH2!4FkC9Fw}6OM)K-lulXp zhSZd)w%erU$g*)2O}JPA1#c}h(ggnScL4R5mX57}7$ zvzZ}{2N03>`!Ir^l#tA#qu6vZF(EyJ8=Dx*LJS5^)C(l-BPC@f@c!%wHu^1=>w-|gBjo2{+=o#2peDofwlk#q91{FZc8`NR((I0Oa2frdctU-DQ zM*kH&(tZb)5@VC&G6v9JFOob&@&XHay*UGr{^t7T2KZ^y4P-ZAo{f~k0-vmbgFT*SUyCzApZnQ)w=Sj|w5(CH#r9wqe(A3aL1A`?k zBW+l6R45e+#Wd#2lAl76tx(1(JQa#Kg?G)5#Y`8ZrSYW!7hwzg+j$H%5av|$Q8CMzcAW9z%Dv8Bs0cy;==H-pm&qC%-nGqH0m=YcqkP;E5DV>QoZJ<8i>F?*mG>f6%<-@j8nHu92lEDfTJ}OpU zqm(@uf0~b`-%PU^8f?8n;m#ht64@T4MtDwK5N*GR>B%@&iD_~++kYm-fk$EPlp0GW zbEY~*9;9gUDc#>x(6Gu}gYtQ&+OxdRy&;+SzYE;^X unXwtx^HNi-qten+vQ4vM)6x@Cl1=TL#@dawm8lE?-hJRC2@uHqFaB>f=8;VR literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4 GIT binary patch literal 4763 zcmc&&d036<`@f&(ec!XEsFQP&igZN#YK!TV_K<89(P>Z1NsDX^MRiKH2q6?j$Rssi zDpYnuri3Cf_GQL0Wl-VwzNcx1S^oH4*YEm$pX;3GeU|&aKlkT;-gEVQ@{)R62uY!r zaJr|ECxih2hC3F2a%Ycj(%o2op`t81$mz2car&$PfNzNDUVsQfVT~mZk+zT&+W<>VEaC>rVuI&| z%i_a@SW}?cj5OPjWRZmPZ?VM2bh04CKzC~N1Av|f2wgqtB$(vQNd#O3EH0Y_z6u8f zDjd6hv=!4-dmoJJS80nleLEr!`rSs8rtTGn5J?LOS;HEs2HjQ%Yzc5Lt+x< z1j`Vtc21QDYTpj1>*uEcyL_+oofx?+1U{gG*SZdig&A<<9$>$Mgr>v8fID0d)HUp(alRU?^RtD>EJv8sG9TtF(gkj=I^3LG z3VZq`fsR=@x>Bl*^iLcBjVxEtKXC}olJ?(+~-T;{{A{_0R)Hw4K1iC|vQ0cmdfF!^u+h!0r6ZfpqogDN2T^I>=dg39a>AjxLIp{+8w zcvlt9U!&mKPIKg5QUJr^e}L|#&Y=I)44MOT;O?gwSd=gewx;cY&kN7N7MK07%Y6de z--c25@+V-V7z{&8E#dW>R4BeV9EQaVK%KjPLt4R)P=XiHuY~^!?dO-1zVlqH$7Yy`oCA{`@qL$Gc$R{4B(IVBmXpP$lu(aQ^fUY6F^ZHiaV+)bZydk$SI$RDs?*` zR`ob4O`8Zh6Fs2j>R`fPV}`@9R|YAoRACcfs?2pj|XgGb!~`;$7fSvV6+@> z-!mKUtgt}U!>fpri=WY)%TI_2r-rcg(-z``11}Qa`rA^*2iFr(=Dx%Qh=;LxL9qOp zD>Y!nVlX~d!|Hz%p^o1O09g^3e>0xYS}cHq|Adk~!UpO;M<-FTBT;IF2TL_PB7M-l zg>6_zGL*eD!VvW|hzY6HGV*}jA1A$YLD#z5`@c#(&rNR(;C_dGQQ0);EZhBCRl&B; z_fUVsJY=D0y!}8X9a)ST6xJ$gPewx07;n~s z`_9N`XD*tzYCEiL)rI1<#-y&-5RTb~bv*v*YCPZX4Z7z(iZkra6BN!f=3OqgWzSrn zqQHjk0&(b8-_=J5mMJHTmEs)gX0MZXqss zWQloiuRzF$2K4@ZB#DcbirXh#A=i89Q~WL3L|{)aM(N(1u%{#8#gjVT=>g~Al-CZn zLKdjAf821ediV>aZnXgwur2YRLF)*1`YUkVu>wY>9b|bX%;Cg#e9QHEK8Am0=<U-s*sUjswc;JpG3i#i>r4-k~35BO0`%=n&5PWlv44ya;x4RgkifzuSCR0;L 
zPR9*`)2hW~-#<#+EN#G=N93Hr`-?#PR~7Ipyi8nbHDkpSmnnO91M<*Sk}&Vsq}5t5 zo|_q)j?NNQ?2%JmqU0C1Ab*oHF{(#SYmwr(7%+N`Ex*H}sQpJ57T zo0716>O3^bEs-a=eFq-uEF2y?tpX=xl(IuN>rzgkE?BluMjl=z6kOFG&z>0KMqcL@ zv1329q8rCoVC^ln8n(@AU_n!vp1Xk+66KvEPEIc(e6yaS>4Qnmgpa<|`kLpcXKO4O zt*!&%j{8vLlp{pg86dViFoi2i^0ALai{Sf#ahz}bw2*HU7n)Q{P_lY9VSgB5-p>+h z(1miGEPggXVe**USRR6d$~{r_h>vJ>eI_Ss`hL9dbR_F8rG*1|`oxvA1h|oZ05wJg zg3VP=^48;#_^$04bWNO!FQtn4Sz2}!Z?}T2m*d~B{$VKMYgD0Bl?b?2aG%)B(p5>* z&_cNh8;NCm@1RTZTUbVw-e4c_fK?!<;O~8u2gW6%__cQ0V8xq>p1t`MM|bfdX@ZL8 zvQ0wN`tAk(JU5*?sa-(4IUkD#-`kFKr#awOe+junV-jnRh7$_&{+6Pz7PSK5dV#U9 zmiOnSh-6AHBAGv0blc6+)1e>4rymgtrJ!>*bBn3vk=*5gIU@VrD^du~07IYJ-6<>C zYC~PO+83mz@kN&5Ewu$JGgS2)6e4Nis!Vl5@BKb?g=wn?ng?FkD&3R5X3*%kr!94R zGS=zZr*gy&dleh>U2_b5>-T1E9PU-E#=qFCuaBHrI_E^H(Bp=&UuD#t`dp8jrgOcPeCVQ# zx0cDyWQoV68|RgWT;4WjDa&MAMMU!f-|BRe?cYbYRW0sjneI5Y;NhjbaT%uhRf#{} zI=+-`wzFn&=kwO;46|J)lY6>8bhFKO*QStsEjxvIL4BHF(AZ@hi^4OR8b*FK3X47G z)@YAg(!;Ubdtt*ck8O6DmPMCxjb~Ra<60G8&a;lZRg-D8uOZ)VX;%-|djF3F6IW}E z&$2FQD*6lh;A=FxO>!T4d=`zqPl>K}fJWcrKE4lq2k&e2k|jMpHhWzU&N}iH`avan zb5{I8CHh0&U(lCy@c!h^pr5o&X4~$pDGPkj+KV34{h`}MbAD?|=ojdAdm>fSIwP7( zW_%ZV=hoZEmLnloi~Vo^7TtC{@zvuyx8KR{)upKppWF6+!NZHWW?}7ZAL5@j7EReV z_fA*h&#jg6C+&B-lU_?YEwqKWR|mL$eR<|aN5n9iAS9lQiil5OheU-i7i;$JXI==FMnMBcvTJ%PX0q3aH<7BA+iNiY!1i}mjgkxK$!RYt^XDC^Q7C~ z1p@7+Y$8sd4I>c#zDgT?Ef8vK{!JjL{vj8lLSn)adi4)25k4Wwerw4|D_SD_J^sI- z{u6I;whf3_@j>As(J~=P%OnH*h1xRE_B^C+PYWin!J@B=FJ$!^f308`yFZUbh=nXq zJI0$7DUXOzWSBpO89a<{&WI6|5W~$tm znltmrK+DM%fwN{N20M?5^!4!$XZEAL==>JXr#odlM`aK?3A**hQx2KEul(cvX4%99 zyM#$)jK30|rb9R1zPa~|8O+?B>3XRvQ}4|ul3sI2kj=7@`}$0l`}ruT(|A*R`MFR3 zIv=`Nbo(M7rj!n`!7eism<_3$Cu6TPO6tR(>ZYtWU2MAbm0l{H$P8Vgm>R_TIggni zXt{{4i8W(H7deTkKb>Olq0nm%N>4g-njO9GD|JZfJtt+dj2CSqRaUasNlCJIT}jG-rOb2Pn{AkvGK#Vtq-NiYeDjWv-RKi7BbssI20 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ad59ee31cdea4b0c0bd77befe4a698181186db33 GIT binary patch literal 3695 zcmc&%d011&7C$rh-YgLI+!zoQMFJ|wq99sC5(EUJBA^f`#jpsJEwYHE8W4-J3gU)! 
z0|mvUR4LdBwjhYM*lIgxYM-s5zPU-U^|k%uec$_D&qrqF%$)6aW|DIV zmWQ~qH0#4oSF!>Y%mHkcR#63GKPaA$mVNuf$Mstt<&7R5b!y9$`%7=R4ZGP}^kW{IcLt z6zyL*8#0^1a7)rbx_r-bc-%A*ed1JOC>S*pc8%OcFRwJo;2pj{h=Wjq7eG-4X{v-6W{pZ*%<1T;6nN!fS{4(mTyNBF{Xz;d-c=)-k zo)S8EGkZ6kfu_zUXcQ>%lw&SvPoNQ++jRs!mSv*uf(G?KFcL%}|5X z;a8X~UH3ukgqZD5g0N^JrgGSJ2(bMLK6ATnaQn_AMhr&axwHd*+8YQ)DnCda?u75Z z(FP^G9n@E$-cWpWA%57-ptG}1fR~pY-8$kSw1^ku_ll;&@aZ$*$mwB}c=c0cx8egZ z@p}s2#54irEwUMq`Ii6PazAYY>I>YJ2M#bg15l&qa!@C3kW^@7kO;z z25^}+m9lzE1fc^Z$Zlf~+?v^qvdx7kzQc>T$qm|R6^$zS6| z->t;=7Ut8fvx1S}mKWBT97aRZGH~xjXI>WdIpY&#NAEvPQ!WqJSzWA~DlE~gL_Jg+ zf9#y^(6W1%VCy=6YS@V9z)Qc+Uq0$MyqA)R4^+N}+BTRPd7Wux%;t@Up1xe1F*gRy z3d}K({q1YGK76sgCbR`-uX>NaaKi}3Z(#sdFILk{@35lNwp01j69Vb4gbjSn&ll18 z!v)xSW2e~TTm>xZJ75!JHvvgE9j87CYoJu6w^7(ITHyUm#Z(=+iw0^mbehF*Nb=o{ zQsx|_5|073@!Rq6Mc!5%;o2`cG$d0nEz%09QiafGvIi}*D5rdy5H|RSj2ZfAGhSwS z$Gq6-rtnO20*-4|qV_S*P+3=rpfs!zFYZp^U1h9rtbr}{MNu}K-?|r_Nsa~g(@Of% z&9V5Z$1!x)vJm$cS{jyGc`*jH#eAFUXydNy3z4C?4HX(C!`ZsOQyX|AjEcloXnpot zYH9t~s5fgPZ(M5#_$>IAS0`#QtiQ1d9QRBz?DVpR2?q1go$r6ZX-^Cx*V{<4bRCN> zKE8)vys=U^>xzi_{)7e%yS5pP2vy*V(K0$uJd3we?1vIVw9M1VjG0=&7cMw> zMttSP37+uT1!3xS3BHD93fd-4l1=WY z8(T6_&%Z<{kAVkegsBzL!hFC2sZVXea!O$C5Ybt?d8P|wWU{YQ7MsvO}{D6mx27q79MQSG4W zsxMhNf)~s$~HPL=qk4K-BrG6LQ-F&s(V+(=1Fjk^Dfkiio1PQ z3$?WGZhtLES8pYrE~{9r^>X{lZA0JAysgjzJRoArh)pA{%TL7K$?e1*SXnfhX-)mB zh^ZY!<{dXJ(4u!4?d$sdRg@{Or)KYnoI?1z;x z)ABj@k19j#3f%I6W~}#)`8PeAJ&D=U$73VUG1})p;(z^!Wi2DES*$PkGhVHb!nEXcuJ|Wp z(W%K<+5Cjm1g?6LQdujK+_sUNw~}>9Wmj~a#cyBQAJ#)3*7IfSB1pi%Am}Hh|33FH z3EkFD+&g}yX^{&0xpGcODj1aEO8@V7iO4UF48~NLvF!TjKdbyEFZ+Phg!II0eRFdc zNp2!ppRMfqC{p`>Z~jl5{t;Hoa(9sOvf`2w($p+XdLTRen^wxsTDjlChx7#Cj-{_A zoaO0-Uu{h|xnHk>#Vk+h#c9)0-L-J-7A77BugA`c*tg;A>m2_lSdbW7T3agr$9>*k z)Mx!)9`9fFk|YOY5@VS_veHHjU}-HC$wNZsgip~wENLz6%#piXA(zYNk`G5Sv) zB2n%ymnX`76LotEA~1_v`O4*i+)VhvnQ*#g7H5GoB%hNv&e_?MEc|BA;CSC$ZuH{@ zVp=X&a)Uy}vAT1)3&*F#y3KZt4WFMA?>{+36%mreou>tp_}+Sx3_5$hIwK?qLVA0; zC3p6UepY0-dwP68qFl}C>)aDKgj9ok$h>(RAH)rM=(^DL@uZMFMYcNJJwp{SJ0mhe z2c7tvtH&1<_G&yNS%iKQ50^@XCO%+(Hg_NoRC4k4@tlJWjlZl)VNpdcie-fhpQplb{r%56w=*YQBuE^EjlP*|Ji^#}znba%k zKdefHT${xSnOVsh={Bw& W<6XzQSSqanUjD%m8X%bbtNCwBcqY97 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fb2a0b13d44e39718519f4b481204d15e4dbf2ba GIT binary patch literal 3684 zcmc&%c~}$I7C(0~NhX0nSSBWrR@N95WfNPa3lks-ii*_A;(}q3ogjd4Z)2? 
diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet
new file mode 100644
GIT binary patch
literal 4466
(binary data omitted)
diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet
new file mode 100644
GIT binary patch
literal 4906
(binary data omitted)
zC$ENPpDz7XIFc|MYn|qT@)EP~5%UJ{H03)eCh``x=;t-qlLgC( z=INe@GiZawD<454V&br}Lkoxx)CKa;f$5~AA(~fSTgRUk<&P>;Zek-1bl9blM(D!@ z5iTCTl&FcDho!!8L6_d&R(Y0jio2#ah`SFxRotr4&32b6D-?aci&TsXkvTtyCFl2q z6LXuxU3#^ZQ{z1fwVgG?=9I{D6UG^{nw0CX8@KhbWsZE6c=`QU-gl2Uep~FxXkj_% zo20Q*$P{S!oQKU>(#c+F7e1r6O-|G^9@QVlBGojQmTtq4}$eVbg#X z?Ac^OXnBs}nC5JjlRw^yEew2t?z&Il=-nDb5ppJS=NhfqzFSuExUJq;(}OYS?a>%Q zbnzy5PR#_bgO|y(KeV7;$s6!Lx)tvjt%y4N{INde5H)3PIac<-0+V}v9>Rt$pttv; z2yFKf(QVuF#1>CoQa)c35B?B}p*3Ed@Q0(}$zZ44an)0B%(H}@B?%r`HN_Cd-+xM~ zjn6@4Yzs_7V>8ZP^9)=|R>0`YT9!xBTu%H;DcAGyB>59M%azDg3ZR4&cu(zB_G>>N zgX|~6Pfc;`-Yr+4bwUr;p3{r(x7AkG-~lOmU@1$GKV$ zI(g&{fgPE1;sj5_Dhnkh3glws592MR4nMkbN~F=z>a% z3zQFzNZ`~q&%*;kIPpRwv0wm9GK^cmXaH;Gbp2yNs%;c-1ZAstbODk@nGl}`EG0k33Me(?0 z3Qh7A4+?>^_p?f~R~2WxoG*Ix;up+gOc$iuDhifvrO?2eC)ndtk74?41^jGR1EEy< z0<^sCv4MF8#A5z*)-Jvi3ipzd^u44{m3pUOLfuvV{fb2}r56&+j~y-5TjvdiFz>?^WzbX{|e0=g=)zc=(O zcP+Hvy>8nW|Hg=cuHEbNO@q3!L=Jm46poMRs|q;2XJhe%_0^_PXR&6PB$Wk>9JW-Xi%Vmp>TajaR?(S&?7LnIaRDWMNbEkc_x>L>8YVRtq z>cH-rZMA;;gL{gcYV+#m98P$2y}LGl-~5gYu7>m8f`*XOxyEx&?kzkJ{*Q9EV&}S| z=BWP0$Qvi?iVwxbcl7**9?)S}B12c5Q2!_N?2>F6y}F;0>dn4z^N9CD26wU)%y!?9 zfqBZnNNGeh4K>h0Lp2m%t^WdbemetIBn8dt++jb8QledhltNXX6yUi?s?c5RP&sOG z#QjHf7&qIPBB;9TlZA0*&TtWo7gJK9;SwhG+2{jj`J3l6J_50ApY=z8NA&Rd8YZLn zZ(qNBqa0+n66Xpm(|8j5OnH%Y(pz4zW44;HZ&R8|i1S8`Nhz=1s)V}c=(^=urmKdz zZyOVEFlk72fk)x^$g551YT+}sPf7pv)sR|*XPHB`z-lo+(rfp$LhIyr{DnT%-qpU% zi>WBzx;aNuUcaNF{To7h@~oB!Vgj3^hG=vog=nsnGv`p8NUGP>OhcYLtnna4Q3`{U zl+%|yq#~UdFoMB&`s=Io5fPqk=@wGp_{8}z-L3Z&l0IKEM0-5AZ+`P~#1_~T#7mB^ zRWbB!D_9hAYV#g=5U&+XElP^KR5`2Cx4kGi>e@cZsl2Jh zDKR$=EgHPqUYr_t=h#a22-}jhML(R%G4wlHlAbttvCJjkcKhPwC)XOnuN~dKB=y;y z&b1@HEnS-aa2nIS6R?xA^6e6;Ix531)Cu$fBYi<@?IW9&G?OFaat9CL_`=y((I&PkwZuj zmuD0FZ=Ugx`KD>1KmD($m5?!$iIDR}hoK~YUZs`(O}8+bZdEkUjt%-J{2d`t{*9hQ zhsA~`4O6gOOt2a=IBIQh*@~tle}?!Q;J->;w8095ti+IruowwN&_qoi`vc+A*Ywz{ zW=AtNu)>62*PmhyD}T+H7`@+5hQcV8hYe#*2xX_la6*hm_;OnM>a>4t#=i{s|0W9p zRgtF1<3G(t|401qzw$Ew&t4_L+lc~I1CHRW1s_19DO3=9dCl~jJp5(}cM_(ISc&b$ zV(~2cVPu+EoGP{t7dwl^;bMny*_u7AFr67Wh{bM9PtU*f^h`EPWIQmA#4{Id!4@;jgZ+GyL!Bo@1^9bKF#9o{bbRRx z=|<+yLFR-Gf|g-_vLUngm3?BMpH*zAOSo9V*vsJQbZ80qGWVK2o0+>a&2Z|n(1+uR zqSx$`Bz{(L0sb@M0{vyF(|EIn^K z5Zk8?pOY+D#){Sv%PKi+BujF5U6y)4jB_%LBg>Pqr~T8W%+U;Mf>1kNlCUT_ENN^= peEe8RLPA`sc1l=6Vq{#bwz>5<^KoV(4^4pI{-B)z@TC7s_AlGe7Q()!s=un9a5mDN zAsIKOznY;L&llyRS>Ml zbh12q95TENf|vf)(zVP!m?)^9AleNdNmp8(5Y)c@)@;q2R@l{HHFURFMvG$}2s2xR zqUnt_!t&9dz=P-9(BS<`%yL}%&{d`t=vs>%`by?Y=}h*Z;_>amm?A$aLtX~#oTdnl z1t!yb-xQ-yryUU#m{PQrXCZ71h^A@_`%BI4Is(I>NP2QxAdKCx3MR~53zjt5fUlQ0 zk=bj56|T#cicOByph@vhVQc6BVejjAVXD|ke7@34Fs5j>nW$zos=D15{(2~rRMa(t z@S*cSRC@_=?q)Tt*SrKNsQ?~!l)@tov8Z+7gf9NwmLRku6Nz730NPiz@Yl9<5^b5L zXz{r~7KQhsOiGkQ;_GAt3nGOn_Z-24yGO*%%?SJuzD`i6N$jz0h&}MQ^@uX}SOx?2 z!4fO0rNj`AAW`tTS-^4L4qB}rE7U!oDGI+o$mCXanV9-q0=f_+ab`!O;6)6eCio2o z538~T^+lIK&44rL`(^dS9v@p%yTv7Bzlf{E&+sB0Z#M!x&!B+G?iW17x;>*qx zXPaE7MM76+i)Gu(xgR5Zu%0R-}=6676Jr>OF zc@k8l=b$}HhrsG$GpTp|eyVG<3pmkWK-m!qFf1fbth#m`-0G2UuNz&3^j}m6CKlUJ z0TY9eCSODDn#)M)dJPo}*M^XnMOy^AS52_-yIDwCa@fM_$HicB!yemEI}a#deUdm7 zwS|aVa1TaVlR}^0;;5p7_u=a=b!4V_50L7=6{e5cOQakJM9EDTaAE3t6zhIP@=f;~ z;jnm17?&Xe4W`@S4D&^V?=FbM4^@=a=}I(1(cZ1l`HrZ*Qj3x*)v%_|Z*bw!`N9QJ z+fe?obovHmi4w)Vhzs-dpmBXUtWQe>o^@*S>K#XP!|OP_q?m)w&QX{wu=J+HrG*09 z<>R^>y)_Yo+CcRNoWwQD9 zYT&eefXQKRCGZfBf$dM9q0E1p0E3TIHhm=nn_fOZZ^jph!dfK6(^EQV-MSXqjP^rK z<5c8Si!geVMF32RoJrv-b%kOUP)5 z9{oMN1`HfDc!>8kBNzmi~n@&xbX=SG>J*c z+KDNtX_L}3GPAOQ4oxO0Yn?Kun|UrH)8|Qbw)xnG!ETEEX9}HUDpoGiDW`For~40A 
z=*Su2BHh55yuQDUPne}ioSm$d>Yn7x$@+Zmyn-(Z=Py{eXz`MzMa!12Sh=dW;j7Y(Wt%o{`FiWN?d3amR_xlnXK&@cs{P*_`1ZT%nuFgT`r+`AqsNY)IC<*y znX~84U#P9SSl@8z$IFdBU1_>{t@(P(jn<#rZr-|m=kC4s`wt#I`sMMHr_Y|NPFpD% zWNbmfovXMgGcAj2N7`I6BP~}i&}L}43XjWqDJ5%TWE&XOKNwXx-|ODZ%m-_>8{_u2 zF$t3(prCgG$BcLUGMCf9DPl)W?d$3~5HA8o26KCV12+b616Iwl zYHsk0sHwiTy^pC&rA)OAZ{A9v2&#%JDP9F}}|x;eprZ&nv+oFd6mnOODs+ zer|k>XI64h3ajDn^X9QTnBqG2kC7N&RbaG~_mB?pbsa||{_&B^mZh7S7?A$+`XurYOno@Iw~%b*M{2DYbBxI%iEzPPV}|Uz?MgmYrqm?&ad{;-*k5 Q0eJfZ2T1_J@gJUl1J8q)CjbBd literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..842ce2caacab4c58384e71071de8b40ef03e2e4b GIT binary patch literal 3138 zcmc&%c~nzZ8o&3xmlXntyqI7VsuB@dL~inR#F46h%sF$W-%0NF?Z12TeJ8PO zye~sDuFMcMo`53&4nZ3{?e-qKe>b7w(9xn{j~bI1z&J8ePt12<_zYhW0NjOEo=j+! z1^|d$5gus>DWu#76$~x+2Yd&)Oq8L?*QRA_3bPp`rKMY-v=%BOl)_h8k`rHmK>!a9 zcmm+hA==?HLiU!42=)q5CbV*7LaVG4hyCIyHt_GT5ea)U{VZ&EiPn%=q}Sz}(zF_r zMv{XUIoh;LLtc?SpG)KnlwN?!a!T2T6#s?Cm!?-@$CTNrnP!T+gc89&-=?Y$&7l7d zFYpf?h~}lbO+s6D)j3VdO@`ZxucCp9{FMf8gsDR zp}>0>zrnr^UFmT{A4r3FEemQv<@7tkq*W?jo@^_yop72y5vJv9C-y^c%sD3L33rG6 z{Yzn2=u}=?=_rZAZ4Y1$jOWj~83vO#uZ3xgHh}VO7w}QJC*80<%-*MBrP$^~6PlU& z5Z;IyEgW*~Hp~`#iqF*t2ohHkhkZ`Ap5PmX;d>u-ZQ?H7Ir&}Y^oGg##1eV zOX7uD-+F+1w~vXvt|IVR>_&m$<^ig9>`>r$>w8{bzX}*;im>h9e)&Sl+w!n^}sk6Jf1do#dFI_bTeAi$Qw6D4dnns^RpH{R} ziXSDwZABHGy7e5@((S`9q|Whzqul7-=V@xhqczUm)#F43@*Cji-xn+xbQ;v<7NC7A$HK-+2T4%-L0+G!-r!V+85!czV02`WSb60dxHYhN zsBuaIGQC?Tm{IA%3!M>;G{qWv&mzY5{NQndiMmMoqNqk-{J9%;eNuuHRn1NTU#TP>|2;HfEJGbHHEk8$X)pKeIu>!I}l`tY=^lq`>CvxfU5f1 z8??^egpz$P*?!!wKsX`Q8K&fkK!^PfIIr(=DtHe>;_sBa{%7jZJoz2RQm=1B?e#j8 zS+9mo!+wR!jxQCKCDfwg6S@5BJZF?49!#|^Hi529J7IfH2Jk0k4!Z5wR1{bL>Q+%ejwIY}Gzhj|S5^z!x@;XBfAlz+hJ*8|6l4H`FoLNKcenHU-t9uXNeNgX{oCN?g9O2X8{ zq~w&;H~uzldRn?BBU7uJk(He@GdEA4Z!pd(C^Quno8NqE_MEx%=D%ICVBtH97B6|X zbZObLh$Ml&bEAUuC?v_h4zllFE4g|b*cOEm8;i!t}7qN6b!P| z_lVwwQl!txC$(NzNay7gngqH$9Vw!?Vm4FKOODc8j>^MM%H>=ic^=G{mc?F;DGd_H z0l06OUQ+d~5;CDxhE)9*PqBgLtA3f{RaM{qSr5q5<^Dj%FU-u==`{?E`GvHfHv4AD`FpB6M;D*@u|P7)l0gnq}2wP^FM9$7e^7eQw6c$-Ws$ ziACD5z}%GN_-w-0$Kv?jnu!O_UI=Fd2Z70Ak6RL6Fa5&QB>#MEcowT6`kZ^54yF|A zK7Q&{vX3HzC3P-zOFX&wOl8s}`5RJ_CmT|exzw@0I7@y}3BBXNU}1VL9s;Gxs0~jv z5e6HnCi0w++3G(olEWK=jmZi<%T6TA@H~QnF*Pi3X2!@jF(#vk5C+*y@W&xqE(M>d zI87Wm&JOcaSe4mwPh7A>3(K$^k`^T{Ny|BxdWt@*2>apkB>LDsRwY+skpwbVqo!b1 tkfDMrWsiGnvO^pskK&o^_N7105SRhIj1S}K< z1;Ij5KoPqE6~P*f2Fr^*v4HyS9byu{B!7Hsz4hK++*9}7zrFW4Gu+>}N!(2-f?7lc zx>AZ1r~#NX9ZhSGjO^{q;M=S}{<%h2IQ;TO8_#zy4!sFJ=h;?d`Yh{Pt<-rbHK#tB zKDawUao`{YlqRKZL`+zhlLTmvn zzF5TdmqrFI43Wl$P?%64e1e3pkX{3wT%;+j*!mpudDPX z-%6ezMv#0n&xBlhMejKIdxi!|fl&eR5kZl03j?KbQl(IO5E{5JASygQB2u1n4<_7= z=@k=t`MBo)1Nd(Xaa4rSj$`+hP4uSYojjs(K{Q)4UjrO2)vC*A(H+ zW?gvPZU_OR1!!+s6m0xP0J=Ht5Oq%%z*rq#Dr1oGb13*7`UqD)RRAk^GL-pRv;tvkuIvE&*|+DHLM{P^eJ{D?eAmLzoKPjudwP+bOVlwgAKj93l0|k7!A}DjXz6 zK}0GSvW!S*KXeF;Ryu&&2Nu+47=m5~3-;$q;p|-%=(s|{l_C@5Qc(hzsy-N#HVj5qnZb*fYhdqneHay~ zhTataiiQS0L~#ZPq9b*{VaqItsrJO`LTZSNicj#i+8iApo2$SXIRo~N*h?%Lx)Vk% z&?Vej}CQ-StOJTI}C47^!A((bQgb3n1*1Y{Fdar*0GmCn{PU^e_?J2j= zxss>IY)CX#6cq^X8+PMd18cG}uM?_UKA@4{icLOcf-0Pq(44j=Xx58CCs%i2DGfMm zPMU{xdDz3xmaj30;lsyb&%bGa^m8ZhYaZ5+*18yb-AJMk+Q#~j84EwPuYvs2HDtlAd00`M zDQeVjz{i(;#OGh=!>vz@VC$qNU^T|q*|G(eWDENl8E)c>pMhAIoEHGA9y^h0tCxY% z(I%Gab%b93iUUaZfXTF}_|RpF;4zSn`evL&acW!?c*2UjVQhj(g9VtrbuAgt;YYeO 
zi6E=i3mIM=N>-fxjP?yViyQ6RiYNCt!m*gSSmO*wlpULf4_VNQC#l>;5uvxSrO(!3 zPZzHu>SwwkPM;MPt#S+viHO2Vjw~hys7vILLo-QfO*p@FUzK1+m=7vTxP=YZ8_6yT zH9!NG#JFVq3ZgP<0hajI5nbuOqxyVJ2X|fPBJKh7v(h$=cD9R5MKS;L4@lJ@519%w zSv>EjoQOLd?uzb4PNl~P)O6kio3}%O8#B(3Ri{#g-MpiVt(-1UjaE2_WqTGxu2B*@flPp!&)L|#KK?DSZn*$G71vqm@u=mk-vZ5}>ZaEyYS6nqjQDl7&&xMd1z z$mT0qb=H%rbH_n@S0c9P)D6x|=Xjpp&D-!`c!GYkM;#WozKXp#TZ^<`?1)Jdq{O(! zT)~~s9W3t0>)dPHJ?!ZBy{P-xYD_byMPPY#6D;jIs2%=5h>7w}<9Fk-NZ$>QkmoRh zWBt*W+}iX64dg}>5$eOCc*=S*Y<3MEd&R8e2PSfpXh}#YjQW zfYvC5!j+Qycs5H*DOE5OZIAm7U$Of(Iv1P68dL8M~zZQHXk z$9Pd&n%Hh{M&9_4u5#a#dp8wKh`xU&&#r89;gqCTk9tp*ZP__(HO3t}y?kqlU7DJq z->LGeQ1JJq&IJxt`SoEvwV^jpR}~zIif%dk zJGyU+{th|1+Jx%Apr`Lhr_n2VDA}mFmu?;Nc*NjNmVwEhn{qHWIT$Gmsi2{H8)&GS zg6`@sP!|gssC*e{R%8wOQIZkO8l()$yJP?_#4^S90=u#i%R?SKro*_^#1uiz@qj#x zYx4$+V7Q!;iS(11)aRlB&Z@UBW_<)=`vJ?30FUY6ixeiK{&&B;eyg&`CI#mT&6D_2 z+f)UyW!yWy|MYZqL$A6d)c}W08j}*V3qsxZ%*eBh{~%c6S>aLPRll4H^QxM6Ea8_ARJcz~ zz}akzWMPC~JzWveJwF< zL&9^R#V{EW<%wj97nA2kkuuKherlWyh!yXZ24E#Qpm@*k#47OsfgMHzibpi?M0)ui zHNfE|yjPX+CyD4-#WF~DkpVTJBzv)un`4oD_@K0Yy{f)fQ|{7$jx8e`v#s)Cg3jld z26s2*#RgyADOoR^njaT>rEE@_S95-R*!2U_j_j!g2@yAsEbZ%VE=Y{Jdm@EB#Ck{4 z(jPlA^}UbpSQguNxx_KYx^Q{?)9W?C*N+z_CqBR1x_-#Cq7}-}V*NnwMm$LVWMz>NrvWEfCTED;_W8^;a` z4`TQNO%7UAg#+fohH-jZ_4W3%8A|l$&-ULkfN09gl~R_!{!ipIDgud!AdIBBP>&La zkRmQ00%!u{{~N1$#C${0LVuBbMdn0|sa!;yFFFhX`s*sK^c%s$7=l-$Ow*sHN&iHD zRDLHz;X#qXaf76<6cc7(dZiQf4o;v6&|e|`0r>CI7iU<2h!qh&2$~w|Vt=BT zx|**0)oo~w2o{*=>ocHOgUVm?Ge+-^)1femnE7REeGE@1!#DtlL`J2r$_4oFQ4|JFm=Ii4g!t6)5 z(eW)?LN{`Mc5)|l5VQ>XlMk7_uk2&}ye%RF9fKuO#$FCjr$dYHm%01gxy;;!X$Di5 zhdvlj7`w<$oyMCpn4gR1*YVKBqUBHVFr~DO4s`U2V>Tqtu8h81 zDe(*b9A|mG>0;CJrSuZXbY|!n&eR~<&tcLMf3qaICgzM1UF1Zj{&a|gheEH}$}Q>0 z>FnrzUx{tv;5o^IWvpl&iM*17M)D*F*X61EMmWUNIPyFhd)hy3${fw0CWy47r7=t6 vgW|>pL`RR6#>7M=Y9|E6#D+#iYMWY)GaY9lcGU#<{STT6`nN0kce#H9l9NZ9 literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..84c399e67f3a1f307e88c5325c849ac627b14610 GIT binary patch literal 3138 zcmc&%d010d7C-mC_d)_h5qKeqQBWjqAfSRv5qTgW7-Vt5&>{wifNYTjQ0s!USVXK# zaU1Kxh*n$}rPxtK!37uUhFihbIy1Fe>rAy{zp-^%oqJyU*Ws5-wgSf(Jt~`^NJFA;GKDeH{m<+5?-PSgaJ|; zPbsxY1Axo}laWP;!P4d)Z2>w_av3jfWZ;IcA6k(V2!9KFgq3?DnJvd0 zLAJ&P8fseMR{0uuR$0LG@m++2MvmgLR`-;Ad#WD!XEn2%%OnzRKr-$hT+gMQPU1d1 z{s5wzl(6MVG!|M8}=z#kpXmBmUd&|Z_?{O2L>0&QtwKoS|?|cm1!~Y3? zPN@SX$dAeALqJ+s&K{akC3&*S40l>fVc=CYw{hE>cyoh4I^4gJsXhA)y>;#`6WrX# z&a4Qa6G-gwTrWG;9gY?Ac&453H=VBqbJNbnmw zhEdOTgtks!iJ3Xag+zO?doHq!-=ivZXZ0A_QqvOD$~4*yiv12PxPJpSRYfqp+@FFZ z?}6RozNcVsb^+eE;tkZe)=3`Jet_#SWeBuhF2?!sDJUkY&|drbO}O2=sK04a1Gc=k z*KX!ocP@NpBsLTo*t&Ol$BUk0?8X_R*lV)wcBY>?(e*4 zq}*_dIXZbeqp$b|P42}?gMZO;E1SMWKd(2jxh}mSGwdUj9lM{&I0?+UuZF<+d7JPw z{|?7b+zO;)lhjC`BZJG`YtaIiN+zTZVf(*oxt?e0@dDL7=W?Grvi5o-POI0U!vlXo z%Z@LVR!rWBi%w)qZgFayYVXOMFSEe)O}kKgRw@Ks)Uh}248pepPogWTQruRma;Q)T zarRrw?L1Z|cRPN2CUS6XM5Xd9xKjNWX060sUgoMsYb+ly^J{LRHuE~k-~$N|GUF>r zwPS-r&DR^iyLN;_OOOVJ*-u6HzW+PUec}Mc!E(j?Djsz{zK@?xUm}b7(vkW8vf!9-$}8NIDs@jyuiow+efoO#^V0SoFmTXd zZ=WH)e*Qy;4G#z$F>=)CH-g3lj|~w*!^VY2M2?S&o}i1F7#kO#Flq9Xsfp9{NpHTD zJUwNGAvG=CI5Q(NYgTqnZeG4=c7fSaSX4Y`?!33>FIc#!WO3;`@0KljuY766vdZP} zuUNTi^#@gJ)~;K>;ddK9+_ZVi*6QDHtJ%Keqn)+8cJJ9+_lJG^>pyNd@W~$!9%?+? 
zbmVAr%dz7pPPU#pedcW2r{~UJxOl1k@|COCu7B3idE@ghZhm=7`%tOjv9-RB?^-Bj zxmkI%)*H=iPL|nXXUs9uB1$S|3!~_8R@`>h9(2`Kihbn8WWKU2c4`2H;jV3o%Kaz*Jmw|3AgawO#k-&_ z6%!A(JMmHz`ySN3==j&cf`V0PRJ#9hFaM2t?EmG7)&5pblA_#A!D^+7J#MZ5Dvi=n zNJxlH9BqBDlv`O}N&!NsAPDi~OKE~26bqplLWCe>2w@rGStt>Rp{p=Kh@vyu+h(#8 zmu6~#8WLjX4fgdNNfzM~$J2dSF&)F{KuimQjt-%Ex)u3?AKhoC4xQ+qnmDyEJz`Y0 zep*5%<>$r`d~;`!L9`bp8X+JES?!5S%Il(UPD%{OOOMPD3{+oqPr@Ojx9td-kPC&Q@_NpdmGKY?hy6mk|SY7*om zI>Ofrp~cpF62Vd}A|r^Iv?_@~TF=GM^|=v+#E%#!)hG6eD!m%3Bq%*hhJx9JMvG6H p$>d`wD9A7NC^8n9v-0yi`~!#h5Ajp!Gyu;(&_sS$$B};z{{p%vL_q)m literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79a6f24b35cf797574a2db8d32609d38de32a0f9 GIT binary patch literal 5244 zcmc&&c~n!^*5Bt2$xR@TKyFMR%!8mPgMgwq1epazL=hAv3^|k%uTkEa&_Trv#@BQ0*pPNH|2~mVP z(*&JPtAc1f8VmqN+?wBe_9GwOz$1;jeli?I8EGnaxGl>J?%laV^I5}L&hX1)>xTCv zXuRrGH#R6z$X6SzH>Vdh%XJ>pK+9=!2U<)EX%5YiyMrC|eosoh*9HJRSBz=J2q*K7 zmg@o7uEpduA#($cP$uQYC{yERCn_@&X-p!P{E8$iG1~%UOZ}5SnFEsm&=@d{km&%J zc@d%$gP2Izj6w(mJOnt8O8_aLfWtgG%%jUZ0_M>ZP@Y1@j1^IT$NDm-t^deHO1>x4P`(+w@I!g>T+ZrJ4o(+JbAw1W^K>aKUB9Fa+t8e!M zCt)OfIb8-NKMG;d=L)neLl>JX8woW>oZ!NCKd?+00?#MCMVAi`h0Ss)vOjGN#8XG0 z(k8&TViDj@`F9%E=kD^Ou za%6Sp0EpN6gVmW!kY3>msozWBwBAg}pCgA)Umpjbf_R8NWC^WN9N5)Sj=J~=G%Y%C zQ)CFWH5NeZ@CGN%3&;(yf{BNVL3Y3yiZDCaVXT2A?+(L57!B9`Xsm11C|Ey946?8M zAot0AG&f5GM+j?3$>qUj2ND{N9Rr6Yz7X;&2P)L|V5{c9p#mi|-O-1}D4XCJ z7lUdjx~?Ou(|t z1%7WEA%hsq7G>r&@vUhKZ1II1JMymJ{mzZ{g)(S9EqrfevLk z0rpz#C8CXXfpwG(5wg1noeF4%juD%P1GC$8206Ck>jLb-`T9dhA)2u>TTY_Bfi0Lz z+7s@AwpM6ZaSL56eu`WMq+>hN;-K$TDbBMSLmnt>gTvLoB1;IuM%Fu_{Q&|rx%M=i zvCTkr%iFOPr*K%mU%F?m?f^=hkvgvJdd+iS(9)ieSzXoEut{9$0SAn0^FVN)j zfd9CAI4K4Jj97dQemM{Tf(Rc-HXVZfFzXboxp)r0Hf0Rt)y%#TN!PI*KoW5*=i`TzGHXk1YeVJ&KIhj>nec7H zO8ESIIr+uz8Q4yZGpZVR3Lp068+>NVBYey`3$9geHdgL;1?hh zMi$1xvTlFUVEICDIC+{Qx`EJ(pK$=?9&j2v8aG;~3sYXNL60WXA(a6S#hvpcD>J5I zq+Jv?aLiFMwlR|2d~_PJziLG8Z+eFg4rsz14sOJ&CitO69e%JiGY=mS)rl|Azk^be zzQN{yzZ!cwXBklu9D=Avo>;nmB^r>Dh83TfL%gP2$rHzdNo9Gmu;gHwc*4Bt=*#R| zn5nHLcW06vdfh6+6~h)22hyUjoR@y+%B$O=A67Q~joxI)OQ znu2K_4o1J6O(A5LZ$ZfDTnIgWm2AF$3biR;!t}HGc=aFws%f8&wd=hnsl;)gC6 zzxNWv_qC#5y5`s1yyPYOp? ztQYQ*j3euO<2c4{dFZ;&R5I-ge_2@fHasB_(A`g?^~M{l$FR>*upzfF@KyQijvD62 zdvL0RA7NO?pDmlkZyM!KlioAU9@YrSP4)G{%uESCMdu{mt@fe^ovjrLhj~-?(AS(u zn+e#9WD<0~>Lv58B;t8Nk=%HnwOj4dkne5vG@}C3{S<-@E>~!R0;w|L3KpW!FWdeWYL5ezyNAuRCuSQBQE^pAJ z@6nQ{H(^JXS0=oULZ0&m2EYnXM#-RuzbEziZ_1-rV3_g`!cPkn<*$ z;|cYEzgkiU=N|c?onI8_A0LoG9ju7LBa(U0uD>5GF;wH@4kL{Joh@m6;V3q#?VuRK z>-cUxVovPQAXH`l27Ow)hFTl856iBb$GJlqVKICw{8Fw8uJ1U2+LB^m&zlh4o8Jz? 
z?zq>ZE3%c?#g#IhwML#KzbKDuzBx*-_Q4#aBR+*z3XqCdXQ5p3zD z1p7~wkxCtN7pBACevQy^5_d1+j}^1LjuwD`PH zvgY1-1uYYCahYV>cz1w&KInAx45*p-ftQ9wYSwJ*Q;T%m)ttyc3*P8rMiDQ5yjtxZ7G$>ehv20K*OjgPs3BTZo;v8WRt9o+D+76~ z)%80T@>v{G+nv-RzG!8#w~$&_?`~uO4KJH5M{4Vz`te*YTe?V*Oil4d7gHEp0uw&uRvc%#w?nX&_3bT#o?%#jhXkJ<6&qm4RP431i6`k3} zsg>QwjnmHb{cN0G!y7K2-yqA8XEfQLkY`@o;bm7Co9Mg}w_n$Cb^sMrw>Eh>2!_AiTcIKEZ?dv{aw(L#cOXjuVk1$_B8Z9ti zX`=m1B?Zdgf1s3sZ)1q|7{YeDhn&XxiJbq3G>4axl**F2_)H=>DO1IbPmX6P5W{z> zMUv7HlDp2fN9=4*b6J-6=WFI~NzHOZ&kAC$C6?WdqpEa)L_|DBGW@PXKoLsB6GALQ z>|z*VH~TlX_L#k~7-2594@jz%HP?!idauJ0yT7h7O1}|Tj3KVB*Yz1%_uBKHXr2D= zBriEWH9^%+W;K{(D`wj@%J$ol46*ww#6JN4T@7SvH;{5NV-w?3lr+ImUK{LB)Y(QJ zbjWZVQyp-_q#qsx&FNSEkax0re;g5w(VQSp)|!xNkBep37=!Ts*ldg#|K_ZJE%3i5 z3lgzRE(`ik^ZNf0i~g^?)&H|sNpfJUL~Kh*wvP}4$mCL8MQG@x@R9v*mb8d)WW`P4 zrBEm)GY=~h6p9>$SAxPqs%_qrr}j zBbb5DM1OWZK8J06*oJYfPz14!R|GrN?kk+w`MemHiOwd z7S3fFt-tYFCrl8G^!w8e*}V_!Gb6*@Qsev*6iU`!3(us(NW}Yj=+vp~JdkbrQ`d&x zAJ05y%}b>WcT0nTPy__A`dX!& z_xO_owDo3+&B*)GD-_=B&@Y*-L3*UG$J`i~1x!s`StX{(Ic)uz5c>~>S@Y6bGLbXc zG5Zk;ublpK(gw>~F**uuCHsxEN%pU6Q;$gT&0=u0d9wD5f5wzOntn}?nx`u>=4ZvL thQy|)4^d`hq~(}r$7f_FrKOrXyAO39>Ld%21N{C6X9(suIp%lA{{vSbUhDt> literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e67164596565a48f5cd69702114b2db7228ee7eb GIT binary patch literal 4782 zcmc&&2UJwoy8ibmGiP9E!r%-JMVg2pMX(nJgrTX3QHr2cDbl2xD4-Y+u_Fjb2T_V3 zAZQd&0Rch93o4>fG(=;Iv7oW*J7_uT{nyN{fBXJ@&YJy=lgQbGNASFO zevUjY55@o(H#@%FRrYIjmd%K#?Xi>L&tew-bhaqo+Du9a#WgNb6yAD~oX4)=XLV1rqWIfjO|` z1$>T=cbMPGK<}tP9;Pg-dLjA*{SNeHJd&>mTgCM_^zTqJ;pOm0(F`z(^6E$zrhWaS*s4`2d$cRsbttCLDAXK*>{iShr7v5+k`-uwW*%)*8dv zJTYj78p5+hAJN5{$&fDOqlwMhKs+)6D%=4qHW1KXqYa!18lb3T0e!*pu+x1itWL9n z1%tuhu~r>8nTl|AVKtP>#(|nqJ-SdWL|QGCpp<3{S}hkKqR|Y(eo=;ISr5oqC4{Z- zkAii!KX@I|gkDb;6b{y*VJQR+=cVDgf+DoFjt3%tJ{UKQK(d_{EUYO8L8S>4VS121 zt^wjd)xcet0|R0nHvH`@*s(|n1c$|t{P0H<9HRiWgf@gGb0FP-gpMOez#!fRoZhma zaoa@D-Nu4L+1}87OAfj&k#MQN7}-}8gLc$8c(=|5wC)?hWuJ7o^)U?AM!P_Eaw&X@ z?Sw3GITYH@h2eb|dY5<)CZ(!^X0<6ieU=0Vu1)~$urcUG(XU9w?=FhgLl6p&jWP_2S#@ zCW6VpT?i%4VJ(^8q3AIjDUOYAlwh$oY;?Z^yZS4Iek!b z`YqA~M{MR9V^m=$gBG_nLyK-CYTwY0Z9IX)bt7Z+cC(sK&GvSco#wywfPnn-kM zK^LUgFGG#$Bha7}i&ZDj2etVQ(0ox9*V%Lg$l3?Ox6*9MUIabhMf|@3rz$>~xA&>L-@VTePv$c|Av8u7%c!K9t(x7S=9%Af< zpM@xxnd1wI4{XUX8@>jE@0wW(R}mWd6^C_$VlbXJ2Uq!;3p>^>$BsuX#gq&Y+2kNb z#m80=?|HrCu_Fse@468AlB#MYht;m=VC)S{O;?j$5Tu9R_X=>)l=VbqxF;6(N{lYO z8&Y_Z)Wu2Z^Wq#vk7Y8(b+GLdhpdDt|v3_)A}-~wci!%m-AJWvxN$picFGg5 z&D#L_$yF@JXb&o4B!T1faEA05%|uyp6BlH00`IOGV83($(#vut{M-=E?%Q<*nx=JQ zExY^h+PNBXI@_}d)5X{Ek_onW@B8oMXEyGTFI2W5+im<<0v21_n0Kw%l1$#GE@PN~C^3|6;l8IA9|oBuF2ANv2*3#8VyJ z**zP}fjh60@S8st8xrfGhJ7}2hKn~6)W`r%T~pz(hmYV_t9vo!x^POhybOfTWx&1U zJl=E7h!usOCoSxCh(i|%{M`&v;ac$=&X$N2eT}5T(#g=#ABTCJxlS#xi;>d({wCZ}i=7a$qydZGT+Lp! 
zSDmz8CC0pCy@|<9*-As7x>%eKS20`I1o^bH*sSV=h$|$eT;Gt`;9$^o=9>RhlE`F}9j*u;Q2I@;J zv1^{X#5YO{SfxtVD8M;^q^}yCvb?v_2D~EfA2%g}DLtQHe#B@gWou~B9scoO2z^7s z3o~Z5h)Zx10i*a9MH@Ea)EGV2(?uIMvE?;wvn`5~lBM~kfrF=uH*J>FuuSEPO1|Bq zsOMbn+FFvlb*!<^*=$j1%Jy;7BkvEkmTumuZjnR@Ecd7G*0N34b8FkbWzPhs65AZh zvb4-euJwV}+RCNMVJv%HRTL+WBJ@lFlvbh9n#TkQ8vt~i}xcS(0){_J>+XMb7Gxy&)&GSmKw z{<3P1mL#6TfPs5s$j!D)hpUrUIDPZxEoqQl>wP9oFe}9%r{4ekz8ULShWi?VE?2rW zr5NTO4gJ4I&mD~}lVA{oOCk?3k=qi0x>f1MNq8E6yKT!jaZTRwsQdjD%e!jwo8q1h zcjt-^7qqN<{rtiAU55)#B?2xbw5=^_-9#zsdz`H;Zr?1gWuIqTSJJVSKPC9a*}BrM zof=kYLc99?-Fx(wRC;vRmz~Qp_U*~DJ96Mc?)0dKH@c4;yjWpdX`Qc-z~9qDN2>soAly|+2di@TR7fD2?NI`V1&P{qX~LYw{dU#M zkDDffc9q2Mi=L4xGLa%38m1-!m8R$*_<$0%J8kmCM+;i&SB=OdY6sBCr|u2DzA0EW zk??RpiDZIscQu`G-BH!a3BYwYX6%bTP*oR>(Hn|27Yo;HlFAe$Jlz;wv*bj~Hp4-T(-uyXMP!_eXg5rD&%3KjKKEG8#E_0D2!nT{!v6oP*gPAKg6H8 zUDH?TBChhFq4M5Ix&u18NoW-oFOjnU z$dmkEdz6&#Oi`BArIa_AC;Mt*F0vGfM2qP!BMu@_oX9dj zWFryM0LCdH= z$&lIm!amB~#XQVU93b*$>?Pr8IJCHZo;xpH%FOMVW)!+4_0fD*(`%N|-Y(|hZmtW% z-CZTn>3EAr@!5a%Wj=JVX!%n{-L)Fy_GBf-ho)Bb5w=4eJWfv*wa9l0jPKibeYBErx+GBP|)Bi27M dDkwZm!({4YlgY*cMP%$Vdir5fE zrHC{^nu%hgs$d0IBUaX+f^MSG>{_C6-y4F7o8*uEzWp}$o4NO%dr$kFdp_oehuqVW zAsIWy!JVNQ7z|*za(T^**yrnePfWR&bgVbyQOb~p1?LXQf}d`xo03}~*;L8^qs>Un z7$Kv{@ED$+4VcgaIud%o3IK#7F*hSj^nk1r=`o~qJn%H75z>n14`(;w z!Tv!pkqJ@JDqwGs_!*L2AY{+5-hUDE58*b5nRc-|bA?hN&J+E(pK{8a0dbYHaa~w7 znNj*PysR9DPL0_mqK8j|gToGz3w8Fx$oWR3N8yjC-t|1(n3zSD`FDy;%`Xv~Tt|WB z^#>3|p2sJ597V5^$PrUqGtpCIx%=-lcMzo`UumMD2g$ zhSkPlB7r#$&Ldtyn^ljnw{HTa4eBs){r=5r^QXOEZ!!$T?S)b6}{1Y--TMHVFKf&MqD}k`JAYvFh zpcB%_V+*(Q-ef7^VQUKPK2uHYDV&S<)mftB#`T1C@jGJPg-68H)5G{9QWJ5t`8WLJ z`8HGo|9dLZ!k0J)N|>Az1grX7sKKk3f!Wa$yg}a}^x|J^vjr4^#b;9pon>m^^)?;t zRHhL_=64ZB*{$$sS~FUzaSz207NDr`J9x?W>u{!@4G4Bno%$$Dh8Lfy+`Fk+6c4qErhRDj_@sWiTb*BIu%eIsaa57 zp*=0a2NfsY#zSSp`To)c!nc>Cgxq>1Sr#)NC%tq=SAV)YsKD5TX#3QKp&aHJY}ak& zyCrL=?S9{l2ASj_OYJP4(EAA;b(a>byndW6^BRU8G-^_Fb47xl(8Ws4~T_e0- zY)8e4s&H0Ii>CJ$1r;eN03)-d{6s1ds^9O#b5}I-*J)5Z-SKHCsd^c@lr~OX^N}ee zP4eU|x$lI0_U%CaU+#g;T?U|3iiHLq!)WuYOrfaxI9}}c1L}60K##oh2!#pFgcoXT z_;a?b(ToxuCF&lGMz2mqk4UpBF8suP z@MfIn73OWzJI) z7brV7Bl6H?lCXHOU8gI5iXc699cm@&`KB|!M_)X-346CY5&8x{126h1f6eeSP!SP_ zt1>@D^*c4yZ99%ogFYV%t({3YW|lvi?wTN!-MRxkLlcc-z3OoM`U?J{od%TSB4-?s z7(iBj$*5f(F@^6C;!0i<6!BwUccJUYR%5;F25psB z5fW+FqZIXUxSD^T*vT_cPu12zJL0zyD-Ya3?aFN4=p&wBxA0qDzFM8=z=IqxE1e)} zu+;-=;T+WW!*d+Ge2YQzs`W08LM9%SI9z+LlYh0I&N8PiSw`ddkIrI(OgBe7#IWhiAA;^!*}c~bCfBP(>%q+k_b zT4I;CdJRDjHt}i5TeFt0IoxHJU4BZcNMaRsqalCodJRKGnnYgkWxAG$XNgZ^LF&dK z7J=t>$qUzQ(w!LBd!w;%eWrn33N2L}NXr`Gvem@5=|K86V~+xt97WNF9ix3}!n&G@ zHfEbIY)X^bAI!+H4(lxOZ9ceZ&jjdhk=o^u6Xsl@7CGF`%kX|La66bmDjkI)-s)Ut}4HI zz2*qFT$k#C){PSD#kbE@7oN>DwBMlTT63Unn~7K1ytbO6^Vt?b?YXYC2QTiKsC<08 zt+x2`e!I0oeYd)jjzX7hrt{nDO0O1s6uRwoJ5qM7+^2T&o%SQ;-yB}pyg`53SBJW4 z0Y86*)HK+qQLI2+( z878hWkk#wUE~)6(Z(HgQ$%7p;@dLl3LoiSXZm>Kl5OFOC@U(y7c+yAAPD0;MH7S6E z9?;=<((g}M2fwgAtYUkWqyCI0X?!3}ks;Bc@%;?&F%mySvRo6{Ib)V6{oehb1pOnd z(v0yS;VFZ{LZSi~k|l~p_&0phNYDL{mK{qOVLX<6TyTchulzBOSzuNpeM~+(|AEmD`7^&J?V|bZ%!a zm%DN+D-W!!q}nPu51b?U%;lra%_p)O#~Ci1Y@fuSq-mHFnZnHnmhb8q5I2I9|hRn?m9tT)|AS5P$f(rqWT|uQ_Nl?~+2r5f!l}$j_1_Wz0EEbB2xD*7d zsGuk=c$MO<1*{70A_^*kMas2Ww_00q7xm5wm$rUw|M;Hgd%io#nVEOyeShyVlPA9< zP!Z%Hrp1BcnJTf47={8^ZgM&lWX{bGo_gdJ#2!z}nfWUJ;ZCpEnCX+;UitY5M9Oh* z&TsJPSsf|{u|#ZSE!HD1j+kTO43@mXJt=Qc8vq2BnDlyxqM%Y=eiNC9X_*sn^kh2PFhRvp4%?4vTLPQ`HTgJj$4pKpTmbs4xtwoE!Z*T9(Pe&E1bybMrSMTBgbK> zcymfLyr|z!@hzt?yEnAKzN0VEC{SUSc~fD;R6l6EFq|?h zdWFoFeFeJHUctZS?gPrzo=OqB18-pg{mr~$&darF@Z*VG`10fdX5+RG@a8%P)L>Ch zjoe8BE1S2M?%Im-Q+Nwg26PD2{FY>0xayWY&u>`%b@P$NhG 
zDnflv$z_YE0{i#fD5Fn=5Zu2UnU|h`+kVX`eJCGAA8}=_+1aDj;m*J>F#yJL4z`$5 z%S4?DXM7u_u&#DCT4&9`*vm%Dt}}1Zo?&Mw>pg3zPdmNf8}%&Q;OB)ZzMfC_i(8qm zzw&1y4kYST?x`{KO9(|fGrI8z`6%w@cuUmZDx(zRm(aUY=HSd{Ug*-R9{q>8r}+78 z^Z5Tj4@AYrC%C>EUE!B+dy&572IOG4mZKALpO@6b<1guG;O!0`i5kz@;}13o_-fm+ zoI2ea{9TVZUOLrKKUJ_Fum9*SFJz69Nt9NCx%DD$29p5?-mb?Ve0q$Vugh@ip1hzl z4}5}J3nqy4Zd*a-B#_6DqPl8l(Im zUhEw!xPSYYPV>-Ha3pXOw;&>NMD=?XF#hHP#$fzfRKaz`#>T5DZvI2?-joeid3!jj z^bdKdeHwn?y~%>(qq2rD%Y-n5qNy``I=D|lfSIRsfuHJ9xNU2$K*Pi{`0&~`YX1~7 zUGr6?wBw9xRHcPC)!KhZ&*fl|-WG{F)9e||F?K3I9n)qpDI2|IGw+sCv2lQUSI!&a zF|-KdFOslLHwMpiZ{d66z4$llD|P;bE^@-n{cv9*1A>1%VG1tAQ3a}S?%AAb5Wats zj-EOU_jp;Nx-UI-ZD!=qyuJ>K*JZ@#-uQ~TTGNUp2U2*$tE<4|kqE*oTd1>LcAPY- zg>m;ar}tiu~z#HQf2@O_^!)y>LWE z1ikMwvGBrZH}2FJANmTvikteP3w3;xjZI3A8aiKI1s}KXH}f?gkE9z;Qs2(3qQX|( zK{JQbyeY54m^F>}P=8q}on$Zq;yiYugy{#U*yBKz{$LB|7nkEuhfd)?hpBn*hZ~`= zL_W0Z?m|lq3Muz}2d>mbp<>IrsGQkQXS4L-R0oQEZoFPp& z=Ock(J<1iu!=;KF)OwDoD9_Lcm85?`E!ln@olPs{j5`s=EC zM_o-|yv}TN_t!@_>7@WNr-&p=ip8kw*?s)>qkO)9kC6KHR4N+YyAhcND{ zpR>(y8j20lFy!m6MNfQM$6A~{3;UFKeqnOGUfJywWGb)S%x)jU;c$fV5Rp4Wz~w3aiO(w zT}@RhOYP=0705hx6mA$F*IpggykphIiO|b>=WB!o&F)2f4ehbhQv=eXa^k5wOsp}< zu~@y?_*3;Ar3T;^qPCP^iZu4m3EsIa1P_c1s`WK`o@)qHCDHlJQ3JH_OGXpW5BaQ& z`jO_BrFp{P%3|-fQJ$+z+;R#z<#n0URZ#P6!@M~9cZ#u8PhedLM4;U4Q%U5@k5Z*p5E@xL}|q!i>vTy{0z%CZ#tmn*N>Z}FI|`-E8KmZ2<0V`6=Sg#6NHs;vfub<3~p zMXz0ZomY@Zjivi;Oj1kEmp8~XzvSrt&{$|RQT@}MUq_!_GsuY7_51 zGH)q!jBVc=-zRU~LMbJ-GkOuJ>pz*T6RM zx&4jfZ_=ozX_P#FoQW6@5K8ZN$rnEi)(y6JwHN(+7BTA(r$b`5_zmMQg$^|BU}P zUjHg>S)mh1IcZUGF-Z|(n$T@?{0G)+l;0)5tlsaZBF18l z%9XXIrP>o>nLQ>R22aY|h~S&C_*%#RGg*-6$xLLb|Kpz&|0Op6|8kH2Yd1+!Y%kH1 z^CUlyH3X2ENQH`^py?ql0~brWm9}G>lR~LbC}xm{ZGH+xra~F3@Kh*b6&|tLH6>B- zXJ;M?g%3Ltd2l3>cACZ_ut19Gi^tj7O(K(N0p6_ak;(Sc*p5Ie6e_k;hOwe{Ut!P6 z36YKg4v`_V7e;$dP6!JPievYa0!e(IEFhg0&qE7Bf*@@GPdjDz-my;$4{=J4_KHC3fD0FP literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909 GIT binary patch literal 4341 zcmc&&X;f3!7T)I$$xQ%>l3)TT$Pkek1XMsuf-RpYYQp@b*jap*jBCfX$9ro8;Y&3?T@$CTd#X1XWGNr-##a~U*ak8G8Yg6 z2f+*v0VRMj09whpi^mVWOwzBOKJL|&T>m{MUw@Ije0LtNw8Ca?-;#A#&j>)k7YL1L zd!B$LU};!`J~g5fQX|#?z}3a*mO>cb3*ru>As|Fnz~YI7oB&y5;F1toY={6;rBqvx z>SZJj5mSE`N@PUm2m%b~hDUb*@|>rB$eBRELBQg$9YBfJGQu0~7?+Q-6;dM_A%%Xk zQKb?5d6icFh5(Zz=p;TM&{qnKS{5G>6dAW9P!=as3Z)03flHP}g~vxk$}#mL)$ftG z6&F9pH2#Y%{}Sl~bwR(nhu{rMen8KaUOi(uWvql1RvV$JO9EX@%fYC`5Wc&m2(Py2 z!IQR0u*`su4jqnyoW*?5+t~(D-|7LFs=$j;807vK3VtWv!Oi#OzzVj3!#*M?ea?e5 zyCi5`j3Tx|WCP8|%;4f4XV8u?h8J_*qpzw>AYVg>CN=5;@zfNkD0gs3C7`2P7dR8O zK!xuB?JIb&)z2EjvK?Vo*9usaqzRk?6}UaS3J#1)gz=^|=vtKq(rKyyezptfG+l$} zI!lQBNfjDLErRXKH6Um3B+M)ff~7~b;krKy_H`Xc54i|xuPQ*#XccH_o&ZGg3^1#G z32Cl6FuS@0L>1<+AJd0o^;$?7s)k1}9Xg!_*uzb>uw@P(M4vlDTK{)wMf_+uM(9FB z8VB-?NN7880*q3e!1FB&>T)N6IF|)S3uSQWzA{|6LBfr_X2`9)1axDsz`z|jI4C0YE|%1O9op94L6=LOB8#!n z*xsl>Sou>r#?iMYD~j5o`s`bz4IY?Hs~IYHRYG%H8lg!XgU+RPVC(B~*pjpm`_cL( zCOqnl?`u|s+JkLyKeGUaa%1obrt7e$%hwU1`Rc6x^EJpk^bD~#jm0AM{jrJmHRQ4j ze&n8(XHYlG6H$G3n4%?y#|s+3S?3L0vgqYLelmsR)1S+!$!FkAg)1oe&V=ys#@MsP z^^kS>41Q~#J!Cd7$6hv&=*FxIkYBR^)oH$jXysF=Ds2XgpWzOTUyZ}{Hl6_GnFH|c zl4`*1%%P$R~MOXW9`!f^R zI%x@5<=9L3T7PTOsB#+~Zsv<$gjldCS_bQ$xR7H~SAo%&jjYkP5qkMA96(kEW}i&Q z)mABjuSyYjP;j06;>0Xcc08O{T3N++5A#8X6YgN+#oFv8qQ%_7>mpn-Ihm-4^2ZWi zJEI!|_ePgabips#x(LV_i!`^Zx3S&UDl6_9dVogj7a?Yr 
z3g;TI6@D+!1Gg!Z?%h5V!ZqSvt+8e=*tVV*q3|VM`^XTzIUPZWzPbaR)6-!7lb!gL z@9I&z>@@_306J@+gqk~ium?FyM>&q!f|We7z_`C$gP>opqc;!32&`<4=$`#GVwC>YoCXzuo&#vR$ElRgQsNIUPq4p)CAf(60xPNJ=84Mc&_-H zyU;s6VPf>WS}ZQ3ioJY?COLDtGbT%r5he|V{Ch(eSe$paIkz}v?C4*+QRk^tOk?L+ zzV*$`u(IQrR(P)}CM-IS-;c{DeX}2<`Qr%6{+%zmt+5{s7Df{hD&t|lZ3Y=O=QtkR z3iv?EIXKhjjP~v>R6aR2hN`Ud$9=;&(4kz8QdDwrhiZgze-M-E7i+LN?Uj7+KF76s z#b+(6@jwle-l3e9EGm2c5iH?c80$W%h6QkS@M~#t&{*>( z6cJi{y2IapjgUPFuZEb_R5rm`2^b}G*q^!{r^e{}oZY{EBb%q~Qs_{!Ax%MO5z=+G zWMhW1mNZi+DczK%qVILYr@1sOXRKMk#X`w}^v&v1V;*-kAIR9M>9BzkNe^b`>A2+U z`?eg+`fQ?SsY{WxEW5zarzWJkr7UNs>C%==k>jDJ@wyOszi-Q;QblrE^ zWm%M6E-+eHpUjzj=xUK=SWjctS=q;osD!C`_rob1Lp>PgBm6&+j!Q54?S+ zF!k7j=1mIf_PJA!KWxiYw_U3^t)}+>SRy<$HjW(>9>mZ& zn!Ba4RR?ud-{^|J))u$Q+3g>%*^m@`7;@% zt>q%5Mr;@|`sY>J=p!=v+~n`bsPYGD8XgoG95+m{X*$}E#P^KF>Bcl2{d4%g+4Qf_ zigK+$$ckMS5)>g55Hu6l!~Q_%^)x(=syNWUBftt1eq3?^YuNZ>w#?Z5ei#Bw!1AzT zya}QFU>MSm(L0PBke(VHUyF$^-}pblf+{7_5PAHMe{G2W8S4LkxxV6m>L#h?8>muZ zitgxdZIOmhQR3w_$J=K3VhQ&Xri@rgq!Ni_F8yZ2T_Q=8NP{I#5=pScF<8DPr442= zBS(qEmFemENKen?!&oK)6G$>=wV|o$RC+LTwhJ?NOk~=bOhbp3NIaNE>dQ>!dWjh` z4-2rEZ64sgAU@D(T9~hoR|unz@TBuwwSsQs@f_tr=p<+vjwc^7x{v&0{k*Lr1D%5< zGR9xNJq?Ez-;uf3r=K!&H>MeeE>C?ppD=n&8YlC%it_cD9p&dEhfeP|cNm}B{EzdY zi$%*H@?lCTjSh5P5XUGat{#lN+$eEm|GBR6deg?8*{ye^0C8{rgB z?<2>__|x%eSLSGjEkURiEsI$h9~5W2EIQg)784VdsFe^D6B`;8sby|$Vs2t4^3VYI N@PbAHz?1&NkVV0zh$QTbia~{75!r;u7Fo3_1r&`tAd0wy z8>neokpnB z0fkXQ^6y;nEvOs;7XfO~q&5JQIZytOGmZlm1Z)=52@Ip8G-;?d+T^2bdE}6UN8-Qh z=uj2>^_CL;UI9{8(1U-dz?2gnAC?>&5tkGk9+DKokx>U(cyL&JOmb|TvZev3a}Nre zG2v5W@Lz=dWTn}<+?~2U+&hGFKD9CRCy_0kB``%5^+^Q4X(cbuQ@TTS<#xivzN(;Kc(fYU082F&6&DOBQlLKcr)`4$? z39u=>8)ejB;Hs51+!AOH9@`HhUb#EAuR$HKwv7PQ2V208if!Qi#sqA%)tAU&@)Sar ztIrr{t%5ePv-sXD27@pSLMDz?L|9uOvAgjJ@T%j1eQ&*x`b2~!b8CV7nAf1)_Kw&6(US!=U|q*q>^nb4plDcx25Sj;c}5$^t(pf<8V-VL z?G#jzYsmy= zSM-Qo2j-)_)i$u!qy{T#e-A%De;;!^JDNErD+QJ5wqt2Q_Jl=QHWp(o#m<35Fts2I zeA(|oXr(U$7N_eN!@q}c@D~g$?{x>()2Cs2%hW;PsYUoZZYOc-#0(;&GDfqgtU}v6 zIshI?xsFB&jhTC8rtn=S9}`VjftSVyq10FI@ba%WhCj({W3B00$T|iebMo|Cm|kfb z>O0@xgu_h>ppAA8L(Tso8GD0dt>~#GOZ`T}x^Jz~{M~F;!gw=AwMGTHc0+(xx@ZrV zvyY?g3;Id_Y%vkTD*^(GrOXr}1ysJ@j^;0KV6M?17}YP|;Hj0%V5edtNAtcpNSz{K zEV<fpfwSlv~ zyP1lR&{4&onScp*9up%b4fq7OL!zbx;Qa+f*@*SAMZZ`gM z7_mwn48w4&y{w1%(jO2D#Z$qf>Ud^X_IIFmQak!4rwcpoXrv+7l#koazKZ<>FEIiI z-l)jL0~?GXKt!{?3%E#0Xoes(aN;hC%^ zy!HdQGb+VI?pKYH)>beVZ8s!b7rCR5lo0&rYOealG1HhX5uW&WtV2xsn?Bg{bviQ0 zuh+J}vI#8dK5pbCm;iYNt=O44hcM~-2XM|voOJw`l*q0dfbVw5@z@cgK$Ps^(qGS@g-oXG&P@4z)pFKyhZ`mN#k)yLBj|=-=Jw)#p ztYOW#p^iOklf#iWcR@oxG3pBv;$LXbU>wkPg^`jp>Jyz?MRK!A3vN2|kFQglZaojD zhfh%fZlq0Z*l(90bR@uwVUXg|nK)|+KqSv;e|iQ+YMBPq@6TAp)HL?k;Z&HJ#pc;Y z_0|`zTB~6sR`5hctJjS%l@tdw6lHDDwGKVELv&!xCjH3?4|*F8tleVhlu7c%2NgME zJaSE?jR)6lHSsO-C=ef7zinJVRa9T&p$+*~3mX-D=ffKdCPZ}?OPdaF+BHdj>s*2J zkLE%Pg?DIOl+Swz20*S74uIg^j~c$%Rd%epUKj9E8kHWb}rX!ertKb ziO5Sup1a&CcAkvxt&&}9t=M%cUS8kMa#uws($Y^)PjAi9pK~;ObmDz2GjSUgJ86K5 zUA;M|*8Arh4cCUJsjqi0&Eup!J>|8Rig~YV*W6>xY2H&Fcq-xXz{_#H$3xq4Eu(+2xLUpV`>NQ7zgS*BE$kHLQ6`mj6`M!{Y77t!T0niA5sOZ0esTZ;>cf zB#sohiA0ei=Sbz9m{OQQpPWS^PkN{Lp`GHC$3)r#?MO81i*Z&~lc|I2Ob?oOPNi2@ zdZAp4L_YK)meQL%d|}T$=Zy7gg!X)K&RJQ+MK|5P!RPX~4|* zzyM|GRJ_@$`n=|RoDbD3ivE-j-Ab`M+vZJ(c~YnoZGA>qR0L z`sf}*_dp)#Hf3?B?NX{I6KN%?$*FYzsT5U1q3*;=ODc1!I!Z1TiBnZ`QYK4VQ92@J zCsjtuB31Xw(xtI($y6L=owPmWpE9LKqtXOCBY8-|lH`aa%P_gzG9)1(KGi5CA|X*0 cA7^A^Ki+1%HQ&bofDeD5j{5cLOZ|NRH{sIBfdBvi literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f892d384dcee4c386d866900d99cf672068e5bc4 GIT binary patch literal 4860 zcmc&&c|4VA`@imIIU9!~=ja@SlC6}bsE{1HNTRZ}SVPwAGZk8tN{VSAZA95aOl7Gk zN{faz+7~9YP1{Tp>3yDK>YaWw^T+S=c|Y%aInTXb_w~KM*ZrK2`+Ho(t_A`^;3Eid z5=aYR6o7t1MnIyG^~8rC6&vGskz)kU);Sbfn--)Ve;z&LH#i0zHP_R@#J^rV`m*)S z^ZY9d*9btMAW+m5@TfnAfT3&*+N>`zgsd-K03fA}Q6&%Ic-mBB&38yyK!}Wh!4nDD ze!dX_OM`u5f(4iYPoW1XJV%;oNR#zX!9=>$u0enSRq&_^fa<3pfrk^-rD<3Y2-pa4 zHj@C7%K`@VNK+4&%QE9p?zG8`RCJ7Q2nbnUW`r#ChmQh9>aU~J_+AvXq5Lr(rx0&-8#5=?E4n1awx62ey_P$n!0rV>u7Dc^O0KW-FM} zwHy{DssekbJlt|FheI+6prTieu9Pby^~Mt5Z?*^Z#w!q2I~5{cDnNtGBG|D^8MX}l z2sYV);8QjZ+7~n6Kvxxd%t26dSqg5;$wPC~SRnRUgMQ6hNOe#L=ZZWKl^DQ5OdIwo z)xhe{70?Sa;ijDcd%WHhGUoC@bkq(~2cDqiadJ>ejEC@4He~6N&~oe;=&rT}myZmn zP1gaL#_sfC@!x4}e>kiqehv0L3 zE9|x_f&;V7;qe{}4JJQ@iD_eCT)83q@_H>CzNH1@BSxWj2Y*G10lg?z8$ncrI@oS= zhUkj_`R+GXKS02 zB{>~Xapof$2TqvDIek>@z(w<#8=z4$8l6q)#Mac|kdf$(b-LNW3*)z#u*?oW&?F7U z$J|j(K?^)c+XK~xZu!qyvKaJ!A7pAW|4IfGzfa_@kVJefbS4-+()5X*H z4L5U0Yg&fAttZj7Ip@L5OoOOZeG4`Gc&t3t8dR(uq2cNnoS!j>G}avhS(`z4w6p?n zGkrW#U;(VSG~%Ss4#vmK82GtmE$nHnB6AOUWBY3iP`y?iKIO!(4RFESNu*I z+=e!xr;cY)>?k%0IBiCrj9!3|+KVwQ^J>!nycd~Oy%6bKS0szuKBLmnZMbgfcD&xv z4z296gIzIc_~^ymc%tkB6drO9Tk&!O_H0=)Q9H*4v7VY?QL-n|=ZD43(mxm<0`T)2G=k6rJE)$~q8A5Mi6qN{hoWo9bO@86AIdQyive1C(WU_fUk za8XmI2ljA_kBrr*3=I1r9OJyd0)fA`qYsZm39N9X=)U+J>7>v18lsO@Rrir3|Oo zMXac|No<#a8B#rU`5bbcG|1ot-d1{(`Q9B!A4?N>UK7de*nR`*r?p{?nH_kgxtgrT z*4>2RyxX{^d=6)+XbGpy)LuYZFH)VIBrDz9#3Q};O0hG~o#Vxm@sLpU9g2xj;HbKc zWra(f!1H2qj~2Z^r*yZMY+{ru^Y>o%XX=fHp}cV-*{hQa(y=c{^#=y_NQCy8|*@6 z)bHKs=E)RHdG{H<@%62+qO(#htVaP8=CtAuVzWrk&3$P87=mT~$&=jPFo1@#qlj>M z6*y?Rkqn(%g$JDjd~o$yIQ`TP?a$4Y{c&_OtF(48?it30PT69#T0R}Os6ZI!xhAP} zp&FapQOXDRvmB#$e1?Cu6ROwwgtjzqVr`yZgvFl?Wjr7iF+YwvekC;)Ztg2V9U*>D z_{l~3_RkZr2gc{nHPKq^;#!f^W<@iSb1;pmmbF->xn~)Y;@6?I+z_~y_ZZ*BQ01ob z717SvAMjNL_t3?d-HeH~u23Z4vGb&BqzZa-fK~QcuGY1PHHEVf^}l|DIZbSa1aq#! zsvQE<{o6Ba;QRp0xi5`hYOW$=_P+*IH%qL0u_p07e-7gi-v$M_CXv)NPu35+A zjkC|T$XlB#B{U4~I+M3Tr6_l;F-H z&$EZO=1z-xd?Cl`NX9z3DD#lr_Hhn-tc$Y> z9kUHXZl5du;mF*BmYWo9N_G^x6}c9BwUq2Eb+7bm+hbF@tK9oUbpP#^(%nawo>|LQ zvMtN5^1qO!s-amIv0wV6HaT35I3KP0e(k~lm z0>8g|@#NOk2eVQTDKp;Vr-$P8Y{BG!uAd%RrO9bH7kG3$wn-NmMf7xaJh9t8)+S@T zXJ?PY4xI%R0o|Rwj@kYOf$asJ*M6R}cShXHp6+W;=N=TN%4m39@0(ZPuyb?H6ia`JRy@|IXju(R2wGvF_4=qUANl1 z110kxQJ@AdQ8IB+gPY81FT8`2$t1XCND+A@YRx)J0^OcN}qA;=PP?nR}M>?)bw=}IOdvfHK`rA zQRZEr#5Mi#d3R0dol{#)kH5UvuySyiYj)z7N9VRE&B`!4`KI@Bu4%H&%({2|*Gs)m zXUwes^~K%#m7itI8$P`5xs*F=n|b5M_ff;|u{ZBA&6+970&GNI*nfK!XNQMG&{`uf zh6oFZiDd?c1=81a>Q0^_qp(X~p-5LVTT?TgNsEs^fBpXU;_V}F{wkeV_0m8hA`l}f z)zQUe5&nqHGxPgHL-f&KV9Nd~H>hP26SBVe&~oFiqm}_W%3r3 z3h8btCjZY~C50Me1)e5LVXq+{K%^{`7Q4F6bvGG#vxEl;J=%=KmSVAZ z9`(@XC>AG(ErZ0iVsVhzDo8SBNjc1+TUKJR16@=7FE!PZG-K!pbRhBE?)l8_u3mr(Z%w~eE4Bs}T$RD5bFeKaGUAXJO;jb0HK7(2;7 pDr%B%baZ5bT6|!1Oh{yent}0TgUR|LCuM-IA7~^1T&Ul@{sjY`8d(4U literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1a786300b7ea789c918d25ecc282aa7737b43bf2 GIT binary patch literal 3554 zcmc&%d010d7C-mC_mYcLo_o*sJNJA! 
zC*gdA8%J|~+!!s#a?l6BwjgJ6)wJs~Pv+Az>f*J1r#!^yP;%e0z;n$_KNtRC5M zz*%w%7f!~Rb0SV;;|We|uZ)82l>z|iU`%Ei!qgf{)sAd9TIm5IbEQHOr%O+mk*v#3 z=CE8QZ$|PnNR_N&-=s?ELVN@c1DVXp0zl|kE=Ws!5PVkC6i5)z5-|;oDGMg1tdERv z3B$s;2%>VX-|t>4*xnljOZ~^eU(K&uBMs zc5S7W2fqz&9oHd^K8qVFj-khn=dgRmZSkD8RyeV+3!T}12f15i;w>2o@VIs-C2{g) z_OEY)Lyb?6J!rA_=W4Vs*c44@u7?Iy7HV10ju+NaP&Q{OZl5?7?s`7JijM-RtxYoU zJQ|5=cAkLmN-E%aX%=PYT8MqTMlmU?tVOp^RwK8R*JyDRuI!?!CY`TRxh&n2B8N(lVsEF0`9398#j-;Ab9XMzi zIy(yO$?L)cRQBR6DNd-TRY~!~3+VkB({cWzK=k#~D;D<_osukRiD=FxxWA{(AL-l9XcVhCP12S= zr{H+_CUJ=_ZouAm9AWsiyG-BVTsE zK~!tcF>~*a%gndR{g{@31d+8z3F;U-naS7~q?~lCl1fSjbah$mTmF5@FkYR8hjw8I z$O)3YGdxgot+q-^e(oA&+%~Zb*C_q{cbXR5SI`188QVm*1IR)>`#VB28MwcU5RFn= zQV5t;_-$LTkYf8dMKx|)SR^*L4_fE9eNnNA!acdOaeL8+X11CV1z)vzX6DuzT= zCtqydwXD)Lwz)*_>Wb#{ zHVYkBY_Xn~byK4UxKG&T6P>1dbvMy>b}P{b{rc{;(3=OWB&wXW&Jg4g4*l@mfMV8WEtE(IHba5dh?p#uT^22edc2>%DEaE~tTw*>>z1&<8`pvNS8VY{Fa&4zN?nG&S|FznS?lZsKG`dJW?$U@^7e1Qa zvUc3Jp6QqC=TlZ;U0zw&&u#Xv47==|ceksd$ZGr*pAWnVq=?bPXy`t8NSS#3t z^fu1l5n~VU7Fp=&?<8Ry0Bq| z@GKwyoR3u-2HJ4gSYv};E=-cNlysrqCT7#ADcL#V#MDHgXpsulD3xF8FaKPjsfBHljOB!$dB`S(Jiov7#IYdq~hw3kW>KizpW$%U+NPSY^NdZ02FL5uTXJcJtazh z>m9J6cQ|AJrh;qsqWq>NrYGeX`-$5~{sU4Svr}EOB_;P&^=~lxI;_f44^W7*+z|+Tzdr?sIg!>!kfs%eXAwFaOf2+1 zk3&DA-&W9XF#MljK`v9;D7F9N*6L4cZ~iaItNODtNxo{3T&7~>KlZl-P}(SDd_=_f zNN;0eDYntB!tmfVJkL)cUtxsud_J#9;sba-iT6)3oM{NdIAQ0{^TEPO=-!o3hHbW> zK~Thx|6qu#s~6df4GR)z|9oK{D@;T+&ufKA6D^R2dtNQjv*X;u+~Ojq1P&R|y>!In$pRfB zOh)Ji*Ny(nCTE%)U8F}wbW~VIOq2mSu{XhpFJ#io{*Yji@dtl|P--$00;l8%7ksc* z;5RVk_u8KjY=}1rHW|I4=lLB^I|iJBJkd`?g{Lucg2Faam-T*b sV$RU`%*>&>tgMWD+q}fA?39djTQ|>PZo||{tqs7757g5D;pE@luYn)u8UO$Q literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bc20a7699b64f53ab5cc24074d8f61c1e997b00d GIT binary patch literal 4872 zcmc&&cT`l@*5Bur=?Dsg3?QJ$h|-%zMZ*Bnn`ori06{>ibh}`HL9r20P(g|)7Kl=! zQlyCm5Cd4iC>SiL#2Cdq6;R*3Gr2Ozgg&OlOp5Jrz)9m1N)uYmgYUFOYN`5U7^|RV|ul$X>sWNr$?SF zyRp0UY-uka_=pIt zvHg6*{Z|J0MhEdRMV{g#r1%RGijk1@PtF9oRMf!7fNJEZ4nUHH-RCR_1Z)HhHj@Au z85WF_VVTQPTC~VXJKFS%xRqFAJ4!6{hmInZ%U@Ti%|CL%q`9P%pL5Zc@sC&)8x{~A zv(n!;##bho8U*{VTon--8x}6fE*(>RhlKBN;XYjTe}Vb8Dc7s>ThyKSzcD3e_dxbd zAJZmvEvzxy1|{7h=&oH2y7@Zr@D>l=?9zf~&C_Ak3~ zcr^}#^cTV4UH$=Xemnt;z&UW-Qve0O$-#z$BD5unhpiFJfrc}NaP_b)ObOG2SFRt? 
zjZ%HcP*pm{m4v-Tt?Y_|i=+UpQmX$s-LDniY;Ww39xD(o2j0TyHhfY0eE z(7v1jN4n3Vehz|)HZI&9FAt3klYz*w2E&RWNVeAmm(qL?oHT+wOdE1kDqz#6Qg{sW z;I=Iv>;HBx>~d8A!S}Y1{QMzW6FVNx5K|#6nGG4bBs7iElZ7i?By{B(B8L>6Zb0AHyLb#sjjF3*l4TWjJ743`ZR1 zL;pbxjcgeJ9q}ZXQeq6hy-I>(cc#JA@CoQm-oKEt|6>%RjUX~y6Kr<7z@pJKv_D#m zr=R!;Z%a+lMZGL8YlO*zS9^&8$0n2@+lO}APY0vhk0Fd` z#cDIpqnHV76c&6ROXz5a=EQrbC4UeZPmIKJBm80QuPGQ?dp>zGy8}w=htU*p!sc8u zL?`TJ(89(Vs1-(`rmdY=VigX%5|&~wW)5LWr)}{g4Lqng+6;Z-O!$-@g-!JqN@q938ZgIrrWhW}uLfJqcW}-4 z5$EZ%S)>BUz*igV;N3}kknvgop=x^A%N13S)>4PxS~MTT4Xd%CY7%uhUxAFWC8$z; z2qI-Kpps;3P_uS~nj4dFt!?EXyI=%`>uSEVsf4RZpwJPh9(imbDGw^Xn!vHIw78QxmV@o0rcdbx-ZVLk+#~s}K!yvRA>D zXLjU-t?NMdd<|p#9fXGdg#-8&f#KKlaOHJ8@RHBwjPl#bbLGyY@7Ykff>R|5jv=1t zc-%cqO*n5K~566~j5E$oP*1sTab zgYCU^NzP-BB^jzz09v{U%s4U*&VJgDE#1(-OpzrSDrRC7e|8;e7n{n+4a|V}+3t+B z{Wi!mHxsS=_7LpqQHSVgMUJ}LWR}6+-5hRHHMYk46?)(>i#7HB01D#ha@xvfGMDU0 zlndjY$15J|pm!I;2*Hhe;5IKA7C$?Hw?3>w9lmcNFbGin3>nnW>4`nq;WN%^!Y(ZT zu`$N^{W=7^Z%6O?LkO&BgP?c*bz+a3CdoaZiu;ZFV<^R)75H=p3=TAK^zLC$=XRJW z_VrUMo;wXp9=#ytP4=RErZJ|XvKwcnyac<$TVY1>DTY(bGFIf!X13e&+1y7}1srl4 z560mHe(lt4=IWOP!ifl&&zSY|D6s^RZ)Q!aAvULSmt0F?G3;OQjK%x< zGSOt?&+xaNi1pfPquUF3SB`OZu$g-;!VR{P_3D(eU9Q!T|WA!Mrnf>LsO^hAy zPDwkacs7DHskjJ!sfb{zzLo*4od*c-f;POR$AA%yw~-bOTEyua1a3IAPq`<59y={E z1vTSU%o(n~qD_N$A!nZrK1ux}Fv4Fl6Q^8;l8`9u% z_n_MswqmLW>J?_*+zD$t&uBPknINU?%lO5`MYz}Yr)cpcf;Inx7rCeAIU3E1B*Ns? zAjs-C3UNJ)2VMgFz(akwzCH)@H0tL4FfoetwYM_z3S~p5>&Y>2JLe?o2=)WB z8&1UCCo{0VnU_$PAPH+p5^%RGo0FV8F;gRB`MAbMs}WbB3MI(|Lsx!3zMr8kldPbO zGGo5OHy*u@TA~jybSmA!!sj6)pI5;>`ZybOPt4-ho2!BeX9;@t>NOTN%!T;*GKw4b z@lns)LG06t6t;6O4}Wzf5>0w=2&pf!#CnzsiS-K3j6#J4DA0W~IWn6Z?`z3q`&_!q z(yz`^=>2qs!TxZE9r{QS>x;=C^X8_KBb%C_Zon4h9?FtERk<8@?Kq_X9!(sxHwuiR z4{p3-z8sr%`qTJI_hOclP9r?O@*G+l&l2NuU(wf=dLBQQr^~PB{O86+&_}C8&_9Tj zAl5tRX zeg3voSq)3ElBnR@GSyyP+_3x4K0VOJI3ayjRmM zL))wIXxewv+zRZnEsM5i>Ufp~^)wdkIAGw@C>B^9OV2h5>MZtZI=1uBtjPYW*;dDQ z<;+cZ{nYDBs;m0fTY#KkDt4P}l`5y@ULjtnX}eovL568sMw-KZ{S)>3?7M_6Ip&)% zzQavz_adhp%S?w`GnSMrt4-oN-q!W5487l&>3Bzfh1>e~!=!H38Q)9W1#?q$v&#b7 z4$j`d&^uTWeDkDNb&B4hAHwccts7zJA3neKQA_sRRQ;T)*kA5m*uXT%ty%Zxc~5n! z!I6ubMu*>zFb(tS6A7-ex!5qjF`1__XCup~;8L1`j(3gNsPOVm)w%0OS;j}N?w#s* z&^*n!s3lW(Y1Kxy$+5O<(~!G0X(q?pbIdmkkFrgRf6TYup*(N9>50xFr$g(bIW)R! 
zs|0;c@xmgPO}1vaH7A!0E^ppWqkB~jkDSX4xs$kpyLsLp=t(l%F?8*?(=>X<2Pt~1 z1l^&0485E}x87k=(w?(s!|*b$1ignsuPFKHT;#)+9LIB|_p1|zp5Jdh_kCa8R-CKm zbpFiGO=5YS6>aCwKD@F=(=pfSLfPZiER&GlwhQG0H}b4@s5w_vJnPaaUUX{3^{UFj zJ7-q4<~mpZ@ZzWQ(SyC$tIxgctKY^|cd0r5>fzPzbXK<4To@YovC#2|OKsKfgWcsJ zeeJc?Z(q{z+>536fBOPn_u?3Qg8BQ`f55+Q9gxBYzR9yX?ooHqbfim)UOV#kPv}j3 ze??cTyAb_;sL=7aD}_GX{i08bE^ISVJUW4F{pjB%;I`)1f)YUsoxmFFDA*S^vaAWIr>K(17s37%9i1Sja~t+&WXZ-;81* zfA#+lsDGEYAl(d<7}2YO0>XUx1jWR(us<<6Emfz}@)i`c1T#$O%i{AHQspm660P^2 zW8q_bhLbsMO(;o@h$ej)g)nwhTFR7v4cfm1_*ZVLZ9l<)uLNf!RM z9wo&m#)>>4OR>UO0YIRt#1px@yL!x#K3Ph6gaIvPB1@4-w2=DJ;wTcui!1|0HX>1= z$SP2>W=SbH(<3X9$e!-0`B+cQB*SRh1MNuUx?ab?;45mdz{QT9TgB7u0=l7Gi$qRz zW9datCHo>ndLH6u>|*5Su_V^tW_E~|r+W~+ALd5Iw{8v9Nc>qzoKQhfBK0R3(tBUn zM|*pih5OqEihOB%2|Se!C0=85_iw(T=MHouO&Q<6G` zw@{j&!{RUFp^8PxpW>lQX&LEnyCjC*5ZOD?`Vyu1G5m%0l6q6crev)2B9S#cv<;Lt5&&-0e_;OxMuIRh literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..151008dc482639c3df72873480ca796f9ef31d42 GIT binary patch literal 3553 zcmc&%c~leE8o&3>OlA^-1kJ<*QK*)P>`GBVOPQd68WnJX@Ser62*@e~sE9eBfN>W$ zK-7l_O5LgyZ3Ul2#HB7M+M-xXU1(LbuUhM~ty+6$Qmj1nk9W>Huiqhe`+nbk&wQ6) zE_5hMvVQCsEz7W=Hvsmzt*PGE9rjKoHH$vr_uF{cbxzIf$k#s3By=9 zt8~G587pE%w!?uV(~VU!-6#Nn ziYBNy1=aCt=1r%(ID%JCG4qcIkPtx|`62>G%b3*Y%;eY^pL{%#((g->Rqos^pO6?1 zw`W~N1JncQ%?Xb1X^V>BJafq%sWVXa<3M=vNr!di;2@%?b*A_u_`T)Ye#hvRr=Q9; zJii959n0WQd9g?mc9%)+V8pp?4a|-yZ^1*SJ&@1-6|!`~qz(yxac;rGjy0&y?FaO>v z5L%OrB#+L5*k4-U&o>fDR6S4C;d_=`6+DQtD6u7?p2i?p6w1Wib^>>A9g(Bu04#ii0-h0jI*9BCX-?Oxi=EIJnE(qO-nI zLM@bnUIa;ezO#*f6b7i78Xs`4E|qRw^(AN+@j3cuaVxRc*WSuuc?mfzeo)<+p zOh6BlC}8o?Bg%9!o-k=6=u`PMK>FTsGGox>fxrXGE%)>THy}#+fzBUz9PCa^M|)R}g7xcVmZMwuQ`Xbmz_AP2C^aM!P8^ph zQGfL{=p2wWIBiNDGA!Cn&suLsjhz*U^jUgxPXQ}EH|Sk@Ozb%FOK~-w_Def#JD7)T zOB&_FzgQ0DT-a+r-oXIc)CHfGrjf~g27q|~?J#lj zJ|eCe5G6O=z}cKlD17K;>4CoK%zF_wP?sbI7py*n^ZP6#{PsX3xu>T3ovcOkRd;%u zT)z{y*2bdfS}klC@(eC*TFfk&T7$BVCW^kHY*3VB5OKE90NOU~fUOBp!0Viryz-qB z`et}Dyr^1$PAyPbEU_6)Nh(aV{puOLnmT7e3wb?UV3`0eZn;6M7ui`B%5C5}!x|#D z>T7r^qeSGqKNR@QyeZlut+S~5whXv@IKrZFv@P(IOow+K{D_i&wE)?^mWtf9ENp*# z7d?N!NIbDaN<27`2K!$thjvpmsC|Z-%#lwNZIzFOaiK=)i8p1f*U;jb%~u$=hEjRQ z^AjTRvrFQnP6hhLu$rnEwTOB$|0wwERv_G5UTU>-{|sVs>rOccJ1X(|NiM2>7A&n{ zWnyO6bL;(~HH_A|33Q$40;ikyk-auQ;J<>k44X&0;NPF0ZIUmolH@NwrZUoAjwj-` z^AP${;Qms)$)ic}9Do?5UqxO%!Sr?vZ>-2)NXrHWmHKU2P-vkX7Js>M%fdxg_8OCt zt1MXD$1$`f{77Zt(!L&1Cri1lMa%n*Ouv2k$ks(G?fe!nDovHCcu>%4M_p6Z;x&VV zD}%~3)l1enhu6lpH&rh!ai7^_Qu%LNR^}Ohp+ zYbbIpKahO2e(n>I+s04lbe<|3wa9H#edfI@2j|i5n-9%}M)?@`;>w(5Zbm@HxLX@# zT>}p1R*8+$!_%HbP(~LQW8LgW42h(GXA&L|QVVf8g&BrrYM}HH{Zz1yVBI7{2kodWu0k8t=)0 zZ)&_&FWPWYY)YKL{G2ei-UDx`Z`kA3`>Ogk7=2x=s%2h4DaweBk4@IIBz|=`pqCym z4z}7|ef)4s23|%IJDQg!l|otg2M)p(y36BWgZ10<`US)P z87xpRDqEHIfBdpi|25B?|I6i7|5Z0hvD!|-s2N39fBc12*(#-6Xz1iHAM?dhR*>#I zd2t$!<3jL{rwJUF&1vGe0FH~}{Nsc(4F*i)SN*buoZa0= z;>FlWLHyo7n;*yW16Iv(T7J;z_^rU_JotTL)UZiIqr#?V#sqjL>cT_gd46&*_HXWN zJP7*y1tr)COlEz;lIQi1&xi=~N{I=K=8qN!Ps(oi>f%kLL$-=z{BJe-iPT z#-I=LO4Wr=N{t8?LdW((%<+w%+S4DLEKD!?!>3Y{785w#z%#gUS{^Tevb*hvj1%&W zlZ~l6^&B^bUj`=eIY^5L@SYtt>;s&WcX$v^ayFlT?4r3S_)H^6V$X4On6KkB+2%S4 z&hkfF>8aWF hS+VIE38^XeLx;N!b@NbZZ2@@k2M&<{1ml11eg_+(_4xn* literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b485d3882928aa5ca25ac11f280ec415a3432930 GIT binary patch literal 4311 zcmc&&d0fp|8-LDkxl8M9`Ca#_AuYEeS}5z7?k(*|p$+ZYyJ?Scg^40jBgztGX`zU+ zm8A%cZDflv%otu`3`UmJ@SfjIhS$s=@8|P=-Z}L<=RD^;=lMS0=ltq^%TwxQ%}`7b z6XwCFF`x}#)p&aDzJ-&YT<8v{NM4#^?y$LRI6rGhcKO+pf#1nW0_PWQbvXU@*XEzE zTr@P?r}08}IRlK4>2JXZ@E3>S=-YuAuP=s(*XIQQs%8Z41&AcGC3ccdq|Z=dTi^)9 zB7R74d{|&)a8e{g2nE8sNcby~WFrah?}CXfaMr*OfI9&m0I;2j7+();vNY@|1bhS} zpGyIy&I1}h9Q>%N^Tw%TceZKu4s4812#9!nJ0c!>?<2&dK3~O~?@1Au)X6uJ68Zxv zGxaceXi8jod~#q|aB{GE6rMzd1%}FFQ{v*4G;0XqGD5P9lB_26{|n5&!Jvq;X7Tv3D0JQL_>F^AB>x@dd39G1`01(UTc zAph9}z)}aEX%LY6I12pt{sEWYRsttt43ztdq3o#u=B<;W1&L}zw0I0O?Y4r`8=S!~ zZYVsP{1#oP83qOVB4plZ4Adh_;K|*=K9hpB8e`y_8i0E|pjzx5`p5PC5hK^PIrwfhAlHDS+E=;~_oS2a2+`z`N9w zu-3T(Ho1+5JL?G4z2H6!$^HTit8L)vvqiA|nkg8^YopH1FH!%ndnnlqK|;I{IIWxn zi8WJ+y2x58xAHB#su_Wf3@uXS8M?#vLEEXI{+qyfrU~V_r4t=;Jp=8d@~Nu8Hr2tF z=gAeW=3ssG9>h^+h{J^kP?zah!bbj(n||>;v@E=dzAJr%Z2Bb-CGs%nI<%GKn~kTd ziZ4RVu{X#NJcuzTtWc$^I-1hl2!|z!=y+xuvG5QHdFcT}oA(5GZ1ch;dm>)QYlK%Q;8jLi{8-u z1ayZYdea*>LTv;=rZ5fQZ1fshZSJZ(=om@sf;x=)`Y61vas_q22@pGADDh<0A;|ge zD0#(uJY+Y`CZ0FY=+eYfFm9X)RX^}K)aj-Y)me@(z|kEVFML7j=5-^J#d|?>LO1*z zSOerZD^kwb11}|;IvBK?^CmwD{?)Pw)}5@S*KY|RO6sgpgXtl1c=;dXjI;O2@ka-7 zjj~dSTFYx(#Y{W;822|i*2<4O4M{MjI20CixX{{}8DMdsk)w4Dq3172fZ%Ol_0^YT z{|q(o?#V&--H)SWZ9WP+I*z_!X@zLBnS|;1J#^?Pf7-241OB`o3Xh*+R z(qhLda$dVL988=>G`KsXHA&fIznRy`bj{l+F6u{O&TmVIN3$1D^%Ff2@BTOo46 z<~4DbXwsaz7hP0ZZ3a4@Jwjb@e=wwt_TtRBml#@K-ivVK@IdA#Iwv_ zsiJzkfr$2hhJJDz$us`(K8jSaP&vEDjyrwTLIJd2FycYHIT(+B8FTH2Gyj={*}l<;6J&?>diO--)4!ZS%yp#-F2B zc^c8GYxT*Ho-hI}@!~~17z~f@H>n)gJ_ScTH*m9qLk3igHHG1KAJaO+^HC|+hS1Yn zNphDw0hbM#FgR-m$0KIuUI8uVf{&>$(Yb;nii3m>3^WMkmhIyv{^ zsw>bivXwZTf05ih-ayl2#ahZ{$_>&_r&uLWJWHj_)P-yv>%!2EGX~yk640B3_H_P< z69PTkY?PEFREbkPKu+ygOGZQj`e}KPhJ$tE6 z!1SnCFqs;Eek@aLSP{kkvczpW&%g{15^vM*hfr7E(CY(G}Ic&Vm=EL$Wk`zA-n%&Wq;sVr-GKdX?_Mba%xR_KjN ze9+#sW$DU+_KSF8+1BiQBbNd*zvitu-zGD5zggd8;9m?}Ij!pml!S&r?WN$~qSR;(&@$ksOZKT(mt#d(#X z>pI8Ef^F_aHc>ZDRDN4Nd9y61|Aea5mEILzmHsVNg*$w9hqSJnuwzYiz=6b$8!bE5 z?g~7%h_C0gv#2)obb)!m$(_Y}Bfc+lUGG%Au0E!HPt*@5tJm+BCmd`2fbMt9bb}II zdt}XL=s6p5FnVP>qcEO!{^mjN2Q2Ow1z2tSK?&xm1fvy^l^E*!Aq-W!{%TDh)VYl; z)H(&|R~GcfQCCoh^=JiDv?&0diWO=t>mAAm&5gXeEmUFZ9*ZNq+!UxXbPjfDSx;x`bua{rbi8SDHcg3rW6kM7khYmZd7TOMWc~ zam>*%pI(=)73#D?Z**$sYppPsd?VL2BQmtZ-3o{J?Mv>`p6yXQJnCv)hEBw!jbk%j zc6RARdX_rm2yN%;MtN;>FSbj0qZ>W7(z|kc{ahwydUe3T)E94I%5eD0potU0RA!e*IXC$1giG`bBy+X4oiz zH&s5B9e(U)kiwt~2i&(Sus-4K3*z?|grsw@K11~CN0(2f7C$a3p4F1!NmOi9QZhF@ zHk_4VSaW4+2y4uR4~!(c2TF3ethoI6N8&Gql_%rq!Kf%zl^L%=4QND#6Es$p?N~_$ zzn6z0@5j%TBtD^RB3_>_t0X^P#ZDh7$*tCsJ;VM^Nos!3m9gRR5y`!hxd#b5k)&j( z zf9zEfHd+b=5}vTlTo*vBFH)0wc}?~i)B9$LHdB^tvz5xEQt1@@u+3d6O_RzZq)t+4 zgw!EIxhBI76WN)ARO-r(*zOy#rJN?Q5!gV|$#aKTT8_e#36oq{+aZnZC$Jq3EtPt( zoy?ClmHSdF){Y6WnPeT}Gd(5DX>^RAuU90yALohl%ZSFEGMyZ6cY zB!3^<_%P=PX)t?U36JUE=GSL?O`FErZfw^}U731sJ~4PrmK^M3EBEuAB=`4KQpb2x zdil9c{WKq3EZjcIhb^TnAK|1uk=!>BRh4DWowY& z?=(6(#3mirWCZJki=4*RAE(%RDR@n$Jc%>M?C`#yRF>9zPs(K3Q`kqUtYq&YCCT1( zC3U|zrxc8%I7|U};(uxU8z)yG4gdfE literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0da33db3c308003647b082db9d88177f6ae33a22 GIT binary patch literal 3765 
zcmc&%c~nzZ8vpKn?bU}~p)$}v-dD@r~;I>zdxmpQ$?rsM0{*lD+e z(|h;ap34EJy2kk9?_JNV33TAdP}_0AxOc3kfGl2+3<{8YBor5-|-{ zau#HAc9fiG3DT{z6HH%8Tg7(ms95?J9VLPC`YPG{1qvomPGYBa`Xv;NJi!>BZAeVZ znx3G~(#uoGASGdXyfHP~kY)k50xO>(r`wd%BFg^1i21vauiJ1x*o1S3NHGZNN9U-llfcAHuQEaV>f;RE{E#Ho_-P8R*!;3%H=3g0J1QXmzX?Jn(A6s@=iV z=4Kg4*7bn8?I+;dl6CN6MF!Q+Egz2>>CL2+*@=EQxgWWw9HBQAibRZNDjqa?KNEi{ zmZ>`W5W*T&u(>@9%Z6hrn`?mJ0ngyH$4%+odm|WIki*CYN8njaD9CkzklM!;Kb%$% zOV1pkzMME3N}6Zlwgv`W9DfSR_D@C!`nExx?OeR4FaY`ljDyC{dsAgD3^*?S5Uc}# zg72p91!|O*GID;vW|z<(&sZyJFU^G8ClM*b`6!P)lr{9Cis-cL;k%Gru)KNa5>Z{viS<3wjU|TyAllA zF>g@4a%HeSvxMq7^(r;b`deg3`3BGVM-g5%ZwkIYGoL;%J_50KM&Wepk5Ny95rcjQ zZ2f2^ZRT2-k3JmF==Y^6s%!Vyj+-5W-ksZu`#AL%Z%WZ1bBmhdhb^FMj8k#W(_nP* zr)xG37oCz6ou46jA3cz-wL2jWGg-?jUR*~unsVfBTPl)9-)9ZiSjmFR4Q$QCeyH)R z7Eh_Nl4J}WEUL5Kga3BT5&tE?)+XKR1H69PJvMrcKa;Ae21l29;{;= ztO3u8{U^E}Hi8}a%^j31b&;Oi?Aw(U##GL)U z9-Y@eg_y%@sU~MRYQ7MIFIdO-u$ENgt+za|^vP$C_`C%@yD^)_JMz@mMt?@Hi5S3G zZLp`}%n2A3MX^bDo#FnSX6do+r{G9LmAFJ7*JqdSAQ*P@0n=kxDcUObz;<@4DRI$5 z2&q~K&V{w2@T|Al^fr?u;sS;A5!=nSf$vk^ka?%C5Y@iOVH)ONsO^FME%yi z4OCJxpzF(LbQ{}!6~^xvuxl%ZpsWy?@33IW&H8Gou1C3ax_X-Qv~LK<1ian%RZ-tPWB4R`_$yhmR^0!vfgv&=;au$8oqVl>w{wpD-mClc|#MSn-*HT_78C zl1>QdiLV7~P+dikwQFPn&9+^p*sHxH;u{}QU+rnZ%6&$*_pTja|1UYjR-dED&uEj3 z`;m3wnxwxz0JM;XiWU+jvPI<3hji4>Um!G-fk(^9q)AFk@&U6dzby+3D7L#MrfEyT zVzHus$VR`diwdn&9?4glwk|HVcJMDz@zqO~_RvJ_ifOJcT;5X~_vuD{ThU6pkr{Wd zG;b?j-PdmstM=bsQaT`{Ortxxed#-cBC12m{dX)|Hza0%^3|g|mTz#IakNA|cIS%n zVaXSE>5lDOSvdl5liF_!GQkbT1XYZQ*1k)cFRu1B!79$gTKQ^iOl;1hyJ04v%ii@e z0orCGCuUm{ieEYXx!2d?PlJ)a(2wmo#Mv#x&0;CFV8uky)l zr=8dCirgMn?o;Tzu4dBi*pKHkL)P!s)y=wEUpQpL-l>iApSLqE8$Zw=U#9jgaw*@R zc&@@bPvlxrmvX5_*HGkIdC<^WpZk+&Xw{)PH_w#&77zVveYTGYR5lGH858@xc<4|9 zsl&z$KQm^9O+!A|ab}unL}pn|6+K^Wh8vujl`mkBY2Xkc;Xo}(Se@_H0g`;KL!Pgd zR`?dTTyM@^plAP zH8RnWW{(b_mz{IW%i27AIYD}{_(AnSgQ_9-Ec+7agl+XJo%Z*)=Wr)Y2zm8wl^9ae z1j>+@NvEb{W{DG16NPt|JhDwv<+8rYGj>joRZe#;gygsP@poU+2rl3y^I@-GU=XcG z451J4ggV3_;T8i-|D}JOIuHwssMszYfjYdtN|b)34t|6MJ#zdb>R|nnNTepFC1rIo zBQsJyMoz5*oy-FWd3fFbA4L5<$m$iIpb}-qCnp;898Cz6BmNB!bF>fN-NTQN3h>0L zSL@D+I+b7LHiF)-C&FP)6h2C@rd5{H5x5#A5xP#vu@~|0Ao#Zg{!g%=l&S62;s0Zj z_$Lvy|H~cz({7UT_CTe~iB*>MvjtGwt7Lps)SJ;hofk{Bg?1CfllSL&K9YO|8OQTE zynhlO#Pdn~*d)uEKT#MjjK=bOsL+#nS5Ib^VW!|gaKyhke~6pgNHPeV5F*US<_K+| z&=A)=A1*ZhI$>(r=e5FocAUoq_qgcE*$F}3vvo01$-=%Ng5;Mwi!>I0V=Ycd5=1)v zS%$*iEB2YO(Vl4u!AZPcu(yOKaER!-=24R-3G*4a`cy)&QLWX(TIAMI(>#Y`~9 z##o?}c#)m>!rpo{A5ttLzsW}^rGI)t@Z>CEgAWZC^esv`UGXDBE%heFCel@Uo(~X) z!Kp$G(qn_XXT^EUBQ+T=D3K!P2=yl^b{+~@^S4-%%n3TOujBo5I?u_HtYAfScuOTa zjVvHL*DcU>hM;T`$AVL^C;o}4a5SBopmIppXUxe?%yNxSPj}U4WEgWC<|bxjrWn&4 Z+`Wdn57nx}?EzkXpppDNiy(hk{|!(~IQIYm literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1e1b4765c9bc16d822b1bac7fb9e6d103c2763c9 GIT binary patch literal 4158 zcmc&&cT|+u5}*5(WfvA{`{6F4AZ4jaQ4kT8rB^{vDMsnNtyB{g6oZ1INE47CARwIx zLR8d<2-q7o&?F}Il88o6j2h*A-|{f}V*YsNyz??=_uf1A&dhIS?mg`9;3@Jlp$N*F zvU8)@6et3yuQq6?w1{^Z=^I`!;X>d@*WeNT5o*FZrT)|lKlw%4$~03z@hH9_#iisa z7R6FE2VM5KEuTGZ1pv}o7(H?kh8IZ*2azg82+V*bFW_^6gW^L2B7>45DNLEG+>4Y? 
zAz`GD{db`RhIEcVVL%V^^aMcbfs~UQodc7zF@b=C0Ov>%K+3W~PL^#YPg^mfFg7u+ z%9P5VS@YTBCww;gw~aDg$zONrLm6F3B^Zup4`9Lv#QRcoWkVA}QsTnmlLJD7l7nQU z=tWd$KuAJtN?g38xLi#6cO<-w3!h@D{{`kJOWr?`>YeCDJs~Ake<$^*!!)gP#eT54zwraz}emw}$=b_#839xxN5421AA>k)&03#)MHUWd&U!uT& z?_0S1t`S%fmQe32fL%}JVbu;1%1Gp5(E>|2y2lt!eB}rlans9d%IRS;NI>dC2oOhnP*a;64-$pRH5_PO%bPThIzM6VhOc zQ3pEPs){rZHvw;x6KEbj3*xSs5dTCO4o&zBio#W4^B>=UeQ6j3wrIe)Wh|&1YDc%E z5pPTU?zMKPF zJ$NAa+7YrI-A2(V3b2RJg19UW6d01wzjrSfu5kd**DUDD)dOKJ3tCEp;N(p?IDUzQ zOFNB`OJg-?C7p&hs~kY{fe~B|E`Xcw;$da750qxrz-a0KlsGm(rOQ0HwF5(MGVX&x z_GHj#HHD|oGGX^M9ngwbL@%m-Ln@(nQL-)qal9rt72n#6CO#eym$`!*L{ahRX;|i6UErhgiv_ZU5j&c=aEfi7on}^HPQe#%(BlIH9E_p zg}sO1urLuFTRVuY>&9X0%B9$#w>|t~{v6}CIO3H@xnRE62X)r=!_Dks7|l(@ry6Bo zRu;3#sDeqXM*|(mBpV&Wex0a}Y-gtlI;hi)&!~Tke|F|RKJUm>Db1`@tlj9ERQfV=vPbF(8Efo^ zpMWH=EDM2*hfbv8+Rwpo{~?ybHH4o3h64oEf${vgxXR~T@cttQ-FH2Pk`*~9^oSLC zAaOB9>Mp}{=5>%E$NkBIjwML%vI^OFauhXBJc%1NZ^I9|I-*rWj&N$&h?Rt&A+24s ziI(#OZv4DRWw?4SCr7*i_2b=A`W{cvn#b3nqR0WCto9CA@gq{}GzOqGCK20{HwSfZ zmzTA=*hMNVo(}zkX;{L-0OanRA}ze}J=~d+sw4L9#F96*N`-G%BkjW-v7po-qU}qH zdtP&{lwFuJafMSSC4M!Gt{zy6sh0Hc%r9?&6@z=!U9@K)e%S#2?V>u|Z_@*`XfnZ` z_tuZxcIXlMqf|`9DNTV$+jb*sqdvn;E7Byh0$6{DFNfj(u zS`$B;l?+!an$X3lU@$xHMqIzAkKHuyLze`Z*r`l`%qA5pQo1TzO1)s&gx)*hNQT#q zGG(LSQuQr-J4;PAi>HE$lMC_Hwcn#tNhK_UE-$bSyv?fScFNS=Ed#^GSu#CVsxU)( z33~YKS1j(e45ZDIRbE|0q2ZU0vC-ulIPO1i@n^@yX!4KcNX^>@8(tk@#Tf zXwAkvHS0{az@|1kU(=~T*RQuWr%=apms6Qd-KJs#-;T)P-nz{tMuENA0^8lWWiuiN z8~l##-cmkGeCtG+ZT;4YIV*pCFnp{&uX6rcjH6=Lu&vrQTT#!yuOYw2ahrzo4!g#J zI@eOusOx==h4mg)HaRNxO+}5~4PK4@{Y}NqK6`>s?yzs(-nw*u;=}9x%_Uz4^ki}- zIkc3vhny(TTRPBEwm0I!F6VNG)*W3jLmg2!23pJaC5U@YzDM`#(fLY(t~jgh6X-cK zdgG96x?%pL#pk{|FotZI4#w3tBygS*I5PcdZ5@NwO{2Az546!}zc6TLEA}Of{McVO zD{uMSRWG03y|gbcV1C9Zuy_{R0!8bk6oiZRZws>BC^yxi$Ra=3e)B|=@XP!2Lmcze zL>qXs3qqX>br$Ykbf6&2wZtfBplEhsxO@4m756S5D2(u^oWG7mSr$butZ~d2F77Uh z@~-!&b||(ij$YjC(;j}MyEw+@YX<#g@$$KQB{%yu|&QSl6k3riwmqmq)P!eYZ1CPdSo9%JG$Q3(l&ooT3uYMq-|H~Wx$6k_hi@q{f$X2e^;Q(wrR|0&kS^Ek;si1Z2TBiaxOAv^q64N1t!5hOHx7|X28bWdh7Ms$!NtW@VZA6kzjyXw+9NU+a?ic5fLgPs4Wc=y)v@3HpW0t^I z7Y8M-NC``x79ti;3rb8(NK;P@OH7JNh*vi;pKdbUSm34#@csjb2mnv|KL`H;bMo-^ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7e9af93b02ee386e9ac98f1c9aea4130450e7a99 GIT binary patch literal 3781 zcmc&%d03Ry8h_9EzHjE6VMd1K8)t+RR0c$lO%|1uZ`c(LL1l4)VG#%rVnAGK02Km3 z7R}W3qTrgAhT#&JY@(S)38myU^L8^tE4`0(UBc{qLu6j-kNZ6LxgD5u&U?=L{@&%B z;rs~aBiuNebLA$6aUu@P0aOdToonMS#q#;GaF27~Dhl0~HoxCs)t=7*XUVCYI0g9; zaUyFkaA1e_RP2y807%DT(kl?AV4S8MS#z}76GRHNN|K;kk`$M!)2DJ+sZbt9$|j_V z*06sDrFJ5*0f&Kf3Nio?xe#&DVMG*y^`j|}AfP2;8W=eXBJwdIADNu>l@oQLnGFsC zhLRicBzzS+w4-9FUv-ourDs>krip_wB}`GJrypu=(!b^8AWzCl%*#w(k{g$#%hk!# z$RsT(E-@=3FLQ}8%Y#_C7Hck2nrqnlzwr5ox%ZlJUzvq*M@dyk_TREUq|-rFvmoc- z9Nd`NKv%r^Gu%5k5uJ42Y{HJ70efus(D5U-!`Ov(bokC6P*dnR==G_g_s4acIJ$IE z>q5tY+r{gUNuR^-Z#<43*q_JlSvSSYE_6X>(G~Pz-7VxkJR8?$CBcKHU6jOODzksf z1vuFH5RHZ~Jo%I>dNWjxX1AY!_cb}Fqp%wnHBnHxY(DOe4218z?qbzj!PNFP8F;-L zg&KEt!k48R;c-O{W$RLaeSIb|Y3r>-H%}i%ZfPxaZLvrcX_zM6oNXn|7I2ZjzB>7cK{<<0nLBmk_%9Z*tjmyTNtZ^VEpt zGKd@~Lw1`wp>IY9$~Bjuq!wT1vWqLKiuD4?MhjpZ7GnFUhnd7PvCQ-nDp-Gb9$N3j zz_?FGFmIlHj1CMxOF12=rdIX_!+SY%ar2B|^t!&38ouyzYMJSmC^PK~{L=R&_}1bA z`pC?1#P<2(Y}5D9@XRb+cWf~|z;!Xl-kr(l8Zs312i~@vksgiqF1>F;;SIus(IwL8$5HC4|zLURZ)hV8CFqOUW_!=^E*n-?FYedqh 
zTWsc4R`T-2W_Ew1Ejn?|70=&hBFULBPSj}nHvaonJN!z3rCGMgJNWfQ{cKdVmdQ}n zgPqed@ls|fG(3JC&tKUlE-__9jXyr1^Bb0dwuHA^@)@+5! z&qu+YN+)_$xGn2aQzbR&XvQzb{tI23?#+(6*td_wril)vUxG4TH z3Yd89Wcao*OMIdFGidfci{G!gK)o~7#?)@zCfa@WWvbpjgz6eNu9$qJQn6j>$8-cG ziL5+J(Z#^IOxD&A^_>1qR7xtKYvu97{LL#d-jIo%uV4tu4UtXp2$p=^R4=#b5Rb*qX^ zRPL$0t#zwkHMP-}s`&afWfl$*`=Zt)N%$V+LenFSpiB zTEQ6YqLUvhICl8zu|@?Q@w(wkz%wRoMitpI8=QH)E4)a(81{r2Pxq z{ff;!*UlLixi@@w#F4dg$44EC?_BR$KF@jnvFz*JN6MpI;#vwB3$Kc3x5UIV)pn6b zD`GrSKHd?y!E0TtXZodgl6#J>o9~_VWoxm8cjbaf`o2$gM;@zOI3@q~wayLRRg0cq zdGF5ko?}&U(+Y_D2?e0)nMT|qOS1x%^f>WYbO;q#r^geAtnyQ{0@}j|6~GzJ;4~~~ zLrHb)ku}oci+i3kfQ~UZq#y6n8If1|2-Q*8${*2YZs?JsXIhyB&po1N&q0r5MTZM6EW^4awwCSQ4OQKyFaEe$#WzWoOXOdg`H`(sYlu=I|ho z2zmO=m1L$Z5jaJ%p3X?q=Zcdvl7;t}Jidj)l$B$Y53Mv;RGMBRVfp=C{lgbFoC^r! zq{ipfflbE1An28|v<@+*evpWS8xD~0>-a3C=rw)^2#aEMzv>8-;@MRK@`O@^)SGFl zHGf7aOn)O78OcjhatEo?08;*dGy`Kb+r|=7@htwIc>SZ4)fJwg66q6DlQVT3O&FFP z{vA=XvkrU9!jF&*@WiU8>&}S=m7nH3g5Hx8;V>r(^A)UVmGN{0GKYzWp;NLOLGWz^ ze52$4Ocs<1wY56zf6OBPA_Dh+xx;_iO;X-;P%1R6^02KXfZAFm<0B$oh?+cju~a)~ z7omCbTAt@;laJ75@O(b6P2q!hK85#BF|KKe!c1Z2&-01c|J_&v@wEc+~-{dIX%IBmRmy9yu75KN$D}s5vjs{W;ls& z`HQ48;`tjvND!nA;u)vH-c$Da*eK5>Nx><+POvw+C+UzDGbBgMoh!)Gg>Eo)W9WnN zq?0vmt}eeUKV1Co*JRJ`y7HPkWM<}H>J1KZxuCT#}h6(ycrTiiP zvqO#bCdDRgsPsG^AWVZZgc@YW22FY~!F?I2$wWbk6ggk0KM8U0P{^9rXh|X`*^&Ji zUYkF7PR3vbE26_2D>-OnOmcADn0icRP#*DP%u}!@_{3B=nn6ua*<|Z-Udl_(bxzFA ocGl(OWaZl|P0rD$Wi7FB^P1o`!Brh*4e;A9oS*^1$v?2a0DmGVAOHXW literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d112e179eb9bbb8372b524bcc40eea2e5cae3f76 GIT binary patch literal 3997 zcmc&%cUV*B8h^iYPLh+bA|z=7Q79t}QBcIK%?UC^K^zRV$dXMEP`tIa2%@NniUSk} zLnQ2NTSv^_2fT!~I8vS#hZwu`H>4z^DdPw!R9u|U7AQamQNg z+o!bh&k8UMzXP&%;&la9I-$yt<*{LLNr9n3NkKZ1^dd4eFhm)%JT^{UQ$G@Yk7NzF z>=q{dFCgDpX_H8JTI4Cbz)}~Rul3{r%VqsCh;mGW${q@OjxPn>f4I|CTd#US&?NC>EZ0bjo?1$OvkDDjg(;X^*G+C!mF z6ST1?$z*7*v4>N8UBNVV6byL1MCYqUL$*|mtdE%k@xTs9$%1dNC1jZQGDCkhs=GzRWY1GqA~5{h(_VT4^B>a3I^v*TqT*ysjk$2%duaU8_` zB!XkQi(to6DQtRv7@Tv%U`d53oLkI>13k5lCm+7k6FP!qXt<0stUe?Y0&K|#QN4wfh;cpNUB{Sz5h0fT0RVF2y=){=R&qE z3)<@I!FII^c>m6Z#!PFFWwN0nHwZdz>cPoNEV#7a9(j}&fO+B>c(%#~%q4vseKZGqJ+*O2N!%KUA{fIyORP%Gn=jg`S_2;FQD1 zM456imi((Ly7cVEutG~Wd|lTPZZ-N*=Wz@R&OQH}zvtC0q*eV3Ym8tUo9Fw0%(xVR zXLts1%I3b0j-9c`CO2mB6GmIJ8}ur%Yd0*g10p%=^cofH`tUyKyH&x85f_5?2kk_w zODH>Z#(UVpRn43mu2#s%F#~lw&u5*>7^lO(YXixXeAvtST#(=XohWc^9%S_z!=Wr& zqG6AVp53-BJgwHFSX96Oy5%u}H2>xHKx{kqD-g`NmzIt3W#S$Fu{KztJ*oIt^ z+_a61w%}JP&tYzRQ@|#@oGq$VlJSo};d=K^(i*NR23DFj=;8$4QQpn@#TQsh6qDhH z1|_F!>t#4Pp#wXx`Z?6{vfYfRgrGCBdP*)jzk*~QnH+SSM!Hh(maf+4`}!H9zUOw;g<&Cep6ICc3$&?a1G#aY3R}?{&A!Pp#DaNdcxQSNbnh!eU6H}y zc;1t^`n?TybNmT(Ns@}4PL*hFG@Q!fP+P9$6^=)-rhjdjum zhG=KfcKqXm-=NcpIqZ>*J|JImn_ZyYpmp%O-C$cfL91!16dZU9(EWi&SnTgwkUU*S z^zjZM>izWr_UglR+*voY@qv@^X!xx>WIR`a^)8kX9|>l$iv-Rn+~*V4vq`LBK?)9c z$%(7v=%cxU8?R2Xxi7A8WA2Esn@QQM{3+{M&sMj>S9e{}{=8hh^2WuuS68_Jd|P>r zPXz4Z7v9<`A)iZDyc*W%qar;=wm|>MemK)oOX%(&V7_(gNZ~=!R+z&1>uZu=u2xJi zA854fCQJnMgulFi&~z3&Q8A|}9D=(HFj6efPg#SL`c{5T`D@ZRd{ehvc|mHrmUwJL zPg6nKdOZ_GhL|c`yTQQ9N9EUCn7-*f``}Z#RMEQ4MiUe6^)wf)-(oCJB_)c38QaX< zvaS4E4sO_P>0RiyTT#4m=SaW0h~AdsO*wW;S~4U~hcb6NL|jz)w;tM@HzB_7)NZGe ztbJ2fKDyW2TC(N92PqiWaE5AYfm4RQb-)SLwj$T9rtW)Ylx7#t%pDtf^+f6R60dy4 z21Dnv9i?+sKBWO|Wjo7#Yl1uWIG69LT=;py{i|)|In{wpsazwMirm_eQ`y!F+bee0 
zhhHdk&vU8V(-_@T7kRC{GVe3xt(FZ&u2p-#NW6Ekbm7UWeMgfY_I2dBR_{N)>gl8V z*G^U+`07)@c~ZBU{N^;$z-G~@nu6B#d^3-|ZncGNo5T)L*H6_Jo!nyLv{C9_cd%o- z)!ed09d*TLa_mD+?{%*~)R{LivHyBUeaZO)@-*IXj|Np&k=u5g#itueFO@LpoW{Lh zsD_|hHV&dIZ7!G5=;)J&K9|#cHRw(ie6yvOk2IQI`4heTO7$D)sf*u6mmDtfstrlw zS)bl`YtUtxGN};LkR-RE6oVTZ8OIQ&utXvzGBJr078AzM51MPFa75RQMLou{ z{Nb_&H5+^LnZ1)x$c0m1Qz!DMHn0#8h*&gv`k8r`wA|B7?;uaKwOYjFpbbNw-ae&` zhLEQw>%SvUdatQeOjulal7RDkRr#U1zV&b7K7qT_RL)jf;_t#|zF(KP?D&tLv)fdB%RE$O#ydVoh zI=%@LUk&`9#e#^GNF|>CV|M%xQPuy;^OgNmFG!Xo&b`$A^^K3U}@084p&dflErYKKlQ1~-jwVtwP_R+y(XO9W?U9ddVWm2@ipHBp% zkM*YW`&Se_sN*@QgV0IPqKT(oGP)uDi2=TjaiOl^R1o8@hNsJ+#eZ<`Gk-p__h1H1 z>FU%q`9#xailiW4N2R~tY-NC-x^x zHcB4EpX09XH{EPn23t>2Gnl1o4AX=70GCNo!DCm_JsHOs(M?Wf`cJ2*xfJ?Lq4uOR zr>mp&{*)qFb5H7I886y~Qg>40q%Kl(t}fj_)@3=3qpp+jr{mME%++WtL2MEql(1}h tSkkDF`1nyl2?@$%lNDhJiIK`UlQH8*j~Q(*@st9*{)1x#fH(ca{0|mk!UF&R literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f3f7d2a7d0cc4ecb1608dee4e76b94cde19d2b83 GIT binary patch literal 4018 zcmc&%d011&7N41WZ*r4>M9obTKtL%0abXkOP)XPk6~PLq6vHAQTS{28?obpJ72E(p zR*_9m5LyM5T?My-&jq#0vseLN>-wI1efK8C>a+dhec$_D&zH>1nK|eD&N(yRmtQ=j zo)#2N*-;Me6iI;*02r)Z78|!}x1Rp2qp7})1!Ygd(-hlXZ{B>jHe|0&-nH;)HWZ)) zlxRG~rT7$!V(Hod6LQE#L=IU20JxtcW`qdq1Cnm2OW|T`z~YNVoB(-LkbkIL8A>4` zSJ(oDolp`gA^$9tcs!jWpb(%3K0N_I>mHPyJDmfQvk8s?4g{JUHV(8jNn1W`!N_p+ z#JCA5?cXyNkwYgU68p_YNLTUcE`7)|QB{F3EX4^(dhz#Vm}mw?2gXGPN5%RF$z$c3 zVe}#_$UiVTA}%sYRoELSd;}%uG07vO`(I%GVYy90s#EAreNU(=&gVVLBb;{31u<0% zP;F>6o?7-RcwRLLo|=%!Bgf1J2S*>omulyM&;5*WkD_O=-t`=~J|!J5_wVMJnq9=U zxsC-ESMPyH{2V&D=NNoxd>&aw53*NuT?FkLZo$ri2hj2}1uW~M0r*h zsA_rz#{hRUt<4;kxoX1sEhoT9NepaV-;FlZV_^G=#i-lM5&UfP6p0QwV|mRQz~-no ztUb^U?xgGi?^0v1(PrzA?UboR*iIeR;OQD@5!Q(3CbL*xYiDsY6*}Aqz1u`i%R?ZT z0ExU;Zb)Mi!s4g~;B5E;bXfLr`v)cy0-y<|tZoD^%3Xn`uOo=in}8mESr0aMHey%2 zW`dOFW$0-G0WZ%v18i-L@H+jcpjHr%Dw7?6p2KW#;?gKgu>CbON;(R(9AAUK`&R+X z)*Oqb>;M^;f*)VHoAoMP34Ur%1p7``6WK+JQEsgTY%s3JtV-TuOU~cNW;Tvy8z#r2 zYO`zXwSG246Z;n;!rT`-3zT45Rv=h6;6jX8zY>f;c7iqh8iY^(ML#xqF)*Js1Jhoq z0enYhao`E&{KAS#!R&A!SQ38==}E@0{l#B$-&_=9QmfT?d9)u&c$J1o)@o_&d)Etxn`A)?K{|`;{eX$OWZ6M3NM#Fo} zd}479j}tR-EUQ+l65YIQgz|)Tgo1YjrMI>5z0+hwgs2c0jbFizC*nc%yS-@fs%G{^ zErO+EodOf8SHg=alQjAFO+ms`Pu5p=ouE(d9_ar~HrU>y50pwFSKnhaX_mf&%WG{w z%NIR{y>64q&wscNL%HL*=WA@(3)436BYDTL+I!>Ri&K%f_|h%lF(Vl)7|6iR{a6pX z}cc7E$ zU6|>q7Czs~fwY`|1MBQDV=1xogq@oae&`a8nLpjF-BU1wvstkbwqx~d(|NzZH4kop z{kxs8QTo3ER@6iGhB2o>Wq1ravST`|-^JyI3oRK_Uj#qs{XYPFGDZ z3aoL54P)QJtu33$EelFfd}}!C4xx<#xQ5t;v<53jo%a-0>SfnW8&3ZSb-< z5p^btd0Vt?32uH0+hC{Pu$I1MkSC~ziJD>Ha=~3}FH2uDS)dK~#O}gYANT=wDl=H) z>O6tn(jQp`8nwIw_p-qFvdO$ATU}tqT?hvrKS7bNcpzb>rf~Ic3idpIfZlz%ku&GE z2KM-j0*>mD>X*R+Dh`_3>oJ5#Ho&JUZ{RUrW0tz7G80#@-`4~yv?WDse=s%6Dzv-QxAow*?8%C|XS3`}w(p<5;z@r`YsrqhS?du; z+o3eAz&>Tf*hOun=|#?IV_f$+lH;)*R^AW#U!7q@$+zT!wjIXO8R9 zgBP-=C~aZ%&aTTjUiuNbaorGlwCR;{8Xc~E zLrwXE77HpnlZV2Of@)>VNncw3RdC~Vaix`a6~XeLuk&?OA7l&3;XpY^`j&gRn+Xm z=k|xh!j7_i&;7_?1PF)+K!R-t9y0HfmVWibJm-&ipNO$miHIDsVOZ3syR^}7To9?a z;NaLlBT-uKDOE&pR7k9v9}Ys{Ae3A;k*qhNY1F6T{{-}psuibN0})FZ7#bWYr*N7^ z8lex^sFAMwp^?EXKd%6}1M{l7e4$zS!7gsH|tu7nh(8VLXp>xwj_o}Tl(r>P&7 zC?7Xt#9AtoN~QDZFC(+1(gdk2MCv4!hDhy0RC_YoU=Fjgmr7lknbwD9T2n2ROavy7 zbl%tF%*>|H3&*)GjBcO6jE>Aehn7m+nL*~usH$_RIirUMSkAQw@Lm`f*gEt#{A{FRHmt)qgRL!@%XUjLyPZ_?)k+RjPAw^ z>e5xItMdt`_hhkhZ|i7ZpSjVCd{m{=c=Ofuxh?oOAG%qze2@>*N|_?ad0{MbAa!+T 
z>{UhyL-_MuRsE)$P0LX0rBVlG=^VlIK(WYa>hb`~6?9J~F-CNg6PW(fDXK4p-jk_3 z>CEZs=yPAGEJ1xws$>~2+D5ACq}oYUqFEz=f88vc-tXauN0lK?s72p#_5j#fpN80*V`F zuvS0?>#B+y(dVdEtJYezxZ&x$H;C3}d(M03yz}~<+2Q3U1ysi7;p<8Z=bKfX zyK(Fd4YYvnZbR$ShBSxfn7e|6>e30Rt|$PYE5VpiAK|!A<1?15=yy)6`)9QasVLLY1d44yAf8RZ8&7>=utbr&xuKjM+-MJuB+ z6fr5PsnK$kTrYuHBt%b*QYL38QnU!yV1issoJ)w8$ z1fJ<*u;_6Dgzx<)TzaX~4o*KfeRZ^6b#>WyxL!oh(1Dsgn4Sf{0@NCRG zbasawEHf9Pe)U#BJhBH$=@0I?1YFo*1-!l%U}EYH=MxQKNw_N{E%1cVt%)!xs~7M} zOyEjj4XoGCgr4?$(3u)@WZAG4Oc(foWy2Xrt#yW!-vm&vKM9t{n#015eK4vt1|qii zf#%5^Sl7B6-O)jyIjsxVj7*@Zu{RLqBf&w_2Km005V)ffL|Yx90+T?wnFi*(-vRev zIJ9`v*qyJ3LD3jf5bgAa{9k@Ti5W()i?9MkJ`a}JkZ^SGUa*wx^WRDxCdDQKVN1(px&;Zo!>xb-dtvQ!~Zn!g_2Pd^4Ly*I-; zzY%a}6^7b#?!y3e59m|l1W%vMg^gGGf>nw!dS3Aob&tM>R1yTKDVE^1BoM}R6e3Oh zZer1vckpV5GdgHns!R3phmE~A5)s|kfz@PdLbm=nY8_lm-1k3%`r9|-i+%fnW6M2I z5T~$)lKn_!%tO(K+{mo+&2TjDI{KmV5pwF5img>f!^S@u^??9v z==Tn2i?1FU+f)w?;xu$5_X3u80EcV#4(PFK8z$WDjjwCugJ#1~xTP+E_lwf--u5|| z+u$K&!ZI_?FUR&E$Am+~+I$X&luX9@j@Uy+9SodLgWP>xc23F+78=18>XM zaMJ0n&Vz@8NK?>*!Ltv+>#e?^7d8r#d)i`8rW}C69}eM{$Bh7WV=UHIN1}_PkHfM( z!Kk)Z8)!_YV>S6Bq31|{s6X2Sx1P5b3`VuX?WsEecXPm%v^!84YT~G1Q> zxzO)YcXIXmiP&0=BdY6r0N--*J(_g-K0e}5Z?0wjbZlFJ1KG4ad9B;6qW?MC`k?37DxZBJ2FU(Ta36 z-fi+#Jj>t~QY75Kru|lcJ&MgCYDdcub>9t3HTVv7Qz$WzS3~8uu|x;mOn$p}G%4Sm zY*@9e#?(J46m6P*9qTFX!(E#oK^@H^Tsm+zu~j)4%Y5aHF1Ft^dNTJoui$(HZzp=J zx7_R~*Du?Ezv}&u$VgI#98HTkIw6lJ#Z8JgyQPlWI<7aWKjnZ;T%*fNv+Kvv7}Q|@ zxM_{e8EI;is=EtYG36l@vebi27FL0^O%``LIURPtUx7`W)yOR{AUT>BZ-~s@Gf=bI zSa`uMa8+(b?-D8=enA`!12Eema28%U?n0!R7ej{(7kK zLMV2@Aj-smSA|vHbHa39oPn4(&FJ-=Bm%3RCAvA{46#&ZN$RdN$0IwUF;oyl#XaZ` zkM1|>95Fr)hh%HGYI$VO&BOY_z`Ku0lYzyklIw(-nJvM&1y8_dO)m7$-^K}0O`=lU zvU##!hUk9ZCr6*0#|M3!z)x;#;l2m~GQwjh{HjrM&o8|Ub%Rb~4aMj2T_Y?EtQW5& zoW@?mtNQxj%^mv;Tp^(cX`K05JEl*s(zT&0 z_@BO&3APBFIa3gIPF1e?Q|#g;zHBwMZU)UUW zWaHx1gHrFDDD&J@R6Z>0_XlCS=8x(1RSuBG&T3h2&-sSdK^pZ~OYbEXqn0@rE-Um~ zVYlV*a^H*Mz;d@a80~jS5>y>f?or}*xqooYq=vb)e~V3cZSsw#68|f9Q)DyWyd`Z) zcgeqBAR1O+Q?@7O^r|7VIJT=a3758p)fL#T-lzD_v#;*Vu9t0-fon70WHUEq19fX_ zm{B%AceUpx)FEqn$beWfxD!N?7DVy59v!fn?w~fNl^b;sKQ{+v13y3Tik)2tNUA?o zChB-vFuHu7;!$gD`YZh@ZZp~*T;7{$Fm=S7_SY}kUKvEV=ixkobC#jpBVSkKs(Ni0 zIkM2CU$7?2D9UTG*^ueaUmHdH6nk6xu5g}V9OG9qAZ(xNjd5&1*}#Mr%?y*c!17@; zUOs8#q~IDheUBXBtzahYv=>G>!e3`H z8F%WU9P_71Ov1*3TB97ov>oM$vL(m&seA>or}xC3ZL!M+P>|V@(X+-Yv~ef54`sTu zFR5MhPn2zSJ?+ISL4^DMX*|7bt2Czk!LJ5PF3jkW4H{9l`R^%AxySEhFhPjsdR~W0 zA!}f296d{=462@8=K8q(V~v-UEr-NQ2bj}XC!6&7oPF^W2`MbIj7cYw6Vg@OnB*9i znK3l1f-l(BS5Ran-qA}um&;PkKYl{~lxxXouK-#{OSB|Z9Un+Uv|=Qe0y*MQZc!g7 zp(Tb4jvSHcfm#2);CwA#mWh6dl_gC!F;3L;ej!B7Abu#DMJ^!Zq({ZaDC9K3@N8@B4}9I)JYc(tJHxoa1rvVWc$(8`{5kz) z?LJ)zjnSL{H`bdFYOjZ-^ca(&>zb^)Gx06h_}aw(4i*IZB6Cr||N5VazlqHMU!LN> z?NJhJw-xA%DZ$fzrT`*yAzvC4G$v$d=aVI@AnaLjk$Om_(y`2&6@RHTQ|b{X^^!{C zq@HowGY`gKG`sSYN`2Xx*>}y%PP5QjcvbxXw z)5AktQlh=%q;l3@o1THgNLbfCXuefx)Wf#YBQ@GPSj8%&z5%Sg)+n4`II#56gR?LSju=cO=b9$HVPat0lv50iRicHWb=Sk{ZNk!m~H>7)hO zd9H;Xrtr#O@@R3g{!DzvmA#ryOAuP5%G0K0#HeheQd4c^X=%z#i|H|G=?Tgd3rANw VM>_{mfH}a&7c>w6GUgYKe*tj26RQ9K literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + 
"end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 deleted file mode 100644 index c92d73bfb11dd463e1c278b8eee0d36cfefa5610..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 263 zcmZo*o%)9X0&1sdba7@(>=kV`bvef1&Dh29FUL4&+AEz55IfyAV%q7pHHtts=WI`J z%P;F+?F1>})EAxM+v^^@9;_oYv!e0!op|2ZDH`5P-mIxzoKG7a*UsA^w1)vM#8Gr~ z#<6)hD@))aoH_IV-HAJN=mCbuia5^XM{0ZQ6-xqXWA$C00kwNmHX5JDVPeDN}2o}7L#eWiK;B|Tnp diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 deleted file mode 100644 index c3966bec265a8da7ff350586855e53ad69f800ed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 31 gcmZo*ohrfr0ku;!yqUdOyqUaNy&1j!13{@C0Ar^J*8l(j diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 deleted file mode 100644 index e419c951696721f426be5649d731716ef4543547..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2840 zcmeH|?N5_e7{;HzO=!iJszYRGDLN`>M?$8|>_x$lj7L!7jIZf684iQt?-1xvMY+amz4s^hfY8<)=xSp$N#`a-<0sYWlult;H zo%=bx&;1X-{@G;OLDZ|V!D$|ryiWS9Lr8w|Ysok%j&AfdD+G0(9SwVqPOIvAC zLl`r-W+({Cz0$@YKh+{9Xc-=gB6=b^J)E$@k+Otvm%k^BusJEXX;Z3ZKGE!Lzafyw zu)=xxwPP$2&pMYI)URKiNAydN_wfneANRmd@6m_j zP+}i_j zrx52*Q9BRww(7f9kvNZW5c^GZwIkksmJ6-B`xzOg4nI1Ab69%l_Hv>JRnf@PRHZ_k zHgixn0lrsSS)vRSiJ7@Sg+5YiqB8?^#WFKqr86=z-h3AL;7dxC()rRYHME(LMwbRA zNtr6vKZUgu$3{{0r(PlZQRmIbi<&szjR(?SUW2$t6b`MDbwLXs-UQp;iN^w?>NfP) zMl)Fbb}Q&w5ev2)I0xG9tOJdc+2986ark~2xd+;kZ~BNd;}Km`UUd}1>3=} zKh9&{ky{-Y-@#Xcd80nCZV~E-YD^MiK6^$5>cTxBwqEWg?qMH#+>0HJ z@EboVLoY}@y9arFu{RRWF;gYRI!nbF>~pO-4)fJB+FO##ls$o~`Zg*H3(pYGd? 
z^}MC`kmg!hFM3y~_guAa!gzxx73=<6A`RUw*mr#*3-fNYHbEPzzs3I$DiK554E%?u z6Oc!>I}_)4qWmc04LM5?&sg~eSa_oxe%94dm{)!=559KFh5Q~5urc0o;%oG%f0qb* fu0$wmu0;N?L^!icWbQ?ihFduIqM7xgDHHz Date: Fri, 18 Oct 2024 15:29:50 -0400 Subject: [PATCH 042/105] Updated ray tests Signed-off-by: Constantin M Adam --- .../test_cluster_analysis_transform_ray.py | 52 ++++++++++++++++ .../test/test_data_cleaning_transform_ray.py | 61 +++++++++++++++++++ .../universal/fdedup/ray/test/test_fdedup.py | 18 ------ .../fdedup/ray/test/test_fdedup_ray.py | 60 ------------------ .../test_get_duplicate_list_transform_ray.py | 45 ++++++++++++++ .../test/test_signature_calc_transform_ray.py | 46 ++++++++++++++ 6 files changed, 204 insertions(+), 78 deletions(-) create mode 100644 transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py delete mode 100644 transforms/universal/fdedup/ray/test/test_fdedup.py delete mode 100644 transforms/universal/fdedup/ray/test/test_fdedup_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py create mode 100644 transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py diff --git a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a3771fbd8 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import ( + jaccard_similarity_threshold_cli_param, + num_bands_cli_param, + num_segments_cli_param, + sort_output_cli_param, +) +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        config = {
+            "run_locally": True,
+            num_bands_cli_param: 14,
+            num_segments_cli_param: 2,
+            jaccard_similarity_threshold_cli_param: 0.7,
+            sort_output_cli_param: True,
+        }
+        launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration())
+        fixtures = [
+            (
+                launcher,
+                config,
+                os.path.join(basedir, "expected", "signature_calc", "bands"),
+                os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"),
+            )
+        ]
+        return fixtures
diff --git a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py
new file mode 100644
index 000000000..a62105b2c
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py
@@ -0,0 +1,61 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_cleaning_transform import (
+    document_id_column_cli_param,
+    duplicate_list_location_cli_param,
+    operation_mode_cli_param,
+)
+from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing_ray.runtime.ray import RayTransformLauncher
+
+
+class TestRayDataCleaningTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        duplicate_location = os.path.abspath(
+            os.path.join(
+                os.path.dirname(__file__),
+                "..",
+                "test-data",
+                "expected",
+                "get_list_transform",
+                "docs_to_remove_consolidated",
+                "docs_to_remove_consolidated.parquet",
+            )
+        )
+        config = {
+            "run_locally": True,
+            document_id_column_cli_param: "int_id_column",
+            duplicate_list_location_cli_param: duplicate_location,
+            operation_mode_cli_param: "annotate",
+        }
+        launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration())
+        fixtures = [
+            (
+                launcher,
+                config,
+                os.path.join(basedir, "input"),
+                os.path.join(basedir, "expected", "data_cleaning", "annotated"),
+            )
+        ]
+        return fixtures
diff --git a/transforms/universal/fdedup/ray/test/test_fdedup.py b/transforms/universal/fdedup/ray/test/test_fdedup.py
deleted file mode 100644
index fa46fb071..000000000
--- a/transforms/universal/fdedup/ray/test/test_fdedup.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an “AS IS” BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-# There is no local test for fdedup
-# This is just a place holder t satisfy overall framework
-
-
-def test_fdedup():
-    pass
diff --git a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py b/transforms/universal/fdedup/ray/test/test_fdedup_ray.py
deleted file mode 100644
index 78ee7cc04..000000000
--- a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the “License”);
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an “AS IS” BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-import os
-
-from data_processing.test_support.launch.transform_test import (
-    AbstractTransformLauncherTest,
-)
-from data_processing_ray.runtime.ray import RayTransformLauncher
-from fdedup_transform_ray import FdedupRayTransformConfiguration
-
-
-class TestRayFdedupTransform(AbstractTransformLauncherTest):
-    """
-    Extends the super-class to define the test data for the tests defined there.
-    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
-    """
-
-    def get_test_transform_fixtures(self) -> list[tuple]:
-        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
-        config = {
-            "run_locally": True,
-            # When running in ray, our Runtime's get_transform_config() method will load the domains using
-            # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration.
-            # columns used
-            "fdedup_doc_column": "contents",
-            "fdedup_id_column": "int_id_column",
-            "fdedup_cluster_column": "cluster",
-            # infrastructure
-            "fdedup_bucket_cpu": 0.5,
-            "fdedup_doc_cpu": 0.5,
-            "fdedup_mhash_cpu": 0.5,
-            "fdedup_num_doc_actors": 1,
-            "fdedup_num_bucket_actors": 1,
-            "fdedup_num_minhash_actors": 1,
-            "fdedup_num_preprocessors": 1,
-            # fuzzy parameters
-            "fdedup_num_permutations": 64,
-            "fdedup_threshold": 0.8,
-            "fdedup_shingles_size": 5,
-            "fdedup_delimiters": " ",
-            # Random delay between reads
-            "fdedup_random_delay_limit": 5,
-            # snapshotting
-            "fdedup_snapshot_delay": 1,
-            "fdedup_use_doc_snapshot": False,
-            "fdedup_use_bucket_snapshot": False,
-        }
-        launcher = RayTransformLauncher(FdedupRayTransformConfiguration())
-        fixtures = [(launcher, config, basedir + "/input", basedir + "/expected")]
-        return fixtures
diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py
new file mode 100644
index 000000000..4b59e3a7a
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py
@@ -0,0 +1,45 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from get_duplicate_list_transform import sort_output_cli_param
+from get_duplicate_list_transform_python import (
+    GetDuplicateListPythonTransformConfiguration,
+)
+
+
+class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
+        config = {
+            sort_output_cli_param: True,
+        }
+        launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration())
+        fixtures = [
+            (
+                launcher,
+                config,
+                os.path.join(basedir, "expected", "cluster_analysis"),
+                os.path.join(basedir, "expected", "get_list_transform"),
+            )
+        ]
+        return fixtures
diff --git a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py
new file mode 100644
index 000000000..34f3ee403
--- /dev/null
+++ b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py
@@ -0,0 +1,46 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform import ( + num_bands_cli_param, + num_permutations_cli_param, + num_segments_cli_param, +) +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_permutations_cli_param: 112, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + } + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures From 954dffddc11070366fdf56efe2229a412f8501f4 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 15:31:46 -0400 Subject: [PATCH 043/105] Spark test data and tests Signed-off-by: Constantin M Adam --- .../docs_to_remove_consolidated.parquet | Bin 663 -> 663 bytes .../python/test-data/expected/metadata.json | 16 ++--- .../docs_to_remove/band_0_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_0_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_10_segment_0.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_10_segment_1.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_11_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_12_segment_1.parquet | Bin 0 -> 1532 bytes .../docs_to_remove/band_13_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_13_segment_1.parquet | Bin 0 -> 1526 bytes .../docs_to_remove/band_1_segment_0.parquet | Bin 0 -> 1523 bytes .../docs_to_remove/band_1_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_2_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_3_segment_0.parquet | Bin 0 -> 1510 bytes .../docs_to_remove/band_3_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_4_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_0.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_5_segment_1.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_0.parquet | Bin 0 -> 905 bytes .../docs_to_remove/band_6_segment_1.parquet | Bin 0 -> 1513 bytes .../docs_to_remove/band_7_segment_0.parquet | Bin 0 -> 1497 bytes 
.../docs_to_remove/band_7_segment_1.parquet | Bin 0 -> 1505 bytes .../docs_to_remove/band_8_segment_0.parquet | Bin 0 -> 1530 bytes .../docs_to_remove/band_8_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_0.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/band_9_segment_1.parquet | Bin 0 -> 1497 bytes .../docs_to_remove/metadata.json | 58 +++++++++++++++++ .../data_cleaning/annotated/df1.parquet | Bin 0 -> 6923 bytes .../data_cleaning/annotated/metadata.json | 56 +++++++++++++++++ .../data_cleaning/cleaned/data_1/df1.parquet | Bin 0 -> 14933 bytes .../data_cleaning/cleaned/data_2/df2.parquet | Bin 0 -> 3068 bytes .../data_cleaning/cleaned/metadata.json | 59 ++++++++++++++++++ .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../docs_to_remove_consolidated.parquet | Bin 0 -> 663 bytes .../expected/get_list_transform/metadata.json | 48 ++++++++++++++ .../spark/test-data/expected/metadata.json | 49 +++++++++++++++ .../bands/band=0/segment=0/df1.parquet | Bin 0 -> 3984 bytes .../bands/band=0/segment=1/df1.parquet | Bin 0 -> 4763 bytes .../bands/band=1/segment=0/df1.parquet | Bin 0 -> 3695 bytes .../bands/band=1/segment=1/df1.parquet | Bin 0 -> 3684 bytes .../bands/band=10/segment=0/df1.parquet | Bin 0 -> 3305 bytes .../bands/band=10/segment=1/df1.parquet | Bin 0 -> 4466 bytes .../bands/band=11/segment=0/df1.parquet | Bin 0 -> 4906 bytes .../bands/band=11/segment=1/df1.parquet | Bin 0 -> 3317 bytes .../bands/band=12/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=12/segment=1/df1.parquet | Bin 0 -> 5020 bytes .../bands/band=13/segment=0/df1.parquet | Bin 0 -> 3138 bytes .../bands/band=13/segment=1/df1.parquet | Bin 0 -> 5244 bytes .../bands/band=2/segment=0/df1.parquet | Bin 0 -> 4782 bytes .../bands/band=2/segment=1/df1.parquet | Bin 0 -> 3988 bytes .../bands/band=3/segment=0/df1.parquet | Bin 0 -> 4323 bytes .../bands/band=3/segment=1/df1.parquet | Bin 0 -> 4341 bytes .../bands/band=4/segment=0/df1.parquet | Bin 0 -> 4035 bytes .../bands/band=4/segment=1/df1.parquet | Bin 0 -> 4860 bytes .../bands/band=5/segment=0/df1.parquet | Bin 0 -> 3554 bytes .../bands/band=5/segment=1/df1.parquet | Bin 0 -> 4872 bytes .../bands/band=6/segment=0/df1.parquet | Bin 0 -> 3553 bytes .../bands/band=6/segment=1/df1.parquet | Bin 0 -> 4311 bytes .../bands/band=7/segment=0/df1.parquet | Bin 0 -> 3765 bytes .../bands/band=7/segment=1/df1.parquet | Bin 0 -> 4158 bytes .../bands/band=8/segment=0/df1.parquet | Bin 0 -> 3781 bytes .../bands/band=8/segment=1/df1.parquet | Bin 0 -> 3997 bytes .../bands/band=9/segment=0/df1.parquet | Bin 0 -> 4018 bytes .../bands/band=9/segment=1/df1.parquet | Bin 0 -> 4326 bytes .../expected/signature_calc/metadata.json | 48 ++++++++++++++ .../test_cluster_analysis_transform_spark.py | 46 ++++++++++++++ .../test_data_cleaning_transform_spark.py | 58 +++++++++++++++++ ...test_get_duplicate_list_transform_spark.py | 45 +++++++++++++ .../test_signature_calc_transform_spark.py | 42 +++++++++++++ 73 files changed, 517 insertions(+), 8 deletions(-) create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet create mode 100644 
transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json 
create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/metadata.json create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet create mode 100644 
transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet create mode 100644 transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json create mode 100644 transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py create mode 100644 transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py diff --git a/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet index 557f866a7c3a83d68e8842afec48e1c9af5e5cf1..edbd80b43e1a3e1ede5676006a991cffc1396238 100644 GIT binary patch delta 26 hcmbQvI-PZbEI%8A00T3FAOkA{Hv=yN-$tEmCIB{E1C#&& delta 26 hcmbQvI-PZbEI%KE00S=r8v{23D}x{d^G2O)CIB|<1C#&& diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json index bf26b5228..ba1f5b0a6 100644 --- a/transforms/universal/fdedup/python/test-data/expected/metadata.json +++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "fdlist", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-18 11:20:38", - "end_time": "2024-10-18 11:20:38", + "start_time": "2024-10-18 13:22:42", + "end_time": "2024-10-18 13:22:42", "status": "success" }, "code": null, @@ -21,16 +21,16 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 136.2, + "cpus": 32.5, "gpus": 0, - "memory": 23.89, + "memory": 13.31, "object_store": 0, - "execution time, min": 0.0 + "execution time, min": 0.001 }, "job_output_stats": { "result_files": 1, "result_size": 663, - "processing_time": 0.021, + "processing_time": 0.047, "input_files": 28, "input_bytes": 38040, "input_rows": 44, @@ -39,11 +39,11 @@ "consolidated_rows": 8 }, "source": { - "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", "type": "path" }, "target": { - "name": "/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", "type": "path" } } diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file 
mode 100644 index 0000000000000000000000000000000000000000..79fe53b6203893a919b73fc9a93777fa66f579a6 GIT binary patch literal 1513 zcmcgsL2nXK5T0EY+$N2|#<$r`I1r^1mfJaXzCeFOwl`6}@OD8b%X5Q>K-?#5&tGsS8g=N`p1p{mVY<}AL zl}*0`NI+mp7#Xk)cK1HErCExlGP)Y==xWrf!DdVq0R-w=89>HFk)(ud{*W?EG4>#p zHuSaLi9eaPx}L+a~1#=*`?K>fS5n=LG6oMqM#0R$rRksW5qQa4H9oR_cm&bTwMh ztZ2Qc#^_SO{Gu*&vA=b-PTf&|IuPAEYJvDI#?y${L zc%Jv>w`yj#W5s9$T~Su+i(-jk9LHF8T*m!4Er;4%&H?*UEygAn}Jf z=Lf{suFqcQk9u9Z&~t{3X(!)2MSc%MKAa1t_)xJ%dTh=een8+z?vNr`i2o!=geO;` zsdjVhjE-`zc5%*}s(pGMsW^leTEG|d?6~c}J#(j9&Un1l_I+=jow@$x#2aO;2iw+m S&e+cY1V1>$PjeMN^1lIVM)pPk literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9df2f3bd54e13d5078be076585302c2d0f4e93af GIT binary patch literal 1497 zcmcgs&5qJg6h18sb|Nuk;%(Z51u@#pVt%M(jF{ORn3)D0N5IHvHv~$7F$2ZIK;{K} z0vA4laj)@Nj0+bodtj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(A|*aNsX9Z9<<${SOFtdE(DeZY6^=Bh4kp3 z@E3S6-c9-oj6cGI2T#Ty;Kjt5x2#m42QN%u=FRusoB2NGg%YpWOkoLjr^sXmn*dv% zc7LZd4xm724A>^SdtaPVnVK}TXlJ(4VQUGB_eHh!0)UK(B1sBZ-=!o|%zaQ3rm@~W z3x=b5%Xis^uIN{i{#CM$E$x4rm_!Yj4CoRV05DIny&}Pppu}5%c&G=|X&!&y!K9Ws%zO&4iPF=5B-UWib&PVCM{MVlq86>j`Kpc8ToVkIRUE3s(@ z;&_BI1rU7Dco^$sb8>9Kjw%N$)+<<#qXDs%Y%n%K0{xpj=0*PIRk^UctwEd9doL(o z1^x4~{(2edO&6cjauz);w-2h+x7v26MkE~S9dY8mc>X^p>;-4T14Yte+TjAOpT4j= zu<)rU-@}Te%gU#v4FD~sMrg74E>di-#A~ tc(3NZy0iy7R_BWU5^g~{&Bn|ZS{zu%je&GVwc!Ysk|b4+5e2C(^V z_h(#v3s4O~7;bOnswf(OdF)mp0@r=W zbJ?x>3920NSH$jW6gOwF`w*G1yNSF8m7!DV+Z|_QPf>gm)<1@j(7(wOUhHpPl{35B8n#_}?*-+n zpnqP{UoRoOtHtNEoJCK|?SU!{thU{$F$sryTb#Ntp8wAYd%@Z8K#{bVcCdi!r!VXd zEPN`;_pl=Avhpdi4xq)<6<#bpKH2w%q+W`Ke?DYsN6OQq3C^R^pwNK+$UN@UAs>ZN6*5_KbWQ9dz7jZUdq zH%sT`WYNuU=Usf5iEvOZ0>6-T@wy=XKC!XEGK4Xfu=zD}>Q9WNgyr9N= swf?Jfd$?`&dfT;r-xT)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5601f5cb07d71179df35855265cde6b0634c130f GIT binary patch 
literal 1532 zcmcgsy>HV{5I@_A>!N_ts?V~e3>m=c(2t~15velpl9nhCN)jN24CKT4Dr!DR(nKBE z85kH@85!9a8TcC*PzNT&j2HlM_iVR?qytM&$vfZ2yZim#Jv+w>QRZV2wv}ZngGGSl zck92x#v6c&0zL?UBQjKurHaIWEir3_X)K88YRJ&M1VdBYR2`OQcrhp{U&;f>XlRnG zyaMaD6k$_-kV|YIe_GXrh_2CQ(0tJ&(6<_7@C($J*U06 zSrDAhc=MAK)%vNO@?(GVD<3qvtx;Rh{3z+)B^>PkrP8UC?F$`m&Am-^pxT}LmF*QqeY?AL-OCJMU+Zqs5BZ$ z-WJXtt<=bsBI!~({?v}fBeiJ!sI(Uy*E_{#EnPe;#R?+7ofG&l6XBp7d3HV{@Cpzo z&^g~Gyy?RDacs9)Psf{fr#Py`N(YGVhKq-DffP3-^Ta33xy`q!ID*?DP2$$S?HFRE zB}HA+rM}(WjXc`GJ`?h`Xg@-+sh%qiUNFM_>fqU7W3+1b`>WN#pf?GR8-w9~uN#hD VUyH6q%tQ#l`N0l;sdM<<{{tHM{bm3F literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..02bedff1c8ca8f026eb6a3d7664517b0f74cbb2d GIT binary patch literal 1526 zcmcgsy>HV{5I-lW>q3Oms?V~e3>k{mp&xOhB2smENlO$6B?%%eS@PjHNm28GlP2nj z#2>&v!Op_MU%=|B zyB@0mR4=8fgnL$*F-dOOOVE^{_eJEUqqr!@-6{Cw0>L#fJ}g|byNA36m0_>q*qz?U z78Jip>Tk(-7cxEtk>qsQTbj)zIcs{vpG0yE9mEG8y`Rc8KlU%b z%9-754cilw{JLnoT|}DG#cydji>9921x+4UZM#!rG7k5CVB+4m{*9A%gR|j*D(g{w ze-77hes{r`_f*vHAyqaM^~=I4fF9MtyjXm8yyp%{y9{iTjCqQ4Uci^+3FjDS@kgAu zc)BHGDNJ}uUun*>WYPC6dbE^&LGkCLM5%O^N~4qFZ4vB|rbeEW$d)t7SC*NK*UjWb z`PdvaI;CbkQ#vmviXy+A7x*v};h7)1#llY#9vt+|M-6aMPB*y literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bf131f43cbf10180944b4906799b7d6288c54724 GIT binary patch literal 1523 zcmcgs%Wl(95FI;->!J$Ms@JlmEFx5_F8YXxib&PXO5R0x6Yjz-J{2+lO3pO3uGk4~WXU@5EHHxnkQ{^B*l|z220-JN9xan5DQ2-FpkpwZc z0s1c?#=_bjNKs8)Zk{=VVWrWt*oq=4Uj*fcpuf=N|LLM_Asb*Kpo*daSj27>A`tZ< zFJ!msC#Z77UlF_6C~nSU_aQQ2cN2LHD#KpIu{*tyJw@?NSpOJ8lCxzOG@DB@%5LYF zI2obL0R$g39>yBmm>yfOqsqaGdIjrAG$6K|4aO!&pnsDmyx8BoD(7~$HEg@|-V4fC zLI1p@zg|Lmv&9#*oJUW~?SU!{thU{$F$sryN1VDZp8wAYd%^keK#{bVc5n^XPrtT1 zaP3o3zK0b_mz7VEbpS1<)aaC& zb+dF)P8Qw#cHYH@nFt5vGVlvo7q1KA4|L8C39nfkJx?At8)m9$bxOl}vV4yC?)!K+ zcah>p=>qXFIk)&B6-RJKWJ&z`4?V}#q%zdiEcdPMapLqY_8C*OMc*S7i|YB};RQ9` suQ{(S?BTZ6?{C)}r#Fs|Y-e!R>&A_HJH}2zJBR^z4;m>l00$Gw|Fn&mB5!d%2k!`pB#MZqFOLbF6;GjE^zmb+hY|S^vySTFNV`XiDVqtjM2b7xVh8%y@~vdDSo6 zerMF3qxb%n@s;n!&E`u+E~2NO?1PpVI$gKd5D7ZRd-B{B3FB*DI7}{v2db>6^@A1M zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0mccu3lJd5>j+1w^dS%kl(~5*<87>|n7I zqt%AK}bR=(}@YNKY(z94)L zr+5$pq_n9}VLle3gNNLlz#Xw=nf8Ai27zW(6)1_&QGiUE}Ar~Vt0tMqdK literal 0 HcmV?d00001 diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2838dd9727770220dd6b3f3ae9f0d4bdaefd8ec2 GIT binary patch literal 1510 zcmcgsKX21e5I;MKaZ!cRs?V~e3=t|;hyIC)ib&PvC2g%hC`k}0WXT`LNs5{{IBBA8 zd=wTI1_mS~J_8doV(7p}Kx|0dd$yYdhk+$0`|jPndw0K|?uhZFiRp*LY0G^=}LjE1yMZDtF7b#M06xUkU;+-BuG@-2Pv+p zEA4Z4II6WAi>xY=@>x*63;KyJ|4$Qb8wmgr0aY9g0QxC%P(ZXdgw%Hc^)-k<%p-+( z>eoVDz%v^}pCz~EC#Z77UompwC~k6cw@U#zF|rBz+t`%d9pp8vjhvcm_nfiKDSi`_ zUxNN7s=tpQ$#B_!G+Rh=(bNLa;i&~X^Tfi@fY|)hzRFWxWS&>;!tQrQ-I)n~yrutS z7bRSr(F;dm5k0N252`e@x^}NYBpm8Japt~w{$F3%3oeESilim9!zEn5<=p{&*;7%z zL={Pwl@GBE04<@e(sKFf*?~7C^;+CxGND;YX$fB@k154Khn`T{p}7u^WijCiTjeOt zGmCy;(PLY==Pdq=DPAg4=OtVJ5RG6)rnOByPNi~hkRrSmmw|bR!)2v)p z(`BCDEpmLAiEvP^0>6~!csYn0=#(BYyk>d)GJV=^nVGiLtBjiI>ILF^=;NW3BgI46 z660f1YSANB9Kjtk%fzq$$aA=6D#yB-)q&MNO})5}eI_j0VdoKw#p?Ou;RQ80Xt=L0 o?a_`k80<7$*O?^8wmUp``bp!#uCbfa4if<01IPGNE8;)=AFJv2-2eap literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ 
z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7cb2cbac4ca976304da8c64e8db39c678260db2c GIT binary patch literal 1513 zcmcgsPj3=Y5T9L^b(=H>8{cL(;XsUTdax~9Qd3MX4_Y@UR=`M=3xQ>UHigB7LVEP( zC-D<#()bDb2|Ri519t?-)3B$i>jWel(Zu=#1{ zS0?oiAOV3T!^nVbu=@|OEyZLer!^(o(UhoHh0T~M0{EzFB>)i5Q*S(Jc3hjStCD&xsNV&nZYcjVL?>#%L_n9q0I)>ZoluY*#zkRKho(e5By6#Y z05QToBfTGn=LG6oL|rl~R$q$UD>HF^a0&+ymgjjIym|GeR#7*7vWNl)tsE4X>a%I?6*Per|!RLM}(kIB0LdRklOwc3l5$H5_K z-^3lV882|otN15-#yJ*x{0ZkhUhMg`0yaFSRgv=&Ma-j!8Eq9`QT-)3AF5a)XiN&c z?-yP2G$^z>#fFtXv(0?AZRRfsCU~|{i2bp`ThNh&ksA14%%guS4%!$pTr;P zoF5Wjt2TX;JMMO@eAgb-C+%G09Qo~sd^q=+;zNZh>9IJs`5}QLxg&~XA^y`K@jbaB zO|=>$dw86Ey^C{ZRPE98NW~_+&;q`oWkxOU?S(VhvPYw>mgl*%%+&G5r|vLgKH4_7 Sv-&|AAo#%%ewr)zk^c>+^Y&B# literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..79fe53b6203893a919b73fc9a93777fa66f579a6 GIT binary patch literal 1513 zcmcgsL2nXK5T0EY+$N2|#<$r`I1r^1mfJaXzCeFOwl`6}@OD8b%X5Q>K-?#5&tGsS8g=N`p1p{mVY<}AL zl}*0`NI+mp7#Xk)cK1HErCExlGP)Y==xWrf!DdVq0R-w=89>HFk)(ud{*W?EG4>#p zHuSaLi9eaPx}L+a~1#=*`?K>fS5n=LG6oMqM#0R$rRksW5qQa4H9oR_cm&bTwMh ztZ2Qc#^_SO{Gu*&vA=b-PTf&|IuPAEYJvDI#?y${L zc%Jv>w`yj#W5s9$T~Su+i(-jk9LHF8T*m!4Er;4%&H?*UEygAn}Jf z=Lf{suFqcQk9u9Z&~t{3X(!)2MSc%MKAa1t_)xJ%dTh=een8+z?vNr`i2o!=geO;` zsdjVhjE-`zc5%*}s(pGMsW^leTEG|d?6~c}J#(j9&Un1l_I+=jow@$x#2aO;2iw+m S&e+cY1V1>$PjeMN^1lIVM)pPk literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..57642d19925240e1a86b323222b420617a75e7d4 GIT binary patch literal 905 zcmbW0(N5w}5Qc}wkToG*m~GQ0TyR697b2BtWW&u+wLw{gu8V7L2$WLX0L4-vzJeF~ z3Y+*2-uV#w2>T)?&YU7avnINec1~x`Isbffno~#HkXK}o*Q?}U9bn^r`!NFv0DUhl zuV`6FYU*n5GMq%MZs5u(1(dXs(?S2VN%OQx0)V$99ufR 
zhteNup(MXlY7M-RAN7|6@1z%kT6GuH&j}-Bg`JMCM*a<1K!qOC!u`iMVdoDD$CXbF){OKCSe8(u>IOoQ+J8bqk zR`c2^*kXOZF8F9A;gH=t$Bl~M6%Y$_!~^EF?CC|J*6Uiuo;z$toq{tUzWq55q9CPB zKj0nQ_+V{eWaOx;}Nfgiki<(2)QD literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9de62574605a07b28b991ff2c736fbf6e3a7f45b GIT binary patch literal 1513 zcmcgs&2G~`5FR_Oait1{inVMhhX{ey18w6*MFjP-NsAQ-B?%%0U!ufulAltmS!@OGnx|bXiD6x!d5~R0Rq(ZGJuGQBnTBA`oS09YdIRwPJ{;-av)LsQ}&61G@H zfCOP5*5<-)qNH(i;xⅅ7#3Y>fTGn=M?H&L|rl~QD2(fsW5SVa0&+ymga%I?6*Per|wRLM}(k88I9^o+K_>-Fa+kHSOJ zzKOeJGhXDJ*YHpFlyfZf_+!p{ywnS7MQnIRs}kpBikQa{GukS>r1}eT0aU3>(3li? zKPYXms0F(f#1fE=p zrdsyM86M|e?ctmmReSV2QgH|`vXC!m*-^`Xd+tuQozZB!<@??&J9Yi>sW;4;4|dF* SoPLl22!C*dpXMrltj)g2>17P#h?yro3 z07(c%fY=asx2T*F36V%^N>rJ7Dr_#u(*B^^dI>6_5Bsx zIAdjZz*zOEs2}30Y$)oxwc7xCT1#TJ`t0;kcu3lJd6#8^1w^dS%kl(~5*<83>|n7I zqt%AK@ZR=(}@YNKY(J|}z+ zr+5$pq_n9}VLle3g9qH4z#Xz>nf8Ai27zW(M%SvVG!3&e!nK$$1&3xa^%a(c7VhT&LJ0&JF*Z|o4 zwDUWya)3Al!hmhCyZ1#_f+bSA8t%+j8f-3!;(1YRtq35aqezkh=65N@6k`t(Nkd=j zocg12qwP6tT~oA6N&709N2dBeO$>qtOa@d*Gyv$Q*=~u#A)(}3fLj3{h|O%2SeT)! zVGa?CMzO#mv6yVWi_eMOL`tK^*lYN1-%D_qZ`czBT)4(zi!=f(cz)i`qp-Er^A@IKryf6yI?7Qd$DGI~a75A?*y z>AC$TOW-{3$ycsQ;4ggPAh;YJXo`_C_E&K8R(A(hJ{9d!(G*kF-p6kP7%6?7SF6uX z9tA_vzsWl!Q=aFXSMViy!Z`-I{D|`|FLXsLj|op{Rp7ix7X8qoM_Yv#6n{=ilqwXd zG&*_S6G4wOHJV9{Y~9YicC1{cW#umFC)T9huXS2>?Yy3?iu_(#;KNLWgK`o2m7>5a zK>UTy`2pdztCLsR<4)Vob)0@}+{)I^5Z}WY9?k_)Je02xADeTBA5d`wcSx3GR{wF} z2u-R$UG4hN860O`?qZ)QMZ0t!p*U1;CIMd1)5E6!=G+}`Im6*r)Azk;dgA({Q*V&A W9&B6N8Dl>M5Ik^*f3PzC-2VX7ZuN)& literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..37aea5168fab44a7bd45091bfdfe5871bee8d360 GIT binary patch literal 1530 zcmcgsPmj`25T8=$b`cG`@ilG2frB<2_77dMM$8_3u)7V4E1%K)i& z>%Wrm*8pMgB<5QJWSQJrAyE=15|I+B9IU8vkgGsyMidM1s*44H5IRvo5aarHAwk00 zCP;BjU2Gq^!%@BESR|%Il~025P0*{l{69^!6~=&s0970f0Qx0zzley>h^0OPyjlVg zA|SzapIP{62t)z5DI|4~-;%$s%A)_p_)SM~k@LHD5bzVlH@l#}kDN2Sf!v1mkyCf= zo-?*N&7XquThQNz^$#KvovwRDx0yu$H9^5gKRv-Ry=Umj(TF0clPbzoO+Vnp$xaRB32+?OuaOIN;l%3unjk|N7Hja5g+pqFO@R zn!{0?-&xS-Jr(6NqC|CB`5d_kpe588tyCWG-}8o~ehsZNna~`iw2Uv4hm>NVLw70d z(0qr-a+vUledQ@FFpGX*(PK;bXDt4NDPAgHV5QN?(Jl`50~csYnO=#*|Vyk=#5klAau%xv50RY%QC z?FjMR_3=>3k>aLYnej0xwdgi0j^K8fW#ZR=;5l3~m1kYe+Q91XrJrwLp9zb0*nWg! 
zv3kCEctK4L8t%(ud$ei|2CEI%btcKN?G6u}e$u$JW~`;PtptGgzz*JJCA{_j0OaiY Ae*gdg literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3d1f158e9e79bac193f88f94d2b548b79827778b GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJS(ac$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1h@mC2oUSy{w9@^A|VnfO^GT~Plb&+S=t|TTP*^}6v&b!gYiR3iMYNCiKMQrw$A*) zu-5b(v8KxE7fJml8ApclKST6QA^<`LQb{rZ3kW4cKD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t uf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEVGN)|KlR_NHuSUr literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ca5323db5f2275b7c5a497f7b7dd1b0a67cf5939 GIT binary patch literal 1497 zcmcgsUvJV-6hAEtyJRuAc$+riff#M}us>9?Ma*6fY-`Xt0VC7C5GVz<42p$8_67U| zKKK!g?;1ag@xccl{1PUf(^A0=5`E#MJ%8^#=lAQqX$dQ)h>N7ScPwN9>i`>{c7CN5 z1c*T>0>rwwze(k!NQgvAQ=-b$Q(>#*v}?&k%i+2!N1*RFVw90z!8}KD_Q4p{SxMk;jB0f0~_7=YBJHzT{%=j21UN^fgnDx)Rq=meqil#&!&x-tMc0RAq%9NM*n^*0^ z?X`!U8G7$;8DIHs+-$yJZ2aIK(iuxh0%7&u8Te%CMr?fS!RGyzc4i8EDF7L36F^`C4dRd+#QlgDVh;1yi zgIt~#JY%nbSY%6aWGUiO;U(u^unJlgirkulJa&Sl$C?LCq{`N|a_5ej%QVc~m3?ZC zn%!!vVO1~fY$aIVDFygwCBY%NisEuHz#D-010C^z@miJ9>+DIZY2{i@w>oTO?F+*9 zXo3eZKuVkPW#(fcI(Wd%3EUxDmP!97VGw9m1)gfzeW!PldG(Oyj5*up{RG9~eiI4t vf|l;r{Wq8HaLeiUx9Yy{jngC7ADnr;w7Iu!ZfErU6hQdEA^n_7^i%&05z+J| literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06b4b7467bd2f97d25d69e1cd7f3690aca9f91e9 GIT binary patch literal 1497 zcmcgs&5qJg6h19Kc9Mx96K~TdEQrx&7V|@gj1e=N12fa0;|LfT?S?=pFlL}w7|6VU zPvF8wFzz)zi*ez?g)d>^IV}~OfkZc)wCDHUbH1A>=)n{jq!b8%&%eyQSEFfZqUX~|_l<43wVh4+z zAXlITPuVLXme^7pS&Fz+e8KtWtb$g>61S$HfZZVJvF1Tjsj;=K{A2STKw42<95IbXmGq9LDE<4uke-BpVv@1u5v%nlfGtK{}wpq0xsL^P|%? 
z)It!iTE~h-=XyZ8#$1jerle+L(%BMRu2v{nOg4x8R)@Vqhqr{rWBh*%avcFxh@}I8 z${0K#2(%w(vj#>BS?oW!CwW+_r$Tc!^>TgvR;;Ehjb^W>rC%DTW!j(jqK~RLg;%D@XTAQ>+`N|aoE_9OjR;7<+T*=-d1vynh=sAc-ahFVQ5f)upxq6#H=1VDIdoGJb%!`WYZzwmjfD_T zOz_%x%S@-?hGJfr?)zht%aCjGZheNAJ2PoqML)5Z|Z&)_}IN>8nA0bGHQ|Gj%c#d!l8VIWqgpA56-?pJ(8pCJnZX$71T-V^r zYi5lnendLSYnNsYD|hui&$Uj7i{|$}4DAi)6Kk7(gYO=Zxp!r(Uvsf~aNX^=`j}TG z2Us3I6)W01Uk{tk2&{ACXQ#f8d>9rI^kjb@{l(@=VT8|^x;8XvcdP%4xKo3H4r>n* z_sbMYl$h`kKI-Vxaz}l~SeYVPkePm*?->xXJy)5t((TF8bsK9^Z^fM+kT*r6myekGFC+lnWE?|Zl-k99L zKrVzlUEE04M(U19TC#{?Ze7GMYO3s;S(vtW>BKclyCxcSGw69wh>?>xIr?WhwvG8# zpDr+lqvzF@>Q}Xw2wz&)|9YA599s}s`n>NAY_rlrwDaCwlR2uc@AG=(QOC-+T4vnl z28i0=;EUVe|Fo`=^TwlZokUEPRS`*V$MP27-j!~(d-iqM3lm<+{d3$3VjTEKub(Uu z_c|w3Dsp2IU!A{6*E5UVb?7OXcG_$&>(_6=*Gwz%tgUl<=7`W!0fBA}T*U|Fko!WU9Bg3IOcP!l(X|{UZ$nTP*@(tkeXf*yAOZY$@dBhKGZhX zA;#UBUZiJfYim0p!uHfYbzf0afKKkc7xN+_Mt%Eko=-T^v)&{w?~p-kb9LxV=jt)G zZqM^V^Rf?*J~6`I+BGOmA*SALuF)AaL3ic;?8MvF)v3w*haJd&DYJ24{bWq z$F{YFjs@9)+Hj2;OLUVD9tzkq%{1x4gca!X(>uJfJM5OGD$Wi@@x9rF+zpB64F~mA zEy+Tovqx9nj;ot{rbkbbu43OiJ0s{>{n%pejnD-P*0i^Oa%O-p!fnCFEc9>MCehHlnoVed&ole&a(MmrVnn z-EIDD7eWi{FOubC7sE6)bA;!zw5p55Q1$b31{3}3<`*QqT6cK{-tRYrYu7|yJbKq% z&UAgQ_`-#P=7Le7-U7PbkjSXai$z6fB49z zMv*)vX1~rw3sb+qVUK=s8Ft(C=)Qq4_RD7-i`o}8FjxOR;l%FyXP#^mA+7<&Q*uvV zh%_9(nPr)oU;e|d<_RX>o5q{2NOBs-$UWCr$852`bg+9#n()V2KbM|p^83){ymS6k zYJ@tajyYqGe&bJ7-CYmpMwi6A0esDo4Y8AjaeeteEc}k|jC<|O94@u73cr;(i|}*e z4Ov1zE#7M+e%Dl7@2+!TV*4ns%e1a58qXi$M_p+0sAQ^(T`h1=eb z+)W!Ut9-l2>UcpU#W61Hc(_`3(VqRLtNE>OeiIF__kF0d+u}rwt_tWH7Z+WnH2LAa zkz|*n`;?Pjdy&flZl(KsUv6(0pGBzgQ|p#b2=jRn;VZ5=6reU=JAa(n1sBF7+N%!{ zk}z&9&Cp}?mJcnv&+3JY@r@1i<*|K_+%?EDpz}C1^^n6}vxSBwn;b@>wj$~%Q zaygfl`@UeWSj(C2MrQroz#f0#yN}DoTaQz%~pbq^96>;A9x1UES&V~b#G)xLdprl-(27v_3M1lHW z62h1Y7!%_NDRq(nc33h7!mdQsBreBgTDJuO9{~UWzlNMZ0ah_spl|^2IV#gGq43Wx z`Up%j>Gy(-Kk50izDJ;Y#h&H~!b0SQV zg2@o2e!N-&K#5UOyLg3|m_%SHi4!1(l*${n52frijw3NuD){P>_#9M1+3WwWM@tKE zwCtT^#bSOsS~5UFpa2||_EgzgNMk?n;uVhL{WiqvYYO_Bg1)ApuPNx?O+jD0Y%*ny z5^C+|2bxci#LziuSIw7L1$z0KE+#6x zKQIEo1AK7;|MZWzwVfX)`wLREFSw@u3mz?W1n8T|m#5C;ho^}{Vkohezf2IQ)hGD` zMKcpv00rWJ+X8Z~8l{{h1mT2`Bsjr<6Rk>fcy#r2c6D~)`hXv@j~_^H7z9Otzrp?q Du#EMC literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + 
"filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d67b5bcf87dc622ba80e30d210bc1b9b4429fbbd GIT binary patch literal 14933 zcmeHOYgiLkw+@Ji5meA1DB=bYY@39OqT-cs6OfyLPz9`$$&idpX5!3*K(&@CUfOCE zue2&v@Jef~TCG}ZRn&T`7p#g}QB*#w^@dv89<|@M_9P&BJkN8!?>W!;mHZ%?Jv)1^ zz1I7_YwgJ z@p5UtC-t8B=<&s=EL1E3;f91H(Rh zUiYNzNoFarKGHSEA=x3v!Rx~H`w2#fw)o?VFtZTUGy? zN!4WPt}&-gmlmCm7mi68c;xD!4{QDKq4b>d0f*eTmHEaFn(t+#Fs zzj^6w;i&!JqncS?9ADS>Noi`2sg>;A;ya_?if;6sn)ozg$^78z<%5SWEU*1+@~xds zUmWXF%dBildvE6KTHldxOiliL^O$ZwZrtcn=kD%a78CPlxYzpu35wghoh#~|99}S> z*RI{e^6IN@4)L+fcbGnX#JHVB+WG1SPQ7c|&g`J*5twE$eYXEW=8bRGMIT(8w`1Dz zZN`E@R}$}J%70b&cfDFVk19&MA*VevP#bi&mMDxdZ8)D z=yreTy7LuP5AyPxes*)~?leKOBl(}_ruz7H?=(+1ryiHbX6J+*F0buQog@M$abq8U z_jp%L-c#CbTwcYh>Zv||pL#s~M8JTacVq9pu{k2%DW_+jPJdK%v;>B(*y6%|weIqp zsYPQlYLu7Pe!j|e@coEg3g5rq_IP;rQ2^i0yC~}Vh=A`NCtdrm+m#cxnlJK0g1-Fl z+~gHY%PR~uiHk?96L;i1_1(Jmn_l5>mQFZOnB05TtV7XSf9t0mQy))IvgiU{~EcM)QT65yO^Q|y_LH*k$+a6Mt z2Q#YqDZL~8wf@kkvSU1Zu5Z?3 z=8NR%YcfB+a zYt#?URZXY*Pbu$rVB_{_yO;Mb9aKn_udUcrJ#K2*Wd6vTJN7Qo_uVo7p2^kyjlk2a z@z%YILQLX?Lf_cy4er0!S3O;xAZ9K7{>LW6mGT2F?7It!m_6TIiN5(HqV2R!-ICTX z>h(tBPq%-W>UOB6msf{O-QEYW?~m3r#dKM?G5uNBcXy3E8>=3cernByE;;Ulx;rlm z9)HF=^~C7do?>F#xwFDz$38N@d(Q8tp!RpD@`II@+%w(w{5mgU&7y0x{9Z(tsS2+# zyMB9k%e1dP!-r1x`?lX6&Aw|##{a?p>^n1Pj@OJ)4`Onh_KfPeW9!yy0KPj%0{xx2~e_ISK}^XJo_=e=>7UcXJOK|y=Q_PgWzcewuP z@u+unCCsPDYt z)~KF_U&bFOry6=5ubk9j+@Rp&qgHC#$U;gCjp)>SzEsG)yRN4@MCR_VTE=$JbVwbx zwcUkdY3JIwHHweCR?6Bt`o_$jU3Ast^I^wE!>qw|Wj@~>pR_v8=fUO`yUZctt-Yfg z%bpHJF8tNV{bw%6|GGOxXZrnA!_1jgN2mX><$T5XlXo2wt7x(R&|P=(5>FQmsvcVS z>B8>Cmp%vm)5`t(ib_D}g}OZg^M$&K|5<>`s;+WBSPW!}lPa~wjVqGl7WoX3Qf z50@`;IeRK;K@|1Rh=&7Ig~b=Al|LJi*1z8%s>Uf}p7`a0gsxkDUbLqlda5#g@?`G8 zGOvYonv#-FS9QGnC)pIbxP#Y0yV~mqy(TpZ0up)!AFgKTNhvT!M~n{+38Sv}ZxM?}QuM{6otg&HMP|mXPhw zG`0IGKl9yuefZHG`&Qp?)4AKHe(QfatGF^acv`wg&Y8PApLvYvVeD`(FOIL7@ZO^n z3nN17+mf==*GIcJ8tNARnB99vRI%UIDE$pTw?L-u57f=PpocA@ys0G@^1SNPINf_(N{;m%ssGX|F@$Y?~FSdlKxi-f3vM;kkTU{ zzWr6du3y#pxE;|qg?2dG$*ZixmD6kX2J7#R@|0CNIs_P(kNC68CrbH4&)$c&$&xST zG9Pngzh9{we(nI77A%@}&p2fY4CE#T3D-I7WxkDUqNVxUFGH1IG&}o8xp0 z!RlaYhCzvDK}4x$QJ{4c8p-4KbizdIgduX6;Kk(894peSnG(Q4o)9TLO`;UZE`mX^ zMJ9q3u#IUjvLKJqWW}bWrKV>pQ37Wnl-NXdi+w0KExp^5b(|@aG(WNr1d5?ZDLh<0(n|hg2_uEDBLx#h z(gY)dwI&EK#VCS7Z6if}%funRh zfig%=6rsh)!9tAS1;l{`yr6y&5%3bh2pqBiJeXn?Pw8P?2uQOl1ha|aY48gPIO2=| zGlAj@X%Z%)S&}hJ))E-IA_dD4Rr%N!(m?Qfc!RMdUvNev1Qqb5M?3}VW|4_wXtG!d ztA_5_JIKO=6LAl`0I69p(AE~)l!98GiLn8(#IV7A$rjX#9MYO;;Dgx&FvX5Gg1M|{ zHX;ZdzRo9%G%OJU4eM(K3v8+gO>8sT^*m=04G<`a2gR+tWkV~*NIuO2v9f6aOb`H@ zt#Jh;9Vjh>0B^y4pd!seq&Sgywcj9Xz-SQ}AkKJZNe8TK02&HNOBLf0@SMrC&H_HB zz<(I%CLX-rIu4l7V>!M+i8KtMC5(h*L}~#;7&s%LMjD-t#!v=b0M}!m*xVpZZoW3`qlA}N#ANQf@eihMvV2@XmGplb#SM3~}109i>~Iq93gh?g}GgB-%oii9A7 zthIy+gWN1Fxfw@N1n}4#sJ4~utyEd*j|76&p+wq<3SsG3+m*mz3X3afI{>X&>eR?T z4iowX5V16vfJ_3i5KIBw;5oD2AWfv>K)02M1RqEo0X~^oo-QO9fMxTt5mkh#NUZ;)_A(UIj0(uzhHUjT*E(fK$7 zyo5$jFJKx3(PkUK$4CPmKo}v9ON@OP)Hg`nYpWgah-wd!Uw 
z^&ksFn$yw{=$A5+(JTPrrR7N0;{pQ$pqKKv3A%HTQ^2H`B4o4K0?5RKf*gW9g(#zm z;ff)@V7!Nhhd=-Y4)Q;U^bjl0iHVT$LFcRq#|XfJ5wu{nnPF5S4ak4t7m0SDOIogw zQW*)pfPxfA=y5v7NCr}XplG2u%EzI_Igx;*DS&<#!U`dj`A}d;T8=Cr`G{f*X`Yj0 zP>te2z9c^5fLjFuvn?4C36xe~egjHp7{lcwb5%4IX_yRi~3}tX}W5HR6fC^Nq29dQIl#(QhRl%rO zEN}%t+7gig=#Hra17r6>orUW)h!f&i28t@EB_OFo6V?TgdLt;k8q`TGj|SI@P=28d zBg`o$ZI_WeWio?F5D{GJv$)EF{_-puax$Po4-96cas}2k*dU6g!NI{6i$!hXfjd&2 zkNXGX3#h`N4iqRM7-j}+n0b}1Wdos->PHpkg`kp3S}TQNVigU&Z5s$xzmSlKegi`X z4Nx0IBO@PYCJQJYb`g-;Sd>IVZUYB!CaKmPjDiwGpap?=S|}*S=$Z&cZ;)4ATZtVb zq1b{vnMCJP@?@e2C7JaQJ)V+}<;)OKHV9WQz)k=bpk2#BeL^7Hm!vjW83pqK{^KB} za~5QvplpVS0m=l@n$sqNC&5Rw0EkGj6*4FjNP|ltL4-OS0s}g27Gam8M44F#unh%> zH5=fOShEQ914md>r3m@?g{@d_v3{8-qCm<-SPvx0ra@Zt1`&cU2eH&yc?0Pm6BhFm zKyJZ!ff~(7*-9h^ixEQ?KnL4G)$)GfgMoM!lnsl6HAEA<7UNSQ5>(aDz@m79H$sU9 zqet;jw4f=N2T)O@Q9NTbi=ct9w~3Lzt`R4F0-Os;T!pg?PA#xz%FJ^Nr3BY8dvxYkaMG+1dDuR&$dH0FksLNDg4W)T%^~ra=~= zz?YIK#1)UrV19}(f`Szq2dUX$5*lo=SHBRCe}AX^A0b=*`ritUYkd^1w%-a|a7_qg z6{MdD>fZDG=$*j*y#x2R@v-mit%7CW+e?VH@9ph-d;8uV^1M~`?R$H0gne&s-`hhC zWZ&E4=SB9ty?t-r^3(^O9@_Wz_Psqkv9j;&TQ>amz5R%5gxM!WG(Dw*7r%Nj~tDz|+aw$6NXTbc!(0{s6*$xWIn6 z;H95%?1u~NhYRe73+#srS{+@pA1<&TE|AVuzH*ihKY)yGIqzpbTwp(3U_V@7KU{zh z%-IhYv^;CDA1=Trt1|v~$MFBp9zb}meBaY4(8cq~gsw7~w~v>rCOthhd$9F+gjb1U zFn$fxYv45+|AE&ijmDyh(re;08oef3x}}MV(rA+L&1j7#0l&j_^E+JOk5MnKQ(|Dy zLC_XECKmqUFTqRtD1Cx)q5ni}Sa`^|?C=oMBDKQ((Rf^GaF`s%8>@-3Sm%{)O5LD~ zLHfYix>{Q`niTwjfbrfAV+!65ODWXGXNg+Le!V6}8jrM@f_uUY@RE5UI5#!POo=Me nWKt1u%Fu$~jFI$D2cZMhq3RHCB|MM8k7Rywm&wxLA4>lPWZ^x$ literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..267e78385aa2df9afa5ad324b6090abc65a6912d GIT binary patch literal 3068 zcmeHJU2GIp6u!GGYZrVOL`@AE!v;$(w4Rmamqs6uq~w)C{zP?XK!bQ&dyBd zr)%p2Ubx#!+mwyv0QF_?+Op+i{+J=%~>8 zMYnY96?fnF)mfi?iRbJb{AJViM(yS2e|zuF+(*!2RE4(PI4!hy_Pm;mM_%hGs@}N! 
zT+gohy}f4!viskNbvSSEu)nM*aAEfQKb9CzleH($duI6Eo2O2_uy4+h z+s*|WD)t;|Us&|Rm1M;`JN6vkb-Lu1aOcgp=+`T}^Xfi2TpjC~7r*uJ#{+M-1W&#vp;ydxAStid{*#OOZlQJKfgGmzqNAk zldhaoEl!8**-E20aZqSnfu}2{dgvpvZrV%KwwY3QVSHtP7N?I)siTO6-l{6k_1CXb4y)hN%$=H<=n1 zbzD+bP*RQ!F-3KpCR&PW!}|jV65<0uE7jm9P+}TllPTeZX6+W{HA;b9QJTjh3URHe zL99B2I`Z#yK~Yu9V~)y@1vqLZG^Mqf;Nl7#4u`R$Zwx35Deu1M^|QO;^<5i^#8S> z|MTehwlh9gD8u*Nj9CZ;g8odlu5QJe%Hfmb-|Q^1WC72Dth5@CN06_2;}M3Z&(*wsh4}XswSx?4dmZUchfmL`uW!)|7_1 zw7m-Yl@_l}4#t6REnAfywreYG7(fJjpo~C=S6Q~k8W7-Xh)B&1k($=TW2=pX?LW^( zZ9Z*Ab4(47jbd%8_xTj?;P%JFsDFc^j4ttIwch50n#t|Pjl@Z;5h{{%m9Hn zNRWg%h%mb^f+SF&DotmwhftZyhNh&Zc}UR;)ehtIz*79ySESPK{RMg zuYgQ+EqUVihQ}lSIB@&^*rf@Ts86DLFIs`6{Dn);;B`PnKtWIeAV;PGnmp{Cy8RB% z#1_S$grE}L(2f7viQh&n1@Bt`&?UFNY zm$ojtt(D>tZiIZui}NNCUsxY9MV+9TgHCXD?T)g&!64fS0)JwT-C%g;Uz&yGTp_ol O8wNhkqX5vrFZl)Zqj*Ab4(K7jbd%8_svn^E-#6&YV1r(OZGmYLrk|07$*$KF#%afM*G2K!!NX zK@teeLxkA_5hQ^ERcR`XJ%q|sHZ*0L*A(fyQiaqnQA~ys@go2c9U@55-P?!tAR08L zmq8}FmOSx$!}iE;2X5aVyELH^^+{CkMJuqBzi8=cybh=cXb2JjYNX4c$-~~M+wbs9 zY)Skn2rAJH-T1Ga_-({e@V-R=-Ov)uarT>Q(TduukmD$M4AI~QvBTJe*&f?q%wv@> z#cZ3gU7oBkw#6C!DWg~Ap+j!)h3xVEO+E+F;KdQDm9R1z8a|_V3CTY17WQshMQ7Y9 zZeMhptA%6S2=!1G=S?ELus&3ZIzclBo#5))9c6lhL8cP~{=^)+!SKw#H1jLje0Etk N4E#8c0ze&q$uINqc(?!n literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/metadata.json new file mode 100644 index 000000000..a0b26f931 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + 
"sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 4.5, + "gpus": 0, + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7d3d807242ca605dd7ae80e4807895ad4d48c50 GIT binary patch literal 3984 zcmc&%XH-r&>{ns3qCWSl>YEfiY;u0EYF)g*Cg|rUM zp*e<5V8I)dNqK|X03aBJnN^4|Q9#Jsks(dW9D$=Flk%gZl4It?N2SNpSR#}(B1tQf z$IE$tK9>Ya!~>Ct=d8nMv7|e&v8EDQ z|F^8Ayg`YSNBpWIVJi9KEhE&j&{Tp6BJV7g_mJ;Ou@J?i%*#xQP0pAT6O|DqN?;BN zF>~gnBxWWhYl_=|B`=Zu8X-gszsT=vGQob%EB8+bGY?0g!fVI+c`f1aNk1{`(j&s<%y90=+$>yWbCau% zbfW6H-%*LS;lz1Jhe;*#;N!>M)X?RN!RoV<9Nn7;J^z^*wy6DJ`@s}Kf3XmP`_>|} zq851M*Mu^L@=?qgXX>_%E!rI61pGohpe!P>xl1)Q?_31sdr}Jd)gdU~ih|MC^r?dv z-lD_9E)Z6Sw-8HidBNvtGjN@s7Yf%a5%kelsDB>!r=qG7bt(=Y75gm=L!V@IVq^IT z?i|@{LEjY_p>SAA9!iPC*)P1%)z{s+73SW=g~{GDzqmFkdcsP2{phL~vrNvJhJDO9YeH4G$ZHD(qT&Yy8Px02~W}VP2 zN-9xW0cKWhOYy@6lb~@w`!8k5Ih8N^q&#i5s$Ig-(*zXGG0| zG3e##BvST8Cj?H(h3Surh>Ks>qPD0P5El=q-cp1b+QabO^>c@~4c&yxAJ}8TPnRL~ z)fM#e-a-=ZUn1*vxlC>e97$;v84}TbF&M25;>Gn?!jnf00_#o;X99O|^P-}SD<_+S z!~Jh5J%_ERoNJE_3^o(o)lb2D*K)ATJ$B+qPVXy73qAsjY2<<>^Kp9S3)w{j}ReYljUdow!7~`kIkPz95O$ z6Di$}@+thasjE?xlFPT8`W;&KZ=ZPw3_1Ba-LRA5LvK z*^By$Q^_PfVQL1-r*CGM0g?}+O-d&WqKP3 z*JB6^zL8S~=d1D5w!>lwZ4x;CDCW$oR-wAl{b+sTTHd=k>UxF9w2>rB@R z=~X-{x?b?x-$;^ey_94JiJ>x%sWGz=Z~leQddj#kvhX^wc!f$&7}rQ(t4C?fM<-_ zGRCJ!ZTph(cX1En5A^jab=5kaTNqarjLs^1HITv`lscdfRPFEqkIdkA{z^QuF3;;J1@hu?=_2{{i@x{Lf`9_AxCGO%Ax&kSVf#j>}o9Z zyE%6Dsipl`qPk@p-Lk>RZM&+&e*WcMQ&vj+Z;xAW{YYffcK1ryPddEfG-`7t_1Mj-#P$FaZ;=r$zt3^gGtEAKJ zxv!(GXEzs5``UAp(G)c)3%c)}Z{}a>UJ~@ccl#LY$kVH2!4FkC9Fw}6OM)K-lulXp zhSZd)w%erU$g*)2O}JPA1#c}h(ggnScL4R5mX57}7$ zvzZ}{2N03>`!Ir^l#tA#qu6vZF(EyJ8=Dx*LJS5^)C(l-BPC@f@c!%wHu^1=>w-|gBjo2{+=o#2peDofwlk#q91{FZc8`NR((I0Oa2frdctU-DQ zM*kH&(tZb)5@VC&G6v9JFOob&@&XHay*UGr{^t7T2KZ^y4P-ZAo{f~k0-vmbgFT*SUyCzApZnQ)w=Sj|w5(CH#r9wqe(A3aL1A`?k zBW+l6R45e+#Wd#2lAl76tx(1(JQa#Kg?G)5#Y`8ZrSYW!7hwzg+j$H%5av|$Q8CMzcAW9z%Dv8Bs0cy;==H-pm&qC%-nGqH0m=YcqkP;E5DV>QoZJ<8i>F?*mG>f6%<-@j8nHu92lEDfTJ}OpU zqm(@uf0~b`-%PU^8f?8n;m#ht64@T4MtDwK5N*GR>B%@&iD_~++kYm-fk$EPlp0GW zbEY~*9;9gUDc#>x(6Gu}gYtQ&+OxdRy&;+SzYE;^X unXwtx^HNi-qten+vQ4vM)6x@Cl1=TL#@dawm8lE?-hJRC2@uHqFaB>f=8;VR literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c355b299a2b6bafc20dea943a5a9833b966e68b4 GIT binary patch literal 4763 zcmc&&d036<`@f&(ec!XEsFQP&igZN#YK!TV_K<89(P>Z1NsDX^MRiKH2q6?j$Rssi 
[GIT binary patch payloads omitted. This part of the patch adds the following new
parquet fixtures under transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/,
each as "new file mode 100644" with a base85-encoded binary literal:
  band=0/segment=0/df1.parquet   (index ..c7d3d807242ca605dd7ae80e4807895ad4d48c50, literal 3984)
  band=0/segment=1/df1.parquet   (index ..c355b299a2b6bafc20dea943a5a9833b966e68b4, literal 4763)
  band=1/segment=0/df1.parquet   (index ..ad59ee31cdea4b0c0bd77befe4a698181186db33, literal 3695)
  band=1/segment=1/df1.parquet   (index ..fb2a0b13d44e39718519f4b481204d15e4dbf2ba, literal 3684)
  band=10/segment=1/df1.parquet  (index ..1a46cb40fef1eae7a5d0207565c796ee63a4a4fe, literal 4466)
  band=11/segment=0/df1.parquet  (index ..56934cab8de9fd4346c856340c7d0894b2552537, literal 4906)
  band=12/segment=0/df1.parquet  (index ..842ce2caacab4c58384e71071de8b40ef03e2e4b, literal 3138)
  band=13/segment=0/df1.parquet  (index ..84c399e67f3a1f307e88c5325c849ac627b14610, literal 3138)
  band=13/segment=1/df1.parquet  (index ..79a6f24b35cf797574a2db8d32609d38de32a0f9, literal 5244)
  band=2/segment=0/df1.parquet   (index ..e67164596565a48f5cd69702114b2db7228ee7eb, literal 4782)
  band=3/segment=1/df1.parquet   (index ..d0f1bd9b4393fd8255b1fde58a8b75d36fb67909, literal 4341)
  band=4/segment=1/df1.parquet   (index ..f892d384dcee4c386d866900d99cf672068e5bc4, literal 4860)
  band=5/segment=0/df1.parquet   (index ..1a786300b7ea789c918d25ecc282aa7737b43bf2, literal 3554)
  band=5/segment=1/df1.parquet   (index ..bc20a7699b64f53ab5cc24074d8f61c1e997b00d, literal 4872)
  band=6/segment=0/df1.parquet   (index ..151008dc482639c3df72873480ca796f9ef31d42, literal 3553)
  band=6/segment=1/df1.parquet   (index ..b485d3882928aa5ca25ac11f280ec415a3432930, literal 4311)
  band=7/segment=0/df1.parquet   (index ..0da33db3c308003647b082db9d88177f6ae33a22, literal 3765)
  band=7/segment=1/df1.parquet   (index ..1e1b4765c9bc16d822b1bac7fb9e6d103c2763c9, literal 4158)
  band=8/segment=0/df1.parquet   (index ..7e9af93b02ee386e9ac98f1c9aea4130450e7a99, literal 3781)
  band=8/segment=1/df1.parquet   (index ..d112e179eb9bbb8372b524bcc40eea2e5cae3f76, literal 3997)]
zhhHdk&vU8V(-_@T7kRC{GVe3xt(FZ&u2p-#NW6Ekbm7UWeMgfY_I2dBR_{N)>gl8V z*G^U+`07)@c~ZBU{N^;$z-G~@nu6B#d^3-|ZncGNo5T)L*H6_Jo!nyLv{C9_cd%o- z)!ed09d*TLa_mD+?{%*~)R{LivHyBUeaZO)@-*IXj|Np&k=u5g#itueFO@LpoW{Lh zsD_|hHV&dIZ7!G5=;)J&K9|#cHRw(ie6yvOk2IQI`4heTO7$D)sf*u6mmDtfstrlw zS)bl`YtUtxGN};LkR-RE6oVTZ8OIQ&utXvzGBJr078AzM51MPFa75RQMLou{ z{Nb_&H5+^LnZ1)x$c0m1Qz!DMHn0#8h*&gv`k8r`wA|B7?;uaKwOYjFpbbNw-ae&` zhLEQw>%SvUdatQeOjulal7RDkRr#U1zV&b7K7qT_RL)jf;_t#|zF(KP?D&tLv)fdB%RE$O#ydVoh zI=%@LUk&`9#e#^GNF|>CV|M%xQPuy;^OgNmFG!Xo&b`$A^^K3U}@084p&dflErYKKlQ1~-jwVtwP_R+y(XO9W?U9ddVWm2@ipHBp% zkM*YW`&Se_sN*@QgV0IPqKT(oGP)uDi2=TjaiOl^R1o8@hNsJ+#eZ<`Gk-p__h1H1 z>FU%q`9#xailiW4N2R~tY-NC-x^x zHcB4EpX09XH{EPn23t>2Gnl1o4AX=70GCNo!DCm_JsHOs(M?Wf`cJ2*xfJ?Lq4uOR zr>mp&{*)qFb5H7I886y~Qg>40q%Kl(t}fj_)@3=3qpp+jr{mME%++WtL2MEql(1}h tSkkDF`1nyl2?@$%lNDhJiIK`UlQH8*j~Q(*@st9*{)1x#fH(ca{0|mk!UF&R literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f3f7d2a7d0cc4ecb1608dee4e76b94cde19d2b83 GIT binary patch literal 4018 zcmc&%d011&7N41WZ*r4>M9obTKtL%0abXkOP)XPk6~PLq6vHAQTS{28?obpJ72E(p zR*_9m5LyM5T?My-&jq#0vseLN>-wI1efK8C>a+dhec$_D&zH>1nK|eD&N(yRmtQ=j zo)#2N*-;Me6iI;*02r)Z78|!}x1Rp2qp7})1!Ygd(-hlXZ{B>jHe|0&-nH;)HWZ)) zlxRG~rT7$!V(Hod6LQE#L=IU20JxtcW`qdq1Cnm2OW|T`z~YNVoB(-LkbkIL8A>4` zSJ(oDolp`gA^$9tcs!jWpb(%3K0N_I>mHPyJDmfQvk8s?4g{JUHV(8jNn1W`!N_p+ z#JCA5?cXyNkwYgU68p_YNLTUcE`7)|QB{F3EX4^(dhz#Vm}mw?2gXGPN5%RF$z$c3 zVe}#_$UiVTA}%sYRoELSd;}%uG07vO`(I%GVYy90s#EAreNU(=&gVVLBb;{31u<0% zP;F>6o?7-RcwRLLo|=%!Bgf1J2S*>omulyM&;5*WkD_O=-t`=~J|!J5_wVMJnq9=U zxsC-ESMPyH{2V&D=NNoxd>&aw53*NuT?FkLZo$ri2hj2}1uW~M0r*h zsA_rz#{hRUt<4;kxoX1sEhoT9NepaV-;FlZV_^G=#i-lM5&UfP6p0QwV|mRQz~-no ztUb^U?xgGi?^0v1(PrzA?UboR*iIeR;OQD@5!Q(3CbL*xYiDsY6*}Aqz1u`i%R?ZT z0ExU;Zb)Mi!s4g~;B5E;bXfLr`v)cy0-y<|tZoD^%3Xn`uOo=in}8mESr0aMHey%2 zW`dOFW$0-G0WZ%v18i-L@H+jcpjHr%Dw7?6p2KW#;?gKgu>CbON;(R(9AAUK`&R+X z)*Oqb>;M^;f*)VHoAoMP34Ur%1p7``6WK+JQEsgTY%s3JtV-TuOU~cNW;Tvy8z#r2 zYO`zXwSG246Z;n;!rT`-3zT45Rv=h6;6jX8zY>f;c7iqh8iY^(ML#xqF)*Js1Jhoq z0enYhao`E&{KAS#!R&A!SQ38==}E@0{l#B$-&_=9QmfT?d9)u&c$J1o)@o_&d)Etxn`A)?K{|`;{eX$OWZ6M3NM#Fo} zd}479j}tR-EUQ+l65YIQgz|)Tgo1YjrMI>5z0+hwgs2c0jbFizC*nc%yS-@fs%G{^ zErO+EodOf8SHg=alQjAFO+ms`Pu5p=ouE(d9_ar~HrU>y50pwFSKnhaX_mf&%WG{w z%NIR{y>64q&wscNL%HL*=WA@(3)436BYDTL+I!>Ri&K%f_|h%lF(Vl)7|6iR{a6pX z}cc7E$ zU6|>q7Czs~fwY`|1MBQDV=1xogq@oae&`a8nLpjF-BU1wvstkbwqx~d(|NzZH4kop z{kxs8QTo3ER@6iGhB2o>Wq1ravST`|-^JyI3oRK_Uj#qs{XYPFGDZ z3aoL54P)QJtu33$EelFfd}}!C4xx<#xQ5t;v<53jo%a-0>SfnW8&3ZSb-< z5p^btd0Vt?32uH0+hC{Pu$I1MkSC~ziJD>Ha=~3}FH2uDS)dK~#O}gYANT=wDl=H) z>O6tn(jQp`8nwIw_p-qFvdO$ATU}tqT?hvrKS7bNcpzb>rf~Ic3idpIfZlz%ku&GE z2KM-j0*>mD>X*R+Dh`_3>oJ5#Ho&JUZ{RUrW0tz7G80#@-`4~yv?WDse=s%6Dzv-QxAow*?8%C|XS3`}w(p<5;z@r`YsrqhS?du; z+o3eAz&>Tf*hOun=|#?IV_f$+lH;)*R^AW#U!7q@$+zT!wjIXO8R9 zgBP-=C~aZ%&aTTjUiuNbaorGlwCR;{8Xc~E zLrwXE77HpnlZV2Of@)>VNncw3RdC~Vaix`a6~XeLuk&?OA7l&3;XpY^`j&gRn+Xm z=k|xh!j7_i&;7_?1PF)+K!R-t9y0HfmVWibJm-&ipNO$miHIDsVOZ3syR^}7To9?a z;NaLlBT-uKDOE&pR7k9v9}Ys{Ae3A;k*qhNY1F6T{{-}psuibN0})FZ7#bWYr*N7^ z8lex^sFAMwp^?EXKd%6}1M{l7e4$zS!7gsH|tu7nh(8VLXp>xwj_o}Tl(r>P&7 zC?7Xt#9AtoN~QDZFC(+1(gdk2MCv4!hDhy0RC_YoU=Fjgmr7lknbwD9T2n2ROavy7 zbl%tF%*>|H3&*)GjBcO6jE>Aehn7m+nL*~usH$_RIirUMSkAQw@Lm`f*gEt#{A{FRHmt)qgRL!@%XUjLyPZ_?)k+RjPAw^ z>e5xItMdt`_hhkhZ|i7ZpSjVCd{m{=c=Ofuxh?oOAG%qze2@>*N|_?ad0{MbAa!+T 
z>{UhyL-_MuRsE)$P0LX0rBVlG=^VlIK(WYa>hb`~6?9J~F-CNg6PW(fDXK4p-jk_3 z>CEZs=yPAGEJ1xws$>~2+D5ACq}oYUqFEz=f88vc-tXauN0lK?s72p#_5j#fpN80*V`F zuvS0?>#B+y(dVdEtJYezxZ&x$H;C3}d(M03yz}~<+2Q3U1ysi7;p<8Z=bKfX zyK(Fd4YYvnZbR$ShBSxfn7e|6>e30Rt|$PYE5VpiAK|!A<1?15=yy)6`)9QasVLLY1d44yAf8RZ8&7>=utbr&xuKjM+-MJuB+ z6fr5PsnK$kTrYuHBt%b*QYL38QnU!yV1issoJ)w8$ z1fJ<*u;_6Dgzx<)TzaX~4o*KfeRZ^6b#>WyxL!oh(1Dsgn4Sf{0@NCRG zbasawEHf9Pe)U#BJhBH$=@0I?1YFo*1-!l%U}EYH=MxQKNw_N{E%1cVt%)!xs~7M} zOyEjj4XoGCgr4?$(3u)@WZAG4Oc(foWy2Xrt#yW!-vm&vKM9t{n#015eK4vt1|qii zf#%5^Sl7B6-O)jyIjsxVj7*@Zu{RLqBf&w_2Km005V)ffL|Yx90+T?wnFi*(-vRev zIJ9`v*qyJ3LD3jf5bgAa{9k@Ti5W()i?9MkJ`a}JkZ^SGUa*wx^WRDxCdDQKVN1(px&;Zo!>xb-dtvQ!~Zn!g_2Pd^4Ly*I-; zzY%a}6^7b#?!y3e59m|l1W%vMg^gGGf>nw!dS3Aob&tM>R1yTKDVE^1BoM}R6e3Oh zZer1vckpV5GdgHns!R3phmE~A5)s|kfz@PdLbm=nY8_lm-1k3%`r9|-i+%fnW6M2I z5T~$)lKn_!%tO(K+{mo+&2TjDI{KmV5pwF5img>f!^S@u^??9v z==Tn2i?1FU+f)w?;xu$5_X3u80EcV#4(PFK8z$WDjjwCugJ#1~xTP+E_lwf--u5|| z+u$K&!ZI_?FUR&E$Am+~+I$X&luX9@j@Uy+9SodLgWP>xc23F+78=18>XM zaMJ0n&Vz@8NK?>*!Ltv+>#e?^7d8r#d)i`8rW}C69}eM{$Bh7WV=UHIN1}_PkHfM( z!Kk)Z8)!_YV>S6Bq31|{s6X2Sx1P5b3`VuX?WsEecXPm%v^!84YT~G1Q> zxzO)YcXIXmiP&0=BdY6r0N--*J(_g-K0e}5Z?0wjbZlFJ1KG4ad9B;6qW?MC`k?37DxZBJ2FU(Ta36 z-fi+#Jj>t~QY75Kru|lcJ&MgCYDdcub>9t3HTVv7Qz$WzS3~8uu|x;mOn$p}G%4Sm zY*@9e#?(J46m6P*9qTFX!(E#oK^@H^Tsm+zu~j)4%Y5aHF1Ft^dNTJoui$(HZzp=J zx7_R~*Du?Ezv}&u$VgI#98HTkIw6lJ#Z8JgyQPlWI<7aWKjnZ;T%*fNv+Kvv7}Q|@ zxM_{e8EI;is=EtYG36l@vebi27FL0^O%``LIURPtUx7`W)yOR{AUT>BZ-~s@Gf=bI zSa`uMa8+(b?-D8=enA`!12Eema28%U?n0!R7ej{(7kK zLMV2@Aj-smSA|vHbHa39oPn4(&FJ-=Bm%3RCAvA{46#&ZN$RdN$0IwUF;oyl#XaZ` zkM1|>95Fr)hh%HGYI$VO&BOY_z`Ku0lYzyklIw(-nJvM&1y8_dO)m7$-^K}0O`=lU zvU##!hUk9ZCr6*0#|M3!z)x;#;l2m~GQwjh{HjrM&o8|Ub%Rb~4aMj2T_Y?EtQW5& zoW@?mtNQxj%^mv;Tp^(cX`K05JEl*s(zT&0 z_@BO&3APBFIa3gIPF1e?Q|#g;zHBwMZU)UUW zWaHx1gHrFDDD&J@R6Z>0_XlCS=8x(1RSuBG&T3h2&-sSdK^pZ~OYbEXqn0@rE-Um~ zVYlV*a^H*Mz;d@a80~jS5>y>f?or}*xqooYq=vb)e~V3cZSsw#68|f9Q)DyWyd`Z) zcgeqBAR1O+Q?@7O^r|7VIJT=a3758p)fL#T-lzD_v#;*Vu9t0-fon70WHUEq19fX_ zm{B%AceUpx)FEqn$beWfxD!N?7DVy59v!fn?w~fNl^b;sKQ{+v13y3Tik)2tNUA?o zChB-vFuHu7;!$gD`YZh@ZZp~*T;7{$Fm=S7_SY}kUKvEV=ixkobC#jpBVSkKs(Ni0 zIkM2CU$7?2D9UTG*^ueaUmHdH6nk6xu5g}V9OG9qAZ(xNjd5&1*}#Mr%?y*c!17@; zUOs8#q~IDheUBXBtzahYv=>G>!e3`H z8F%WU9P_71Ov1*3TB97ov>oM$vL(m&seA>or}xC3ZL!M+P>|V@(X+-Yv~ef54`sTu zFR5MhPn2zSJ?+ISL4^DMX*|7bt2Czk!LJ5PF3jkW4H{9l`R^%AxySEhFhPjsdR~W0 zA!}f296d{=462@8=K8q(V~v-UEr-NQ2bj}XC!6&7oPF^W2`MbIj7cYw6Vg@OnB*9i znK3l1f-l(BS5Ran-qA}um&;PkKYl{~lxxXouK-#{OSB|Z9Un+Uv|=Qe0y*MQZc!g7 zp(Tb4jvSHcfm#2);CwA#mWh6dl_gC!F;3L;ej!B7Abu#DMJ^!Zq({ZaDC9K3@N8@B4}9I)JYc(tJHxoa1rvVWc$(8`{5kz) z?LJ)zjnSL{H`bdFYOjZ-^ca(&>zb^)Gx06h_}aw(4i*IZB6Cr||N5VazlqHMU!LN> z?NJhJw-xA%DZ$fzrT`*yAzvC4G$v$d=aVI@AnaLjk$Om_(y`2&6@RHTQ|b{X^^!{C zq@HowGY`gKG`sSYN`2Xx*>}y%PP5QjcvbxXw z)5AktQlh=%q;l3@o1THgNLbfCXuefx)Wf#YBQ@GPSj8%&z5%Sg)+n4`II#56gR?LSju=cO=b9$HVPat0lv50iRicHWb=Sk{ZNk!m~H>7)hO zd9H;Xrtr#O@@R3g{!DzvmA#ryOAuP5%G0K0#HeheQd4c^X=%z#i|H|G=?Tgd3rANw VM>_{mfH}a&7c>w6GUgYKe*tj26RQ9K literal 0 HcmV?d00001 diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", 
+ "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py new file mode 100644 index 000000000..294c86f25 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = SparkTransformLauncher(ClusterAnalysisSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py new file mode 100644 index 000000000..919857e23 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py @@ -0,0 +1,58 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = SparkTransformLauncher(DataCleaningSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py new file mode 100644 index 000000000..6d93dc7a9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +class TestSparkSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = SparkTransformLauncher(SignatureCalculationSparkTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures From 77d85fde33e2905a19a5195adf64fecc5d88be9b Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 16:13:42 -0400 Subject: [PATCH 044/105] Adjust to file naming changes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 27d101bb8..1265e8ee3 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -22,10 +22,7 @@ COPY --chown=ray:users images/ images/ RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/fdedup_transform_ray.py . - -# copy some of the samples in -COPY src/fdedup_local_ray.py local/ +COPY ./src/fuzzy_dedup_ray.py . # copy test COPY test/ test/ From 310d8139ca2bd52afd2e987fc52c6b530d4c2888 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 18 Oct 2024 18:03:34 -0400 Subject: [PATCH 045/105] Create python Dockerfile Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 43 +++++++++++++++++++ .../universal/fdedup/python/requirements.txt | 10 +++++ 2 files changed, 53 insertions(+) create mode 100644 transforms/universal/fdedup/python/Dockerfile create mode 100644 transforms/universal/fdedup/python/requirements.txt diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile new file mode 100644 index 000000000..a0a557060 --- /dev/null +++ b/transforms/universal/fdedup/python/Dockerfile @@ -0,0 +1,43 @@ +FROM docker.io/python:3.10.14-slim-bullseye + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest + +# Create a user and use it to run the transform +RUN useradd -ms /bin/bash dpk +USER dpk +WORKDIR /home/dpk + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/ +RUN cd data-processing-lib-python && pip install --no-cache-dir -e . + +COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt + +RUN pip install --no-cache-dir -e . + +# copy source data +COPY src/ src/ + +# copy source data +COPY ./src/signature_calc_transform_python.py fdedup_transform_python.py +COPY ./src/signature_calc_local_python.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/dpk + +# Put these at the end since they seem to upset the docker cache. 
+ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..85806f809 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +pyarrow==16.1.0 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars>=1.6.0 +disjoint-set>=0.8.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 +scipy>=1.12.0, <2.0.0 From 7d97cef7c741a703b27b2d5f17467b998bc2794b Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sat, 19 Oct 2024 14:51:24 -0400 Subject: [PATCH 046/105] Ray bug fixes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 10 +++++++--- .../fdedup/ray/src/cluster_analysis_local_ray.py | 2 +- .../fdedup/ray/src/signature_calc_transform_ray.py | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 1265e8ee3..ec2c56f28 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -2,6 +2,8 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 FROM ${BASE_IMAGE} +USER ray + RUN pip install --upgrade --no-cache-dir pip # install pytest @@ -13,16 +15,18 @@ COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . +COPY --chown=ray:users python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . # Install ray project source COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md -COPY --chown=ray:users images/ images/ RUN pip install --no-cache-dir -e . -# copy the main() entry point to the image -COPY ./src/fuzzy_dedup_ray.py . 
+# copy source files needed by test-image +COPY ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py index 25b96788d..c078746ce 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -19,7 +19,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands_consolidated")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py index bc3c0d991..678d953f2 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py @@ -14,6 +14,7 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) +from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher from signature_calc_transform import SignatureCalculationTransformConfiguration From 87902ac1f8aa4eff51e127df76ea44fd86a632e0 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sat, 19 Oct 2024 17:03:09 -0400 Subject: [PATCH 047/105] Fix spark image to support testing Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile index 523b94c06..a36a7cef7 100644 --- a/transforms/universal/fdedup/spark/Dockerfile +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -36,7 +36,8 @@ RUN pip install --no-cache-dir -e . COPY ./src/signature_calc_spark.py . # copy some of the samples in -# COPY src/filter_local_spark.py local/ +COPY src/signature_calc_transform_spark.py fdedup_transform_spark.py +COPY src/signature_calc_spark.py local/fdedup_local_spark.py # copy test COPY test/ test/ @@ -46,6 +47,7 @@ USER spark # Set environment ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} +ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH} # Put these at the end since they seem to upset the docker cache. 
ARG BUILD_DATE From c84792452619fc57ce2ebeee8f872ef2b67deb82 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 07:57:22 -0400 Subject: [PATCH 048/105] Removed file copy utils Signed-off-by: Constantin M Adam --- .../fdedup/python/src/file_copy_util.py | 158 ----------- .../fdedup/spark/src/file_copy_util_spark.py | 261 ------------------ 2 files changed, 419 deletions(-) delete mode 100644 transforms/universal/fdedup/python/src/file_copy_util.py delete mode 100644 transforms/universal/fdedup/spark/src/file_copy_util_spark.py diff --git a/transforms/universal/fdedup/python/src/file_copy_util.py b/transforms/universal/fdedup/python/src/file_copy_util.py deleted file mode 100644 index 87867e532..000000000 --- a/transforms/universal/fdedup/python/src/file_copy_util.py +++ /dev/null @@ -1,158 +0,0 @@ -import argparse -import io -import os -import re - -import polars as pl -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.utils import ParamsUtils, get_logger - - -""" -This class reads all the parquet files inside an `input_folder` of the type -`.../bands/band=b/segment=s`, concatenates those files, and writes them into a -file called `.../consolidated_bands/band_b_segment_s.parquet` -""" - - -class FileCopyUtil: - def __init__( - self, - data_access_factory: DataAccessFactoryBase, - config: dict, - stats: dict, - ): - self.data_access_factory = data_access_factory - self.root_folder = config.get("root_folder") - self.logger = get_logger(__name__, level="INFO") - - def copy_data(self, subfolder_name: str, data_type: str): - self.logger.info(f"copy_data(): subfolder_name = {subfolder_name}, data_type = {data_type}") - if self.data_access_factory.s3_config is not None: - _, root_folder = self.root_folder.split("://") - else: - root_folder = self.root_folder - self.logger.debug(f"copy_data(): root_folder = {root_folder}") - if data_type == "bands": - match = re.match(r"^band=(\d+)/segment=(\d+)$", subfolder_name) - if match: - band = int(match.group(1)) - segment = int(match.group(2)) - else: - raise ValueError(f"Wrong subfolder_name {subfolder_name}, should be band=b/segment=s") - input_folder = os.path.join( - root_folder, - "bands", - f"band={band}", - f"segment={segment}/", - ) - output_path = os.path.join( - root_folder, - "bands_consolidated", - f"band_{band}_segment_{segment}.parquet", - ) - elif data_type == "docs_to_remove": - input_folder = os.path.join( - root_folder, - f"{subfolder_name}/", - ) - output_path = os.path.join( - root_folder, - "docs_to_remove_consolidated", - f"docs_to_remove_consolidated.parquet", - ) - self.logger.debug(f"copy_data(): input_folder = {input_folder}, output_path = {output_path}") - - data_access = self.data_access_factory.create_data_access() - self.logger.debug(f"copy_data(): getting the data from the input_folder {input_folder}") - file_dict, status = data_access.get_folder_files( - input_folder, - extensions=[".parquet"], - return_data=True, - ) - self.logger.info(f"Found {len(file_dict)} files in input folder {input_folder}") - consolidated_df = pl.DataFrame() - for fname, contents in file_dict.items(): - df = pl.read_parquet(io.BytesIO(contents)) - # self.logger.info(f"{fname} has {len(df)} rows") - consolidated_df = consolidated_df.vstack(df) - if "docs_to_remove" in consolidated_df.columns: - consolidated_df = consolidated_df.select("docs_to_remove").unique() - output_table = consolidated_df.to_arrow() - self.logger.info( - f"Writing to {output_path} table with 
{output_table.num_rows} rows and {output_table.nbytes:,d} bytes" - ) - stats = { - "input_files": len(file_dict), - "input_bytes": sum(len(v) for v in file_dict.values()), - "input_rows": output_table.num_rows, - "output_files": 1, - "output_bytes": output_table.nbytes, - "output_rows": output_table.num_rows, - } - data_access.save_table(output_path, output_table) - return stats - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--root_folder", - type=str, - default=os.getenv("HOME", os.path.join(os.sep)), - help="root folder", - ) - parser.add_argument( - "--subfolder_name", - type=str, - default=os.path.join("band=0", "segment=0"), - help="subfolder name", - ) - parser.add_argument( - "--data_type", - type=str, - default="docs_to_remove", - help="Processing either bands or docs_to_remove", - ) - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - args = parser.parse_args() - root_folder = args.root_folder - config = {"root_folder": args.root_folder} - input_folder = args.root_folder - output_folder = args.root_folder - data_type = args.data_type - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - if args.use_s3: - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - local_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - stats = {} - fcu = FileCopyUtil(data_access_factory=data_access_factory, config=config, stats=stats) - fcu.copy_data(args.subfolder_name, args.data_type) diff --git a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py b/transforms/universal/fdedup/spark/src/file_copy_util_spark.py deleted file mode 100644 index 58a43a736..000000000 --- a/transforms/universal/fdedup/spark/src/file_copy_util_spark.py +++ /dev/null @@ -1,261 +0,0 @@ -import argparse -import os -import socket -import time -import traceback -from datetime import datetime - -import polars as pl -import yaml -from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.utils import ParamsUtils, get_logger -from file_copy_util import FileCopyUtil -from pyspark.sql import SparkSession - - -logger = get_logger(__name__) - - -class FileCopySpark: - def __init__(self, root_folder: str, num_bands: int, num_segments: int, use_s3: bool): - self.root_folder = root_folder - self.num_bands = num_bands - self.num_segments = num_segments - self.use_s3 = use_s3 - self.subdirs = [f"band={b}/segment={s}" for b in range(num_bands) for s in range(num_segments)] - - def _init_spark(self, app_name: str = "copy-app") -> SparkSession: - server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1")) - if server_port_https == -1: - # we are running locally - spark_config = {"spark.driver.host": "127.0.0.1"} - return 
SparkSession.builder.appName(app_name).config(map=spark_config).getOrCreate() - else: - # we are running in Kubernetes, use spark_profile.yml and - # environment variables for configuration - - server_port = os.environ["KUBERNETES_SERVICE_PORT"] - master_url = f"k8s://https://kubernetes.default:{server_port}" - - # Read Spark configuration profile - config_filepath = os.path.abspath( - os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml") - ) - with open(config_filepath, "r") as config_fp: - spark_config = yaml.safe_load(os.path.expandvars(config_fp.read())) - spark_config["spark.submit.deployMode"] = "client" - - # configure the executor pods from template - executor_pod_template_file = os.path.join( - os.getenv("SPARK_HOME"), - "work-dir", - "src", - "templates", - "spark-executor-pod-template.yml", - ) - spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file - spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always" - - # Pass the driver IP address to the workers for callback - myservice_url = socket.gethostbyname(socket.gethostname()) - spark_config["spark.driver.host"] = myservice_url - spark_config["spark.driver.bindAddress"] = "0.0.0.0" - - spark_config["spark.decommission.enabled"] = True - logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}") - app_name = spark_config.get("spark.app.name", "my-spark-app") - return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate() - - def create_data_access_factory(self, root_folder: str, use_s3: bool) -> DataAccessFactoryBase: - input_folder = root_folder - output_folder = root_folder - data_access_factory: DataAccessFactoryBase = DataAccessFactory() - daf_args = [] - if use_s3: - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - s3_config = { - "input_folder": root_folder, - "output_folder": root_folder, - } - daf_args.append("--data_s3_cred") - daf_args.append(ParamsUtils.convert_to_ast(s3_creds)) - daf_args.append("--data_s3_config") - daf_args.append(ParamsUtils.convert_to_ast(s3_config)), - else: - local_config = { - "input_folder": root_folder, - "output_folder": os.path.join(root_folder, "bands_consolidated"), - } - daf_args.append("--data_local_config") - daf_args.append(ParamsUtils.convert_to_ast(local_config)) - daf_parser = argparse.ArgumentParser() - data_access_factory.add_input_params(parser=daf_parser) - data_access_factory_args = daf_parser.parse_args(args=daf_args) - data_access_factory.apply_input_params(args=data_access_factory_args) - - return data_access_factory - - def orchestrate( - self, runtime_config: dict, execution_config: dict, data_access_factory: DataAccessFactoryBase, data_type: str - ) -> int: - """ - orchestrator for transformer execution - :param execution_config: orchestrator configuration - :param data_access_factory: data access factory - :param runtime_config: transformer runtime configuration - :return: 0 - success or 1 - failure - """ - start_time = time.time() - start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - logger.info(f"orchestrator started at {start_ts}") - data_access = data_access_factory.create_data_access() - # initialize Spark - spark_session = self._init_spark() - sc = spark_session.sparkContext - transform_config = sc.broadcast(runtime_config) - daf = sc.broadcast(data_access_factory) - data_type = data_type - 
print("data_type") - print(data_type) - - def process_partition(iterator): - """ - process partitions - :param iterator: iterator of records - :return: - """ - # local statistics dictionary - stats = {} - # create file processor - file_processor = FileCopyUtil( - data_access_factory=daf.value, - config=transform_config.value, - stats=stats, - ) - for f in iterator: - stats = file_processor.copy_data(subfolder_name=f[0], data_type=data_type) - # return partition's statistics - return list(stats.items()) - - num_partitions = 0 - try: - if data_type == "bands": - # Get files to process - files = [ - f"band={band}/segment={segment}" - for band in range(self.num_bands) - for segment in range(self.num_segments) - ] - elif data_type == "docs_to_remove": - files = ["docs_to_remove"] - print(data_type) - - if len(files) == 0: - logger.error("No input files to process - exiting") - return 0 - logger.info(f"Number of files is {len(files)}") - # process data - logger.debug("Begin processing files") - source_rdd = sc.parallelize(files, execution_config.get("parallelization")) - num_partitions = source_rdd.getNumPartitions() - logger.info(f"Parallelizing execution. Using {num_partitions} partitions") - stats_rdd = source_rdd.zipWithIndex().mapPartitions(process_partition) - # build overall statistics - stats = dict(stats_rdd.reduceByKey(lambda a, b: a + b).collect()) - return_code = 0 - status = "success" - except Exception as e: - # process execution exception - logger.error(f"Exception during execution {e}: {traceback.print_exc()}") - return_code = 1 - status = "failure" - stats = {} - try: - # build and save metadata - logger.debug("Building job metadata") - input_params = runtime_config - # input_params = runtime_config.get_transform_metadata() | execution_config.get_input_params() - metadata = { - "job details": { - "start_time": start_ts, - "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "status": status, - }, - "job_input_params": input_params | data_access_factory.get_input_params(), - "execution_stats": { - "num partitions": num_partitions, - "execution time, min": (time.time() - start_time) / 60, - }, - "job_output_stats": stats, - } - logger.debug(f"Saving job metadata: {metadata}.") - - if data_access_factory.s3_config is not None: - _, root_folder = self.root_folder.split("://") - in_path = os.path.join(root_folder, "bands") - out_path = os.path.join(root_folder, "bands_consolidated") - data_access.input_folder = f"{in_path}{os.sep}" - data_access.output_folder = f"{out_path}{os.sep}" - else: - data_access.input_folder = os.path.join(self.root_folder, "bands") - data_access.output_folder = os.path.join(self.root_folder, "bands_consolidated") - data_access.save_job_metadata(metadata) - logger.debug("Saved job metadata.") - return return_code - except Exception as e: - logger.error(f"Exception during execution {e}: {traceback.print_exc()}") - return 1 - finally: - # stop spark context at the end. 
Required for running multiple tests - spark_session.stop() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--root_folder", - type=str, - default="/Users/nelson/workspace/Research/DataPreprocessing/ibm/active/data-prep-kit/transforms/universal/fdedup/python/output_second/", - help="root folder", - ) - parser.add_argument( - "--num_bands", - type=int, - default=14, - help="number of bands", - ) - parser.add_argument( - "--num_segments", - type=int, - default=2, - help="number of segments", - ) - parser.add_argument( - "--data_type", - type=str, - default="docs_to_remove", - help="bands or doc2remove", - ) - parser.add_argument( - "--parallelization", - type=int, - default=-1, - help="spark parallelization", - ) - parser.add_argument( - "--use_s3", - type=bool, - default=False, - help="use s3", - ) - args = parser.parse_args() - fcs = FileCopySpark(args.root_folder, args.num_bands, args.num_segments, args.use_s3) - data_access_factory = fcs.create_data_access_factory(args.root_folder, args.use_s3) - app_config = {"root_folder": args.root_folder} - execution_config = {"parallelization": args.parallelization} if args.parallelization > 0 else {} - status = fcs.orchestrate(app_config, execution_config, data_access_factory, args.data_type) - print(f"Orchestrate concluded with status {status}") From ba9b07ca0a9a4821df0f38cf488db4fc8db7408e Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:00:13 -0400 Subject: [PATCH 049/105] Add fdedup to kfp black list until we get kfp integration Signed-off-by: Constantin M Adam --- scripts/check-workflows.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh index d1f934368..d431f6fbd 100755 --- a/scripts/check-workflows.sh +++ b/scripts/check-workflows.sh @@ -17,7 +17,7 @@ if [ ! 
-d transforms ]; then echo Please run this script from the top of the repository exit 1 fi -KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering" +KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering fdedup" while [ $# -ne 0 ]; do case $1 in -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0; From f1879487bc4106f1b776ed6529e8e706096c4bc9 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:05:15 -0400 Subject: [PATCH 050/105] Freeze polars version to 1.9.0 for now Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/pyproject.toml | 4 ++-- .../universal/fdedup/python/requirements.txt | 2 +- .../universal/fdedup/spark/requirements.txt | 20 +++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index fa815441c..f46c8e8c4 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -15,9 +15,9 @@ dependencies = [ "pyyaml>=6.0.2", "boto3>=1.34.69", "kubernetes>=30.1.0", - "polars>=1.6.0", + "polars==1.9.0", "disjoint-set>=0.8.0", - "scipy>=1.14.1", + "scipy>=1.14.1, <2.0.0", "numpy<1.29.0", "sentencepiece>=0.2.0", "mmh3>=4.1.0", diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt index 85806f809..576c028a8 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/python/requirements.txt @@ -2,7 +2,7 @@ pyarrow==16.1.0 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 -polars>=1.6.0 +polars==1.9.0 disjoint-set>=0.8.0 numpy<1.29.0 sentencepiece>=0.2.0 diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index 10f3e129b..576c028a8 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,10 +1,10 @@ -pyarrow -pyyaml -boto3 -kubernetes -polars -disjoint-set -scipy -numpy -sentencepiece -mmh3 +pyarrow==16.1.0 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 +scipy>=1.12.0, <2.0.0 From 84b9104a7791661d368345d3c5b8e8cd02a67a19 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:08:47 -0400 Subject: [PATCH 051/105] Fixed duplicate_list_location bug Signed-off-by: Constantin M Adam --- .../python/src/data_cleaning_transform_python.py | 15 +++++++++++---- .../fdedup/ray/src/data_cleaning_transform_ray.py | 8 +++++--- .../spark/src/data_cleaning_transform_spark.py | 15 +++++++++++---- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index e5c1e5025..9c60ecbba 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -13,7 +13,11 @@ import os from typing import Any -from data_cleaning_transform import DataCleaningTransformConfiguration +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + duplicate_list_location_default, + duplicate_list_location_key, +) from data_processing.data_access import DataAccessFactoryBase from data_processing.runtime.pure_python 
import PythonTransformLauncher from data_processing.runtime.pure_python.runtime_configuration import ( @@ -53,9 +57,12 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) - ) + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index e83960c24..5ed2cecbe 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -91,9 +91,11 @@ def get_transform_config( """ data_access = data_access_factory.create_data_access() duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", duplicate_list_location) - ) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") duplicate_list, retries = data_access.get_file(duplicate_list_location) diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 29890d05f..56c10d801 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -13,7 +13,11 @@ import os from typing import Any -from data_cleaning_transform import DataCleaningTransformConfiguration +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + duplicate_list_location_default, + duplicate_list_location_key, +) from data_processing.data_access import DataAccessFactoryBase from data_processing.transform import TransformStatistics from data_processing.utils import get_logger @@ -53,9 +57,12 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() - duplicate_list_location = os.path.abspath( - os.path.join(data_access.output_folder, "..", self.params["duplicate_list_location"]) - ) + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) if 
duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") self.duplicate_list, retries = data_access.get_file(duplicate_list_location) From 08ff0069f00d0a84c8ef6cd3e2f55eefc098b2fb Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:10:01 -0400 Subject: [PATCH 052/105] Allow input of s3 credentials on command line Signed-off-by: Constantin M Adam --- .../fdedup/python/src/fuzzy_dedup_python.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index acb1be3bb..054447e70 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -1,4 +1,5 @@ import argparse +import ast import os import sys @@ -119,8 +120,17 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: "output_folder": output_folder, } if in_args.use_s3: - sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + if in_args.s3_cred is not None: + s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) + sys_argv.append("--data_s3_cred") + sys_argv.append(s3_cred_ast) + elif ( + s3_creds.get("access_key") is not None + and s3_creds.get("secret_key") is not None + and s3_creds.get("url") is not None + ): + sys_argv.append("--data_s3_cred") + sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) sys_argv.append("--data_s3_config") else: sys_argv.append("--data_local_config") @@ -207,6 +217,13 @@ def parse_args() -> argparse.Namespace: help="use s3", ) + parser.add_argument( + "--s3_cred", + type=ast.literal_eval, + default=None, + help="ast string of options for s3 credentials", + ) + return parser.parse_args() From d0c6f8a72efe75ccbfce0d89fd56b9b06dac4cb1 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 10:53:10 -0400 Subject: [PATCH 053/105] Added license Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/src/Murmur_MH.py | 13 +++++++++++++ .../fdedup/python/src/fuzzy_dedup_python.py | 12 ++++++++++++ .../universal/fdedup/ray/src/fuzzy_dedup_ray.py | 12 ++++++++++++ .../universal/fdedup/spark/src/fuzzy_dedup_spark.py | 12 ++++++++++++ 4 files changed, 49 insertions(+) diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py index e3442ba02..03d5047ea 100644 --- a/transforms/universal/fdedup/python/src/Murmur_MH.py +++ b/transforms/universal/fdedup/python/src/Murmur_MH.py @@ -1,3 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + + import logging import os from typing import List, Set diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index 054447e70..bdd78c7da 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import argparse import ast import os diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py index 0b9be33ca..0d4c2954f 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import argparse import os import sys diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py index 5217f2f7b..58688de42 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import argparse import os import sys From 63e11eb729a85f3a1cf349b21e19a680f300ec10 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 25 Oct 2024 11:49:22 -0400 Subject: [PATCH 054/105] Use str2bool for use_s3 argument Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/fuzzy_dedup_python.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index bdd78c7da..7135054d2 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -24,7 +24,7 @@ ) from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher -from data_processing.utils import ParamsUtils, get_logger +from data_processing.utils import ParamsUtils, get_logger, str2bool from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) @@ -159,6 +159,10 @@ def execute_service(self, service_short_name: str, params: list) -> int: launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) elif service_short_name == "fdclean": launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + else: + err_msg = f"Unknown service {service_short_name} specified. Must be one of {SERVICE_DICT.values()}" + self.logger.error(err_msg) + raise ValueError(err_msg) status = launcher.launch() return status @@ -225,7 +229,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--use_s3", - action="store_true", + type=lambda x: bool(str2bool(x)), + default=False, help="use s3", ) From bf550fde9ad3d1d9e8f7bd0f7f75b25df12d24a2 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Tue, 29 Oct 2024 19:22:55 -0400 Subject: [PATCH 055/105] Add overwrite output path argument Signed-off-by: Constantin M Adam --- .../python/src/signature_calc_transform.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 7c4dd391c..03f9bc9b4 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -48,6 +48,8 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" +overwrite_output_path_key = "overwrite_output_path" +""" This key holds the overwrite output path""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -68,6 +70,8 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" +overwrite_output_path_cli_param = f"{cli_prefix}{overwrite_output_path_key}" +""" The overwrite output path""" captured_arg_keys = [ document_id_column_key, @@ -79,6 +83,7 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, + overwrite_output_path_key, ] # defaults @@ -100,6 +105,8 @@ """ Default 
Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)"""
 num_segments_default = 1
 """ Default number of segments across which we divide the hashing space for each band"""
+overwrite_output_path_default = None
+""" Default overwrite output path (no overwrite)"""


 NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?")
@@ -136,7 +143,8 @@ class SignatureCalculationTransform(AbstractTableTransform):
         num_minhashes_per_band: number of minhashes to use in each band
         jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates
         word_shingle_size: the size of the word shingles calculated for each document
-        num_segments the number of segments across which we divide the hashing space for each band
+        num_segments: the number of segments across which we divide the hashing space for each band
+        overwrite_output_path: specify an output path other than the one used by the data_access
     """

     def __init__(self, config: dict[str, Any]):
@@ -158,6 +166,7 @@ def __init__(self, config: dict[str, Any]):
         self.num_segments = config.get(num_segments_key, num_segments_default)
         self.num_bands = config.get(num_bands_key, num_bands_default)
         self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default)
+        self.overwrite_output_path = config.get(overwrite_output_path_key, overwrite_output_path_default)
         # use this dataframe to store the minhashes and size for each document
         self.all_minhashes: pl.DataFrame = None
         # use this dataframe to store the band hashes for each document
@@ -311,7 +320,7 @@ def write_band_signatures(self):
         last_file_name_path = Path(self.last_file_name)
         suffix_path = last_file_name_path.relative_to(self.data_access.input_folder)
         save_path = os.path.join(
-            self.data_access.output_folder,
+            self.overwrite_output_path if self.overwrite_output_path else self.data_access.output_folder,
             "bands",
             f"band={band_ix}",
             f"segment={segment_index}",
@@ -470,6 +479,12 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=num_segments_default,
             help="the number of segments across which we divide the hashing space for each band",
         )
+        parser.add_argument(
+            f"--{overwrite_output_path_cli_param}",
+            type=str,
+            default=overwrite_output_path_default,
+            help="specify an output path other than the one used by the data_access",
+        )

     def apply_input_params(self, args: Namespace) -> bool:
         """

From 272be3697239019ad604badcaf4ae2d8fd3c654b Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Wed, 30 Oct 2024 16:40:40 -0400
Subject: [PATCH 056/105] Add separate data access objects for reading and
 writing files

Signed-off-by: Constantin M Adam
---
 .../python/src/signature_calc_local_python.py | 21 +++++++---
 .../python/src/signature_calc_transform.py    | 41 ++++++++++---------
 2 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py
index 062580f22..2800c70cd 100644
--- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py
+++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py
@@ -12,6 +12,7 @@

 import os
 import sys
+from ast import Param

 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
@@ -22,12 +23,23 @@

 # create parameters
 input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
-output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output"))
+output_folder =
os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "test_scdata")) local_conf = {"input_folder": input_folder, "output_folder": output_folder} code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} +s3_config = { + "input_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_data/", + "output_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_output_data/s3_test_3/", +} + params = { # Data access. Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", @@ -35,6 +47,8 @@ "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, + # "scdata_s3_cred": ParamsUtils.convert_to_ast(s3_creds), + # "scdata_s3_config": ParamsUtils.convert_to_ast(s3_config), } @@ -44,11 +58,6 @@ print(sys.argv) sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) # create launcher diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 03f9bc9b4..159697d19 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -20,10 +20,10 @@ import numpy as np import polars as pl import pyarrow as pa +from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider from Murmur_MH import Murmur_MH -from scipy.integrate import quad as integrate short_name = "minhash" @@ -48,8 +48,6 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" -overwrite_output_path_key = "overwrite_output_path" -""" This key holds the overwrite output path""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -70,8 +68,6 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" -overwrite_output_path_cli_param = f"{cli_prefix}{overwrite_output_path_key}" -""" The overwrite output path""" captured_arg_keys = [ document_id_column_key, @@ -83,7 +79,6 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, - overwrite_output_path_key, ] # defaults @@ -105,8 +100,10 @@ """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" num_segments_default = 1 """ Default number of segments across which we divide the hashing space for each band""" -overwrite_output_path_default = None -""" Default overwrite output path (no overwrite)""" + + +sigcalc_data_factory_key = "sc_data_factory" +sigcalc_data_access_key = "sc_data_access" NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") @@ -144,7 +141,6 @@ class 
SignatureCalculationTransform(AbstractTableTransform): jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates word_shingle_size: the size of the word shingles calculated for each document num_segments: the number of segments across which we divide the hashing space for each band - overwrite_output_path: specify an output path other than the one used by the data_access """ def __init__(self, config: dict[str, Any]): @@ -166,7 +162,6 @@ def __init__(self, config: dict[str, Any]): self.num_segments = config.get(num_segments_key, num_segments_default) self.num_bands = config.get(num_bands_key, num_bands_default) self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) - self.overwrite_output_path = config.get(overwrite_output_path_key, overwrite_output_path_default) # use this dataframe to store the minhashes and size for each document self.all_minhashes: pl.DataFrame = None # use this dataframe to store the band hashes for each document @@ -177,6 +172,12 @@ def __init__(self, config: dict[str, Any]): self.bytes_processed = 0 self.data_access = config.get("data_access") self.last_file_name = None + self.sc_data_access = config.get(sigcalc_data_access_key, None) + if self.sc_data_access is None: + self.sc_daf = config.get(sigcalc_data_factory_key, None) + if self.sc_daf is None: + raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") + self.sc_data_access = self.sc_daf.create_data_access() def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -319,15 +320,17 @@ def write_band_signatures(self): common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) last_file_name_path = Path(self.last_file_name) suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) + if self.sc_data_access.output_folder is None: + self.sc_data_access.output_folder = self.data_access.output_folder save_path = os.path.join( - self.overwrite_output_path if self.overwrite_output_path else self.data_access.output_folder, + self.sc_data_access.output_folder, "bands", f"band={band_ix}", f"segment={segment_index}", suffix_path, ) segment_band_minhash_table = segment_band_minhash_df.to_arrow() - bytes_written, _, _ = self.data_access.save_table(save_path, segment_band_minhash_table) + bytes_written, _, _ = self.sc_data_access.save_table(save_path, segment_band_minhash_table) if bytes_written > 0: num_tables_written += 1 num_docs_written += segment_band_minhash_table.num_rows @@ -412,8 +415,10 @@ def __init__(self): super().__init__( name=short_name, transform_class=SignatureCalculationTransform, - remove_from_metadata=[], + remove_from_metadata=[sigcalc_data_factory_key], ) + self.daf = DataAccessFactory(cli_arg_prefix="scdata_") + from data_processing.utils import get_logger self.logger = get_logger(__name__, level="INFO") @@ -479,12 +484,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=num_segments_default, help="the number of segments across which we divide the hashing space for each band", ) - parser.add_argument( - f"--{overwrite_output_path_cli_param}", - type=str, - default=overwrite_output_path_default, - help="overwrite of the output path", - ) + self.daf.add_input_params(parser=parser) def apply_input_params(self, args: Namespace) -> bool: """ @@ -495,4 +495,5 @@ def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) 
self.params = self.params | captured self.logger.info(f"{short_name} parameters are : {self.params}") - return True + self.params[sigcalc_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) From ee411e1bd7957a802857b2b1ac6703f0d50c2968 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 31 Oct 2024 16:46:09 -0400 Subject: [PATCH 057/105] Define 2 data access objects for data and duplicate list Signed-off-by: Constantin M Adam --- .../fdedup/python/src/data_cleaning_transform.py | 10 +++++++++- .../python/src/data_cleaning_transform_python.py | 14 ++++++++++++-- .../fdedup/ray/src/data_cleaning_transform_ray.py | 14 ++++++++++++-- .../spark/src/data_cleaning_transform_spark.py | 14 ++++++++++++-- 4 files changed, 45 insertions(+), 7 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 8e17b757f..1a349ae85 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -17,6 +17,7 @@ import numpy as np import polars as pl import pyarrow as pa +from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger @@ -53,6 +54,9 @@ operation_mode_default = "filter_duplicates" """ Default value for operation mode, will filter out all the duplicate documents""" +dataclean_data_factory_key = "dc_data_factory" +dataclean_data_access_key = "dc_data_access" + class DataCleaningTransform(AbstractTableTransform): """ @@ -129,7 +133,9 @@ def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningT super().__init__( name=short_name, transform_class=transform_class, + remove_from_metadata=[dataclean_data_factory_key], ) + self.daf = DataAccessFactory(cli_arg_prefix="dcdata_") self.logger = get_logger(__name__, level="INFO") def add_input_params(self, parser: ArgumentParser) -> None: @@ -157,6 +163,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: default=operation_mode_default, help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", ) + self.daf.add_input_params(parser=parser) def apply_input_params(self, args: Namespace) -> bool: """ @@ -167,4 +174,5 @@ def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) self.params = self.params | captured self.logger.info(f"{short_name} parameters are : {self.params}") - return True + self.params[dataclean_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py index 9c60ecbba..edef8b9c5 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -15,6 +15,8 @@ from data_cleaning_transform import ( DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, duplicate_list_location_default, duplicate_list_location_key, ) @@ -57,15 +59,23 @@ def get_transform_config( :return: dictionary of transform init params """ data_access = data_access_factory.create_data_access() + dc_data_access = 
self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) if not duplicate_list_location.startswith("/"): - out_paths = data_access.output_folder.rstrip("/").split("/") + out_paths = dc_data_access.output_folder.rstrip("/").split("/") dupl_list_paths = duplicate_list_location.split("/") paths = out_paths[:-1] + dupl_list_paths duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") - self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) return self.params | {"df": self.duplicate_list} diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py index 5ed2cecbe..88171e260 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -17,6 +17,8 @@ from data_cleaning_transform import ( DataCleaningTransform, DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, duplicate_list_location_default, duplicate_list_location_key, ) @@ -90,15 +92,23 @@ def get_transform_config( :return: dictionary of filter init params """ data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) if not duplicate_list_location.startswith("/"): - out_paths = data_access.output_folder.rstrip("/").split("/") + out_paths = dc_data_access.output_folder.rstrip("/").split("/") dupl_list_paths = duplicate_list_location.split("/") paths = out_paths[:-1] + dupl_list_paths duplicate_list_location = "/".join([p.strip("/") for p in paths]) if duplicate_list_location.startswith("s3://"): _, duplicate_list_location = duplicate_list_location.split("://") - duplicate_list, retries = data_access.get_file(duplicate_list_location) + duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) docs_to_remove_list = ray.put(duplicate_list) return {"df": docs_to_remove_list} | self.params diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py index 56c10d801..2ff0df8bf 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -15,6 +15,8 @@ from data_cleaning_transform import ( DataCleaningTransformConfiguration, + 
dataclean_data_access_key,
+    dataclean_data_factory_key,
     duplicate_list_location_default,
     duplicate_list_location_key,
 )
 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.transform import TransformStatistics
 from data_processing.utils import get_logger
@@ -57,15 +59,23 @@ def get_transform_config(
         :return: dictionary of transform init params
         """
         data_access = data_access_factory.create_data_access()
+        dc_data_access = self.params.get(dataclean_data_access_key, None)
+        if dc_data_access is None:
+            dc_daf = self.params.get(dataclean_data_factory_key, None)
+            if dc_daf is None:
+                raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}")
+            dc_data_access = dc_daf.create_data_access()
+        if dc_data_access.output_folder is None:
+            dc_data_access.output_folder = data_access.output_folder
         duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default)
         if not duplicate_list_location.startswith("/"):
-            out_paths = data_access.output_folder.rstrip("/").split("/")
+            out_paths = dc_data_access.output_folder.rstrip("/").split("/")
             dupl_list_paths = duplicate_list_location.split("/")
             paths = out_paths[:-1] + dupl_list_paths
             duplicate_list_location = "/".join([p.strip("/") for p in paths])
         if duplicate_list_location.startswith("s3://"):
             _, duplicate_list_location = duplicate_list_location.split("://")
-        self.duplicate_list, retries = data_access.get_file(duplicate_list_location)
+        self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location)
         return self.params | {"df": self.duplicate_list}

From 3a3050125ef8987f85ba85b59ca13f928f812584 Mon Sep 17 00:00:00 2001
From: David Wood
Date: Fri, 1 Nov 2024 10:18:34 -0400
Subject: [PATCH 058/105] get fdedup/python test-image to pass, and clean up
 req in ray version

Signed-off-by: David Wood
---
 transforms/universal/fdedup/python/Dockerfile       |  2 +-
 transforms/universal/fdedup/python/requirements.txt | 10 ----------
 transforms/universal/fdedup/ray/Dockerfile          |  3 +--
 transforms/universal/fdedup/ray/pyproject.toml      |  1 +
 4 files changed, 3 insertions(+), 13 deletions(-)
 delete mode 100644 transforms/universal/fdedup/python/requirements.txt

diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile
index a0a557060..f8c41791e 100644
--- a/transforms/universal/fdedup/python/Dockerfile
+++ b/transforms/universal/fdedup/python/Dockerfile
@@ -18,7 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
 COPY --chown=dpk:root src/ src/
 COPY --chown=dpk:root pyproject.toml pyproject.toml
 COPY --chown=dpk:root README.md README.md
-COPY --chown=dpk:root requirements.txt requirements.txt
+#COPY --chown=dpk:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -e .
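For reference, the relative-path resolution repeated in the Python, Ray, and Spark variants of get_transform_config above can be exercised standalone. This is an illustrative sketch, not part of any patch in the series; the helper name and sample paths are hypothetical:

# Illustrative sketch (not part of the patch series) of the duplicate-list
# path resolution used above. Unlike os.path.abspath, the string-based join
# leaves "s3://" URLs intact.
def resolve_duplicate_list_location(output_folder: str, duplicate_list_location: str) -> str:
    if duplicate_list_location.startswith("/"):
        return duplicate_list_location  # absolute local path, used as-is
    out_paths = output_folder.rstrip("/").split("/")
    dupl_list_paths = duplicate_list_location.split("/")
    # resolve relative to the parent of the output folder
    paths = out_paths[:-1] + dupl_list_paths
    return "/".join([p.strip("/") for p in paths])

# hypothetical sample paths:
resolve_duplicate_list_location(
    "/data/fdedup/output", "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
)  # -> "/data/fdedup/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
resolve_duplicate_list_location(
    "s3://bucket/fdedup/output", "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet"
)  # -> "s3://bucket/fdedup/docs_to_remove_consolidated/..."; the transforms then
#      strip the "s3://" scheme before calling data_access.get_file()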
diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt deleted file mode 100644 index 576c028a8..000000000 --- a/transforms/universal/fdedup/python/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -pyarrow==16.1.0 -pyyaml>=6.0.2 -boto3>=1.34.69 -kubernetes>=30.1.0 -polars==1.9.0 -disjoint-set>=0.8.0 -numpy<1.29.0 -sentencepiece>=0.2.0 -mmh3>=4.1.0 -scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index ec2c56f28..e921c4749 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,5 +1,4 @@ -ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - +ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} USER ray diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 9c533231a..6a871abea 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -10,6 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ + "dpk_fdedup_transform_python==0.2.2.dev1", "data-prep-toolkit-ray==0.2.2.dev1", "mmh3>=4.1.0", "xxhash==3.4.1", From 80ae8df747998feb0f4dba49ec4322ace854d01c Mon Sep 17 00:00:00 2001 From: nelson Date: Fri, 8 Nov 2024 16:51:39 -0500 Subject: [PATCH 059/105] Added an option to run either word or char shingle Signed-off-by: nelson --- .../fdedup/python/src/fuzzy_dedup_python.py | 8 ++++++ .../python/src/signature_calc_transform.py | 26 ++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py index 7135054d2..bc5f3fded 100644 --- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py +++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py @@ -57,6 +57,7 @@ signature_calc_transform.jaccard_similarity_threshold_key, signature_calc_transform.word_shingle_size_key, signature_calc_transform.num_segments_key, + signature_calc_transform.shingle_option_key, ], "cluster": [ cluster_analysis_transform.jaccard_similarity_threshold_key, @@ -240,6 +241,13 @@ def parse_args() -> argparse.Namespace: default=None, help="ast string of options for s3 credentials", ) + parser.add_argument( + "--shingle_option", + type=str, + required=False, + default="word", + help="Option used for shingling", + ) return parser.parse_args() diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 159697d19..2ed3ed258 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -48,6 +48,8 @@ """ This key holds the size of the word shingles calculated for each document""" num_segments_key = "num_segments" """ This key holds the number of segments across which we divide the hashing space for each band""" +shingle_option_key = "shingle_option" +""" This key holds the option that is used to do shingles calculation for each document""" # command line arguments document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" @@ -68,6 +70,8 @@ """ The size of the word shingles calculated for each document""" num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide 
the hashing space for each band"""
+shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}"
+""" The option used to compute shingles for each document: 'word' or 'char'"""

 captured_arg_keys = [
     document_id_column_key,
@@ -100,6 +104,8 @@
 """ Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)"""
 num_segments_default = 1
 """ Default number of segments across which we divide the hashing space for each band"""
+shingle_option_default = "word"
+""" Default shingling option"""


 sigcalc_data_factory_key = "sc_data_factory"
@@ -162,6 +168,7 @@ def __init__(self, config: dict[str, Any]):
         self.num_segments = config.get(num_segments_key, num_segments_default)
         self.num_bands = config.get(num_bands_key, num_bands_default)
         self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default)
+        self.shingle_option = config.get(shingle_option_key, shingle_option_default)
         # use this dataframe to store the minhashes and size for each document
         self.all_minhashes: pl.DataFrame = None
         # use this dataframe to store the band hashes for each document
@@ -202,7 +209,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
         # generate minhash values
         minhashes = df.map_rows(
             lambda row: mm_min_hash.minhash2_nosalt(
-                *self._generate_word_shingles(row, window_size=self.word_shingle_size)
+                *self._generate_word_shingles(row, self.shingle_option, window_size=self.word_shingle_size)
             )
         )
         # rename columns, cast minhashes to list(uint32)
@@ -353,7 +360,9 @@ def write_band_signatures(self):
         return [], metadata

     # define shingles generation function
-    def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: str = " ") -> tuple[list, int, int]:
+    def _generate_word_shingles(
+        self, row: tuple, shingling_option: str, window_size: int = 5, delimiter: str = " "
+    ) -> tuple[list, int, int]:
         text = row[0]
         # lower case
         text = text.lower()
@@ -366,7 +375,10 @@ def _generate_word_shingles(self, row: tuple, window_size: int = 5, delimiter: s
         # diacritics/unicode normalization
         text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn")
         text = text.strip()
-        words = text.split()
+        if shingling_option == "char":
+            words = list(text)
+        else:
+            words = text.split()
         document_id = row[1]
         doc_len = len(row[0])
         word_count = len(words)
@@ -484,6 +496,12 @@ def add_input_params(self, parser: ArgumentParser) -> None:
             default=num_segments_default,
             help="the number of segments across which we divide the hashing space for each band",
         )
+        parser.add_argument(
+            f"--{shingle_option_cli_param}",
+            type=str,
+            default=shingle_option_default,
+            help="shingling option ('word' or 'char')",
+        )
         self.daf.add_input_params(parser=parser)

     def apply_input_params(self, args: Namespace) -> bool:

From c531809647c29de300052c1d9a698905bc904733 Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Sun, 10 Nov 2024 14:26:29 -0500
Subject: [PATCH 060/105] Use captured_arg_keys to list the arguments of each
 transform

Signed-off-by: Constantin M Adam
---
 .../fdedup/python/src/fuzzy_dedup_python.py   | 29 +++----------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
index 7135054d2..f3d0b0fdc 100644
--- a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
+++ b/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py
@@
-47,31 +47,10 @@ } ARGS_MAP = { - "minhash": [ - signature_calc_transform.contents_column_key, - signature_calc_transform.document_id_column_key, - signature_calc_transform.seed_key, - signature_calc_transform.num_permutations_key, - signature_calc_transform.num_bands_key, - signature_calc_transform.num_minhashes_per_band_key, - signature_calc_transform.jaccard_similarity_threshold_key, - signature_calc_transform.word_shingle_size_key, - signature_calc_transform.num_segments_key, - ], - "cluster": [ - cluster_analysis_transform.jaccard_similarity_threshold_key, - cluster_analysis_transform.num_bands_key, - cluster_analysis_transform.num_segments_key, - ], - "fdlist": [ - get_duplicate_list_transform.subfolder_key, - get_duplicate_list_transform.consolidated_filename_key, - ], - "fdclean": [ - data_cleaning_transform.document_id_column_key, - data_cleaning_transform.duplicate_list_location_key, - data_cleaning_transform.operation_mode_key, - ], + "minhash": signature_calc_transform.captured_arg_keys, + "cluster": cluster_analysis_transform.captured_arg_keys, + "fdlist": get_duplicate_list_transform.captured_arg_keys, + "fdclean": data_cleaning_transform.captured_arg_keys, } From fe431104ca2d171b451be76a0cd7716f268f9d52 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:28:06 -0500 Subject: [PATCH 061/105] Ray implementation for get_duplicate_list_transform Signed-off-by: Constantin M Adam --- .../fdedup/ray/src/fuzzy_dedup_ray.py | 6 +- .../src/get_duplicate_list_transform_ray.py | 69 +++++++++++++++++++ .../test_get_duplicate_list_transform_ray.py | 9 ++- 3 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py index 0d4c2954f..987369714 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py @@ -23,6 +23,10 @@ from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) +from get_duplicate_list_transform_ray import ( + GetDuplicateListRayRuntime, + GetDuplicateListRayTransformConfiguration, +) from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration @@ -56,7 +60,7 @@ def execute_service(self, service_short_name: str, params: list) -> int: elif service_short_name == "cluster": launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration()) elif service_short_name == "fdlist": - launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + launcher = RayTransformLauncher(runtime_config=GetDuplicateListRayTransformConfiguration()) elif service_short_name == "fdclean": launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) status = launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..40081e658 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py @@ -0,0 +1,69 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListRayRuntime(DefaultRayTransformRuntime): + """ + Get duplicate list runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Get Duplicate List + as required by the RayTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py index 4b59e3a7a..55869598c 100644 --- a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py +++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py @@ -12,14 +12,12 @@ import os -from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) +from data_processing_ray.runtime.ray import RayTransformLauncher from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_python import ( - GetDuplicateListPythonTransformConfiguration, -) +from get_duplicate_list_transform_ray import GetDuplicateListRayTransformConfiguration class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): @@ -31,9 +29,10 @@ class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) config = { + "run_locally": True, sort_output_cli_param: True, } - launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) fixtures = [ ( launcher, From 82a1860524e8ebd4c59ae0598356095d69021e3c Mon Sep 17 00:00:00 
2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:30:03 -0500 Subject: [PATCH 062/105] Bug fix: jaccard threshold type must be float Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/signature_calc_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 159697d19..b492eb3ae 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -456,7 +456,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: ) parser.add_argument( f"--{jaccard_similarity_threshold_cli_param}", - type=int, + type=float, default=jaccard_similarity_threshold_default, help="Jaccard similarity threshold above which two documents are duplicates", ) From 61ed40f347612787d32385df779d0d88fc4e3f88 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 14:31:18 -0500 Subject: [PATCH 063/105] Get fuzzy dedup ray image ready for kfp Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index e921c4749..d4b3ae484 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,8 +1,6 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} -USER ray - RUN pip install --upgrade --no-cache-dir pip # install pytest @@ -24,13 +22,20 @@ COPY --chown=ray:users README.md README.md RUN pip install --no-cache-dir -e . # copy source files needed by test-image -COPY ./src/signature_calc_transform_ray.py fdedup_transform_ray.py -COPY ./src/signature_calc_local_ray.py local/fdedup_local_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py +COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py +COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py +COPY --chown=ray:users ./src/data_cleaning_transform_ray.py data_cleaning_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ COPY test-data/ test-data/ +USER root +RUN chmod a+rwx /home/ray +USER ray # Set environment ENV PYTHONPATH /home/ray From a8ede002fba33a4e01df9421b60f30558b98260e Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 17:37:56 -0500 Subject: [PATCH 064/105] kfp implementation for fuzzy dedup Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 321 +++++++++---- .../src/fdedup_compute_execution_params.py | 437 ++++++++++-------- 2 files changed, 494 insertions(+), 264 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 3156ab6f1..1c3e8e570 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -14,14 +14,24 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from src.fdedup_compute_execution_params import fdedup_compute_execution_params +from src.fdedup_compute_execution_params import ( + 
cluster_analysis_compute_execution_params, + compute_common_params, + data_cleaning_compute_execution_params, + get_duplicate_list_compute_execution_params, + signature_calc_compute_execution_params, +) from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils -task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest" +task_image = os.getenv("FDEDUP_IMAGE_LOCATION", "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest") +image_pull_secret = os.getenv("FDEDUP_IMAGE_PULL_SECRET", "my_secret") # the name of the job script -EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" +SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "signature_calc_transform_ray.py" +CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "cluster_analysis_transform_ray.py" +GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "get_duplicate_list_transform_ray.py" +DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" @@ -40,8 +50,18 @@ # compilation time. import uuid - compute_exec_params_op = dsl.component_decorator.component( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = dsl.component_decorator.component(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = dsl.component_decorator.component( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = dsl.component_decorator.component( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = dsl.component_decorator.component( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = dsl.component_decorator.component( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) print( "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " @@ -49,61 +69,94 @@ ) run_id = uuid.uuid4().hex else: - compute_exec_params_op = comp.create_component_from_func( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = comp.create_component_from_func(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = comp.create_component_from_func( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = comp.create_component_from_func( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = comp.create_component_from_func( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = comp.create_component_from_func( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute signature calculation job +execute_signature_calc_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) +# execute cluster analysis job +execute_cluster_analysis_job_op = comp.load_component_from_file(component_spec_path + 
"executeRayJobComponent.yaml") +# execute get duplicate list job +execute_get_duplicate_list_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute data cleaning job +execute_data_cleaning_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. -TASK_NAME: str = "fdedup" +TASK_NAME: str = "fuzzydedup" @dsl.pipeline( name=TASK_NAME + "-ray-pipeline", - description="Pipeline for fdedup", + description="Pipeline for fuzzy dedup", ) -def fdedup( +def fuzzydedup( + # folders used # Ray cluster - ray_name: str = "fdedup-kfp-ray", # name of Ray cluster + ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_head_options: dict = { + "cpu": 1, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}", - data_s3_access_secret: str = "s3-secret", + data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", + data_s3_access_secret: str = "s3-south-secret", + scdata_s3_access_secret: str = "s3-south-secret", + dcdata_s3_access_secret: str = "s3-south-secret", data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # columns used - fdedup_doc_column: str = "contents", - fdedup_id_column: str = "int_id_column", - fdedup_cluster_column: str = "cluster", - # infrastructure - fdedup_bucket_cpu: float = 0.5, - fdedup_doc_cpu: float = 0.5, - fdedup_mhash_cpu: float = 0.5, + fdedup_contents_column: str = "contents", + fdedup_document_id_column: str = "int_id_column", # fuzzy parameters - fdedup_num_permutations: int = 64, - fdedup_threshold: float = 0.8, - fdedup_shingles_size: int = 5, - fdedup_delimiters: str = " ", - # Random delay between reads - fdedup_random_delay_limit: int = 5, - # snapshotting - fdedup_snapshot_delay: int = 1, - fdedup_use_doc_snapshot: bool = False, - fdedup_use_bucket_snapshot: bool = False, + fdedup_num_permutations: int = 112, + fdedup_num_bands: int = 14, + fdedup_num_minhashes_per_band: int = 8, + fdedup_word_shingle_size: int = 5, + fdedup_jaccard_similarity_threshold: float = 0.75, + fdedup_seed: int = 42, + fdedup_docs_to_remove_folder: str = "docs_to_remove", + 
fdedup_duplicate_list_location: str = os.path.join( + "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + ), + fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, # additional parameters @@ -136,89 +189,189 @@ def fdedup( wait_print_tmout - time between prints, sec http_retries - http retries for API server calls :param data_s3_access_secret - s3 access secret + :param scdata_s3_access_secret - signature calculation s3 access secret + :param dcdata_s3_access_secret - data cleaning s3 access secret :param data_s3_config - s3 configuration :param data_max_files - max files to process :param data_num_samples - num samples to process - :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location - :param fdedup_doc_column - document column name - :param fdedup_id_column - integer document id column name - :param fdedup_cluster_column - cluster column name - :param fdedup_bucket_cpu - number of CPUs per bucket hash - :param fdedup_doc_cpu - number of CPUs per doc hash - :param fdedup_mhash_cpu - number of CPUs per minhash hash + :param fdedup_contents_column - document column name + :param fdedup_document_id_column - integer document id column name :param fdedup_num_permutations - number of permutations - :param fdedup_threshold - threshold - :param fdedup_shingles_size - number of words in shingle - :param fdedup_delimiters - delimiter for splitting document - :param fdedup_random_delay_limit - delay between reads to reduce S3 load. - A random number between 0 and random_delay_limit is used - :param fdedup_snapshot_delay - delay between restoring individual actors - :param fdedup_use_bucket_snapshot - flag to skip buckets building and start from existing snapshots - :param fdedup_use_doc_snapshot - flag to skip documents building and start from existing snapshots + :param fdedup_num_bands - number of bands + :param fdedup_num_minhashes_per_band - length of a band + :param fdedup_word_shingle_size - length of word shingles + :param fdedup_jaccard_similarity_threshold - similarity threshold + :param fdedup_seed - seed for the random number generator + :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids + :param fdedup_duplicate_list_location - name of the file holding the consolidated list of duplicates + :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, + compute_common_exec_params = compute_common_params_op( + ray_worker_options, + data_s3_config, + fdedup_num_permutations, + fdedup_n_samples, + ) + ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) + ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) + fdedup_num_segments = 
compute_common_exec_params.outputs["num_segments"] + runtime_actor_cpus = compute_common_exec_params.outputs["cpus_per_actor"] + runtime_num_actors = compute_common_exec_params.outputs["num_actors"] + + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=run_id, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_common_exec_params) + + # Get the parameters for the signature calculation job + compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - doc_column=fdedup_doc_column, - id_column=fdedup_id_column, - cluster_column=fdedup_cluster_column, - bucket_cpu=fdedup_bucket_cpu, - doc_cpu=fdedup_doc_cpu, - mhash_cpu=fdedup_mhash_cpu, + doc_column=fdedup_contents_column, + id_column=fdedup_document_id_column, num_permutations=fdedup_num_permutations, - threshold=fdedup_threshold, - shingles_size=fdedup_shingles_size, - delimiters=fdedup_delimiters, - random_delay_limit=fdedup_random_delay_limit, - snapshot_delay=fdedup_snapshot_delay, - use_doc_snapshot=fdedup_use_doc_snapshot, - use_bucket_snapshot=fdedup_use_bucket_snapshot, - n_samples=fdedup_n_samples, + num_bands=fdedup_num_bands, + num_minhashes_per_band=fdedup_num_minhashes_per_band, + word_shingle_size=fdedup_word_shingle_size, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + seed=fdedup_seed, ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + ComponentUtils.add_settings_to_component(compute_signature_calc_exec_params, ONE_HOUR_SEC * 2) + compute_signature_calc_exec_params.after(ray_cluster) - # start Ray cluster - ray_cluster = create_ray_op( + # Execute signature calculation job + execute_signature_calc_job = execute_signature_calc_job_op( ray_name=ray_name, run_id=run_id, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, + additional_params=additional_params, + exec_params=compute_signature_calc_exec_params.output, + exec_script_name=SIGNATURE_CALC_EXEC_SCRIPT_NAME, server_url=server_url, + prefix="scdata", + ) + ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) + execute_signature_calc_job.after(compute_signature_calc_exec_params) + + # Get the parameters for the cluster analysis job + compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + num_bands=fdedup_num_bands, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, 
+ ) + ComponentUtils.add_settings_to_component(compute_cluster_analysis_exec_params, ONE_HOUR_SEC * 2) + compute_cluster_analysis_exec_params.after(execute_signature_calc_job) + # Execute job + execute_cluster_analysis_job = execute_cluster_analysis_job_op( + ray_name=ray_name, + run_id=run_id, additional_params=additional_params, + exec_params=compute_cluster_analysis_exec_params.output, + exec_script_name=CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME, + server_url=server_url, ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) + ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) + + compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + duplicate_docids_folder=fdedup_docs_to_remove_folder, + duplicate_list_location=fdedup_duplicate_list_location, + ) + ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) + compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) # Execute job - execute_job = execute_ray_jobs_op( + execute_get_duplicate_list_job = execute_get_duplicate_list_job_op( ray_name=ray_name, run_id=run_id, additional_params=additional_params, - exec_params=compute_exec_params.output, - exec_script_name=EXEC_SCRIPT_NAME, + exec_params=compute_get_duplicate_list_exec_params.output, + exec_script_name=GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME, server_url=server_url, ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) + ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) + + compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( + runtime_actor_cpus=runtime_actor_cpus, + runtime_num_actors=runtime_num_actors, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + id_column=fdedup_document_id_column, + duplicate_list_location=fdedup_duplicate_list_location, + operation_mode=fdedup_operation_mode, + ) + ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) + compute_data_cleaning_exec_params.after(execute_get_duplicate_list_job) + + # Execute job + execute_data_cleaning_job = execute_data_cleaning_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_data_cleaning_exec_params.output, + exec_script_name=DATA_CLEANING_EXEC_SCRIPT_NAME, + server_url=server_url, + prefix="dcdata", + ) + ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) + 
ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret)
+        ComponentUtils.set_s3_env_vars_to_component(
+            execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata"
+        )
+        execute_data_cleaning_job.after(compute_data_cleaning_exec_params)


 if __name__ == "__main__":
     # Compiling the pipeline
-    compiler.Compiler().compile(fdedup, __file__.replace(".py", ".yaml"))
+    compiler.Compiler().compile(fuzzydedup, __file__.replace(".py", ".yaml"))

diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
index 726200339..c5ff4d52b 100644
--- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
+++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
@@ -10,10 +10,77 @@
 #  limitations under the License.
 ################################################################################

+from typing import Any, Dict, NamedTuple

-def fdedup_compute_execution_params(
+
+def compute_common_params(
     worker_options: dict,  # ray worker configuration
-    actor_options: dict,  # actor's resource requirements
+    data_s3_config: str,  # S3 configuration
+    num_permutations: int,  # number of permutations (minhashes) per document
+    n_samples: int,  # files to sample for number of documents estimation
+) -> NamedTuple("fdedup_params", [("num_segments", int), ("num_actors", int), ("cpus_per_actor", float)]):
+
+    import sys
+
+    from data_processing.data_access import DataAccessS3
+    from data_processing.utils import GB
+    from runtime_utils import KFPUtils
+
+    # get credentials
+    s3_key, s3_secret, s3_endpoint = KFPUtils.credentials()
+    s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint}
+    s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"'))
+    # because S3 is the only viable storage option for the kfp-based implementation, create DataAccessS3 directly here
+    data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1)
+    # sample input data
+    sampling: dict[str, Any]
+    sampling, _ = data_access.sample_input_data(n_samples=n_samples)
+    number_of_docs = int(sampling.get("estimated number of docs"))
+    if number_of_docs == 0:
+        print("Estimated number of documents and document size are zero. Please verify the input path.")
+        sys.exit(1)
+    print(f"Estimated number of docs: {number_of_docs}")
+    # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where:
+    #   8 bytes are taken by the band hash
+    #   (num_permutations * 4) bytes are taken by the min hashes
+    #   20 bytes provide some extra space for storage in a table
+    # The total amount of space needed by a band is number_of_docs * doc_bytes.
+    # To scale the handling of this data, divide each band into segments, where each segment size is below 3GB
+    doc_bytes = 8 + num_permutations * 4 + 20
+    band_bytes = number_of_docs * doc_bytes
+    num_segments = 1 + (band_bytes // (3 * GB))
+    print(f"Number of segments: {num_segments}")
+
+    # To process data efficiently, each actor needs 16GB of memory.
+    # The actor config controls CPU allocation, not memory;
+    # use CPU allocation such that the number of actors on a worker gives each actor access to 16GB of memory.
+ # Also, to keep S3 utilization in check, limit the number of actors to 2000 + num_nodes = worker_options["replicas"] + cpu_per_node = worker_options["cpu"] - 1 + memory_per_node = 0.85 * worker_options["memory"] + + memory_per_actor = 16 # GB + max_num_actors = 2000 + num_actors_per_node: int = int(memory_per_node / memory_per_actor) + if num_actors_per_node == 0: + num_actors_per_node = 1 + num_actors = num_nodes * num_actors_per_node + while num_actors > max_num_actors: + num_actors -= num_nodes + num_actors_per_node -= 1 + print(f"Number of actors per node = {num_actors_per_node}") + cpus_per_actor = cpu_per_node / num_actors_per_node + print(f"CPUs per actor = {cpus_per_actor}") + + from collections import namedtuple + + fdedup_params = namedtuple("fdedup_params", ["num_segments", "num_actors", "cpus_per_actor"]) + return fdedup_params(num_segments, num_actors, cpus_per_actor) + + +def signature_calc_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -22,27 +89,19 @@ def fdedup_compute_execution_params( runtime_code_location: dict, # code location doc_column: str, # document column name id_column: str, # integer document id column name - cluster_column: str, # cluster column name - bucket_cpu: float, # number of CPUs per bucket hash - doc_cpu: float, # number of CPUs per doc hash - mhash_cpu: float, # number of CPUs per minhash hash num_permutations: int, # number of permutations + num_bands: int, # number of bands + num_minhashes_per_band: int, # band length + word_shingle_size: int, # number of words in shingle threshold: float, # threshold, - shingles_size: int, # number of words in shingle - delimiters: str, # delimiter for splitting document - random_delay_limit: int, # delay between reads to reduce S3 load. 
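To make the sizing arithmetic in compute_common_params concrete, here is a worked example as a standalone sketch (not part of the patch; the document count and worker_options values are assumptions chosen for illustration):

```python
# Worked example of the segment and actor sizing above (illustrative only).
GB = 1024 * 1024 * 1024  # mirrors data_processing.utils.GB

number_of_docs = 100_000_000  # assumed sampling estimate
num_permutations = 112        # e.g. 14 bands x 8 minhashes per band
worker_options = {"replicas": 5, "cpu": 16, "memory": 64}  # hypothetical cluster

doc_bytes = 8 + num_permutations * 4 + 20    # 476 bytes per document
band_bytes = number_of_docs * doc_bytes      # ~44.3 GiB per band
num_segments = 1 + (band_bytes // (3 * GB))  # 15 segments, each below 3 GiB

cpu_per_node = worker_options["cpu"] - 1           # 15 CPUs left for actors
memory_per_node = 0.85 * worker_options["memory"]  # 54.4 GB usable per node
num_actors_per_node = int(memory_per_node / 16)    # 3 actors of 16 GB each
num_actors = worker_options["replicas"] * num_actors_per_node  # 15 actors
cpus_per_actor = cpu_per_node / num_actors_per_node            # 5.0 CPUs/actor

print(num_segments, num_actors, cpus_per_actor)  # 15 15 5.0
```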
- # A random number between 0 and random_delay_limit is used - snapshot_delay: int, # delay between restoring individual actors - use_doc_snapshot: bool, # flag to skip documents building and start from existing snapshots - use_bucket_snapshot: bool, # flag to skip buckets building and start from existing snapshots - n_samples: int, # number of samples to use -) -> dict: # NamedTuple( - # "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] + num_segments: int, # number of segments + seed: int, # seed for the random number generator +) -> dict: """ - Compute fuzzy dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements + Compute fuzzy dedup execution parameters for signature calculation + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -51,182 +110,200 @@ def fdedup_compute_execution_params( :param runtime_code_location: code location :param doc_column: document column name :param id_column: integer document id column name - :param cluster_column: cluster column name - :param bucket_cpu: number of CPUs per bucket hash - :param doc_cpu: number of CPUs per doc hash - :param mhash_cpu: number of CPUs per minhash hash :param num_permutations: number of permutations + :param num_bands: number of bands + :param num_minhashes_per_band: band length + :param word_shingle_size: number of words in shingle :param threshold: threshold, - :param shingles_size: number of words in shingle - :param delimiters: delimiter for splitting document - :param random_delay_limit: # delay between reads to reduce S3 load. 
A random number between 0 and random_delay_limit is used - :param snapshot_delay: delay between restoring individual actors - :param use_doc_snapshot: flag to skip documents building and start from existing snapshots - :param use_bucket_snapshot: flag to skip buckets building and start from existing snapshots - :param n_samples: number of samples to use + :param num_segments: number of segments + :param seed: seed for the random number generator :return: a dictionary with a Ray Job execution parameters """ - import math - import sys - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from runtime_utils import KFPUtils - from scipy.integrate import quad as integrate - - EXECUTION_OF_KB_DOC = 0.003 - - def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, - ) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt + # fuzzy parameters for signature calculation + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + print(f"runtime_actor_options = {runtime_actor_options}") + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "minhash_contents_column": doc_column, + "minhash_document_id_column": id_column, + "minhash_num_permutations": num_permutations, + "minhash_num_bands": num_bands, + "minhash_num_minhashes_per_band": num_minhashes_per_band, + "minhash_word_shingle_size": word_shingle_size, + "minhash_jaccard_similarity_threshold": threshold, + "minhash_num_segments": num_segments, + "minhash_seed": seed, + "scdata_s3_config": data_s3_config, + } + + +def cluster_analysis_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + 
data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + num_bands: int, # number of bands + threshold: float, # threshold, + num_segments: int, # number of segments +) -> dict: + + """ + Compute fuzzy dedup execution parameters for cluster analysis + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param num_bands: number of bands + :param threshold: threshold, + :param num_segments: number of segments + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os # fuzzy parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=threshold, - num_perm=num_permutations, - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - print(f"Fuzzy parameters: num buckets {num_buckets}, bucket length {length_bucket}") # Get cluster parameters - cluster_cpu = worker_options["replicas"] * worker_options["cpu"] - cluster_memory = worker_options["replicas"] * worker_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu -= 1 - cluster_memory *= 0.85 - # get actor requirements - actor_cpu = actor_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) - if type(s3_config) is list: - # S3 config is list. take the first element - s3_config = s3_config[0] - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling, _ = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - avg_table_size = sampling.get("average table size MB") / KB - if number_of_docs == 0: - print(f"Estimated number of documents and documents size is zero. Please verify the input path.") - sys.exit(1) - # we are creating more buckets actors, so that we get better parallelization for bucket processing - b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB) - d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) - m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) - # compute cpu requirements - # Define number of preprocessors. 
We are assuming that preprocessors and workers are using the same amount - # of CPUs - n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu - ) - if n_preprocessors <= 0: - print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") - print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") - print("Try to increase the size of the cluster") - sys.exit(1) - # compute the amount of workers - n_workers = int((0.85 * cluster_cpu - d_actors * doc_cpu) / actor_cpu) - # Ensure that we do not overwhelm S3 - if n_workers > 2000: - n_workers = 2000 - print( - f"Number of preprocessors: {n_preprocessors}, Number of workers: {n_workers}, bucket actors {b_actors}, " - f"minhash actors {m_actors}, document actors {d_actors}" - ) - - # Make sure that we have enough memory - r_mem = avg_table_size * 4 * n_preprocessors + 2 * (b_actors + m_actors + d_actors) - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker (current {actor_cpu})") - sys.exit(1) + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") + data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "cluster_num_bands": num_bands, + "cluster_jaccard_similarity_threshold": threshold, + "cluster_num_segments": num_segments, + } - print( - f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * mhash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" - ) - projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 - print(f"Projected execution time {projected_execution} min") +def get_duplicate_list_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + duplicate_docids_folder: str, # folder with the docs IDs to remove + duplicate_list_location: str, # location of the list of duplicate doc ids +) -> dict: + """ + Compute fuzzy dedup execution parameters for get duplicate list step + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + 
:param duplicate_docids_folder: folder with the docs IDs to remove + :param duplicate_list_location: location of the list of duplicate doc ids + :return: a dictionary with a Ray Job execution parameters + """ + import json + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = base_folder + data_s3_config_dict["output_folder"] = base_folder + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "fdlist_docs_to_remove": duplicate_docids_folder, + "fdlist_consolidated_filename": duplicate_list_location, + } + + +def data_cleaning_compute_execution_params( + runtime_actor_cpus: float, # actor's CPU requirements + runtime_num_actors: int, # number of actors needed to run this step + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + id_column: str, # integer document id column name + duplicate_list_location: str, # location of the list of duplicate doc ids + operation_mode: str, # filter (non-)duplicates or annotate +) -> dict: + """ + Compute fuzzy dedup execution parameters + :param runtime_actor_cpus: actor's CPU requirements + :param runtime_num_actors: number of actors to run this step + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param id_column: integer document id column name + :param duplicate_list_location: location of the list of duplicate doc ids + :param operation_mode: filter (non-)duplicates or annotate + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os + + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, - "runtime_num_workers": n_workers, - "runtime_worker_options": str(actor_options), + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(runtime_actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), - "fdedup_doc_column": doc_column, - 
"fdedup_id_column": id_column, - "fdedup_cluster_column": cluster_column, - "fdedup_bucket_cpu": bucket_cpu, - "fdedup_doc_cpu": doc_cpu, - "fdedup_mhash_cpu": mhash_cpu, - "fdedup_num_doc_actors": d_actors, - "fdedup_num_bucket_actors": b_actors, - "fdedup_num_minhash_actors": m_actors, - "fdedup_num_preprocessors": n_preprocessors, - "fdedup_num_permutations": num_permutations, - "fdedup_threshold": threshold, - "fdedup_shingles_size": shingles_size, - "fdedup_delimiters": delimiters, - "fdedup_random_delay_limit": random_delay_limit, - "fdedup_snapshot_delay": snapshot_delay, - "fdedup_use_doc_snapshot": use_doc_snapshot, - "fdedup_use_bucket_snapshot": use_bucket_snapshot, + "fdclean_document_id_column": id_column, + "fdclean_duplicate_list_location": duplicate_list_location, + "fdclean_operation_mode": operation_mode, } From 96edea4fe2cb976e0e20a7b0299a022ecd378ef0 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Sun, 10 Nov 2024 22:08:05 -0500 Subject: [PATCH 065/105] Added params to captured_arg_keys Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/data_cleaning_transform.py | 1 + .../universal/fdedup/python/src/signature_calc_transform.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 1a349ae85..74597068c 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -44,6 +44,7 @@ captured_arg_keys = [ document_id_column_key, duplicate_list_location_key, + operation_mode_key, ] # defaults diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index c63fa3576..6b14e1ba0 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -71,7 +71,7 @@ num_segments_cli_param = f"{cli_prefix}{num_segments_key}" """ The number of segments across which we divide the hashing space for each band""" shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}" -""" This key holds the option that is used to do shingles calculation for each document""" +""" The option (word/char) used to do shingles calculation for each document""" captured_arg_keys = [ document_id_column_key, @@ -83,6 +83,7 @@ jaccard_similarity_threshold_key, word_shingle_size_key, num_segments_key, + shingle_option_key, ] # defaults @@ -375,8 +376,7 @@ def _generate_word_shingles( # diacritics/unicode normalization text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn") text = text.strip() - print(shingling_option) - print("=============") + self.logger.debug(shingling_option) if shingling_option == "char": words = list(text) else: From 1a70530af57f530d5ac98acacafbf94512a977b3 Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Mon, 11 Nov 2024 12:13:53 +0900 Subject: [PATCH 066/105] update readme following template https://github.com/IBM/data-prep-kit/issues/753#issuecomment-2460867526 Signed-off-by: Daiki Tsuzuku --- .../language/doc_quality/python/README.md | 57 +++++++++++++++---- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/transforms/language/doc_quality/python/README.md b/transforms/language/doc_quality/python/README.md index 38421f34f..f3944cdc0 100644 --- 
a/transforms/language/doc_quality/python/README.md +++ b/transforms/language/doc_quality/python/README.md @@ -1,13 +1,21 @@ # Document Quality Transform + Please see the set of [transform project conventions](../../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. -## Summary -This transform will calculate and annotate several metrics related to document, which are usuful to see the quality of document. +## Description +This transform will calculate and annotate several metrics related to document, which are usuful to see the quality of document. +Text is the type of data this transform operates on. + +### Input -In this transform, following metrics will be included: +| input column name | data type | descrition | +|-|-|-| +| the one specified in _doc_content_column_ configuration | string | text whose quality will be calculated by this transform | + +### Output columns annotated by this transform | output column name | data type | description | supported language | |-|-|-|-| @@ -27,7 +35,7 @@ In this transform, following metrics will be included: You can see more detailed backgrounds of some columns in [Deepmind's Gopher paper](https://arxiv.org/pdf/2112.11446.pdf) -## Configuration and command line Options +## Configuration The set of dictionary keys holding [DocQualityTransform](src/doc_quality_transform.py) configuration for values are as follows: @@ -36,13 +44,19 @@ configuration for values are as follows: * _doc_content_column_ - specifies column name that contains document text. By default, "contents" is used. * _bad_word_filepath_ - specifies a path to bad word file: local folder (file or directory) that points to bad word file. You don't have to set this parameter if you don't need to set bad words. -## Running +Example +``` +{ + text_lang_key: "en", + doc_content_column_key: "contents", + bad_word_filepath_key: os.path.join(basedir, "ldnoobw", "en"), +} +``` + +## Usage ### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -the options provided by -the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). +The following command line arguments are available ``` --docq_text_lang DOCQ_TEXT_LANG language used in the text content. By default, "en" is used. --docq_doc_content_column DOCQ_DOC_CONTENT_COLUMN column name that contain document text. By default, "contents" is used. @@ -70,6 +84,9 @@ ls output ``` To see results of the transform. +### Code example + +TBD (link to the notebook will be provided) ### Transforming data using the transform image @@ -77,7 +94,27 @@ To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. 
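Until the notebook referenced above is available, a minimal invocation sketch may help. The launcher import is the standard data-prep-kit pure-python launcher, but the transform module and configuration class names are assumptions based on this transform's source layout, so verify them against `src/`:

```python
# Hypothetical local run of the doc_quality transform (class/module names
# are assumptions; the CLI flags match the options documented above).
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from doc_quality_transform_python import DocQualityPythonTransformConfiguration

sys.argv = [
    "doc_quality",
    "--data_local_config", "{'input_folder': 'test-data/input', 'output_folder': 'output'}",
    "--docq_text_lang", "en",
    "--docq_doc_content_column", "contents",
]
launcher = PythonTransformLauncher(runtime_config=DocQualityPythonTransformConfiguration())
launcher.launch()
```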
+## Testing
+
+We follow [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md).
+
+Currently we have:
+- [Unit test](test/test_doc_quality_python.py)
+- [Integration test](test/test_doc_quality.py)
+
+
+## Further Resources
+
+- For those who want to learn C4 heuristic rules
+  - https://arxiv.org/pdf/1910.10683.pdf
+- For those who want to learn Gopher statistics
+  - https://arxiv.org/pdf/2112.11446.pdf
+- For those who want to see the source of the bad words used by default
+  - https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
+
+
+## Considerations

-## Troubleshooting guide
+### Troubleshooting guide

 For M1 Mac user, if you see following error during make command, `error: command '/usr/bin/clang' failed with exit code 1`, you may better follow [this step](https://freeman.vc/notes/installing-fasttext-on-an-m1-mac)
\ No newline at end of file

From 24163af9d00f7603b9ec17091c785c0fead8eaae Mon Sep 17 00:00:00 2001
From: Constantin M Adam
Date: Mon, 11 Nov 2024 09:36:19 -0500
Subject: [PATCH 067/105] Add shingle type option (word or char) to kfp

Signed-off-by: Constantin M Adam

---
 transforms/universal/fdedup/kfp_ray/fdedup_wf.py          | 3 +++
 .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
index 1c3e8e570..139a0f919 100644
--- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
+++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py
@@ -150,6 +150,7 @@ def fuzzydedup(
     fdedup_num_bands: int = 14,
     fdedup_num_minhashes_per_band: int = 8,
     fdedup_word_shingle_size: int = 5,
+    fdedup_shingle_option: str = "word",
     fdedup_jaccard_similarity_threshold: float = 0.75,
     fdedup_seed: int = 42,
     fdedup_docs_to_remove_folder: str = "docs_to_remove",
@@ -202,6 +203,7 @@ def fuzzydedup(
     :param fdedup_num_bands - number of bands
     :param fdedup_num_minhashes_per_band - length of a band
     :param fdedup_word_shingle_size - length of word shingles
+    :param fdedup_shingle_option - type of shingle, one of 'word' or 'char'
     :param fdedup_jaccard_similarity_threshold - similarity threshold
     :param fdedup_seed - seed for the random number generator
     :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids
@@ -258,6 +260,7 @@ def fuzzydedup(
             num_bands=fdedup_num_bands,
             num_minhashes_per_band=fdedup_num_minhashes_per_band,
             word_shingle_size=fdedup_word_shingle_size,
+            shingle_option=fdedup_shingle_option,
             threshold=fdedup_jaccard_similarity_threshold,
             num_segments=fdedup_num_segments,
             seed=fdedup_seed,
diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
index c5ff4d52b..65b7ac2f6 100644
--- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
+++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py
@@ -93,6 +93,7 @@ def signature_calc_compute_execution_params(
     num_bands: int,  # number of bands
     num_minhashes_per_band: int,  # band length
     word_shingle_size: int,  # number of words in shingle
+    shingle_option: str,  # type of shingle, one of 'word' or 'char'
     threshold: float,  # threshold,
     num_segments: int,  # number of segments
     seed: int,  # seed for the random number generator
@@ -114,6 +115,7 @@
     :param num_bands: number of bands
     :param num_minhashes_per_band: band length
     :param word_shingle_size: number of words in shingle
+    :param shingle_option: type of shingle, one of 'word' or 'char'
     :param threshold: threshold,
     :param num_segments: number of segments
     :param seed: seed for the random number generator
@@ -138,6 +140,7 @@
         "minhash_num_bands": num_bands,
         "minhash_num_minhashes_per_band": num_minhashes_per_band,
         "minhash_word_shingle_size": word_shingle_size,
+        "minhash_shingle_option": shingle_option,
         "minhash_jaccard_similarity_threshold": threshold,
         "minhash_num_segments": num_segments,
         "minhash_seed": seed,

From ecb87b0afd8042d122edc549639880c8b74d6ad5 Mon Sep 17 00:00:00 2001
From: Daiki Tsuzuku
Date: Wed, 13 Nov 2024 10:23:10 +0900
Subject: [PATCH 068/105] fix typo and update description

Signed-off-by: Daiki Tsuzuku

---
 transforms/language/doc_quality/python/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transforms/language/doc_quality/python/README.md b/transforms/language/doc_quality/python/README.md
index f3944cdc0..1e060018d 100644
--- a/transforms/language/doc_quality/python/README.md
+++ b/transforms/language/doc_quality/python/README.md
@@ -6,12 +6,12 @@
 for details on general project conventions, transform configuration, testing
 and IDE set up.

 ## Description
-This transform will calculate and annotate several metrics related to document, which are usuful to see the quality of document.
-Text is the type of data this transform operates on.
+This transform will calculate and annotate several metrics which are useful to assess the quality of the document.
+
The document quality transform operates on text documents only From 3a43c3d4370cdb31949a11190804552716a3adce Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 10:53:09 -0500 Subject: [PATCH 070/105] Utility to calculate number of bands and length of a band Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/utils/Makefile | 16 ++++ .../universal/fdedup/utils/calc_r_and_b.ipynb | 74 +++++++++++++++++++ .../universal/fdedup/utils/requirements.txt | 3 + 3 files changed, 93 insertions(+) create mode 100644 transforms/universal/fdedup/utils/Makefile create mode 100644 transforms/universal/fdedup/utils/calc_r_and_b.ipynb create mode 100644 transforms/universal/fdedup/utils/requirements.txt diff --git a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile new file mode 100644 index 000000000..dae3f30ea --- /dev/null +++ b/transforms/universal/fdedup/utils/Makefile @@ -0,0 +1,16 @@ +PYTHON=python +PIP=pip + +venv: requirements.txt + $(PYTHON) -m venv venv + if [ -e venv/Scripts/activate ]; then \ + echo "For Windows please try the following AS Administrator - no guarantees"; \ + echo " venv\\Scripts\\activate"; \ + echo " pip install --upgrade pip"; \ + echo " pip install -r requirements.txt"; \ + echo " pip install pytest"; \ + else \ + . venv/bin/activate; \ + $(PIP) install --upgrade pip; \ + $(PIP) install -r requirements.txt; \ + fi diff --git a/transforms/universal/fdedup/utils/calc_r_and_b.ipynb b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb new file mode 100644 index 000000000..8398f9efa --- /dev/null +++ b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb @@ -0,0 +1,74 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf5dba9a-d530-4a0a-ae71-2d741f7e705f", + "metadata": {}, + "source": [ + "This notebook allows calculating the values for `b` (the number of bands) and `r` (the number of minhashes in a band) used in the fuzzy dedup algorithm. The default values are `b=14` and `r=8`, as defined in the [FineWeb datasets paper](https://arxiv.org/pdf/2406.17557). The x-axis of the graph represents the Jaccard similarity between a pair of documents, while the y-axis represents the probability that they become duplication candidates. Please refer to http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf for more details on this methodology." 
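As a quick numeric aside (not part of the notebook file), the default values give the following candidate probabilities:

```python
# f(s): probability that a pair of documents with Jaccard similarity s
# becomes a candidate duplicate pair, with r=8 minhashes per band, b=14 bands.
r, b = 8, 14

def f(s: float) -> float:
    return 1 - (1 - s**r) ** b

print(round(f(0.75), 2))  # ~0.77: pairs at the 0.75 threshold are usually caught
print(round(f(0.50), 2))  # ~0.05: dissimilar pairs rarely become candidates
```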
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "800bc113-8b5e-4cec-8717-98fa05753bd0", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Define the parameterized function\n", + "def f(s, r, b):\n", + " return 1 - (1 - s**r)**b\n", + "\n", + "# Set the parameters r and b\n", + "r = 8\n", + "b = 14\n", + "\n", + "# Generate values for s in a range, e.g., from 0 to 1\n", + "s_values = np.linspace(0, 1, 500) # 500 points between 0 and 1\n", + "f_values = f(s_values, r, b)\n", + "\n", + "# Plot the function\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(s_values, f_values, label=fr\"$f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\", color='blue')\n", + "plt.xlabel(\"s\")\n", + "plt.ylabel(\"f(s)\")\n", + "plt.title(f\"Plot of the function $f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\")\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98016b04-b6a0-465d-b65b-6d402978c9f0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/utils/requirements.txt b/transforms/universal/fdedup/utils/requirements.txt new file mode 100644 index 000000000..ce2acfefb --- /dev/null +++ b/transforms/universal/fdedup/utils/requirements.txt @@ -0,0 +1,3 @@ +jupyter +numpy +matplotlib From 2f61be7938d7540a0a1831e85b8a961bef24d35c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 15:37:32 -0500 Subject: [PATCH 071/105] Set correct version for pyproject Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/pyproject.toml | 6 +++--- transforms/universal/fdedup/ray/Makefile | 2 +- transforms/universal/fdedup/ray/pyproject.toml | 2 +- transforms/universal/fdedup/spark/Makefile | 2 +- transforms/universal/fdedup/spark/pyproject.toml | 8 ++++---- transforms/universal/fdedup/utils/Makefile | 2 ++ 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index f46c8e8c4..dd58d41d4 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.2.2.dev1" -requires-python = ">=3.10" +version = "0.2.2.dev2" +requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -10,7 +10,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev1", + "data-prep-toolkit==0.2.2.dev2", "pyarrow==16.1.0", "pyyaml>=6.0.2", "boto3>=1.34.69", diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index f5f06c3c3..ec193b6c3 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ b/transforms/universal/fdedup/ray/Makefile @@ -43,7 +43,7 @@ setup:: .transforms.setup # TRANSFORM_PYTHON_VERSION has no effect since 
requirements do not specify a python transform implementation set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index b24886ad9..037525126 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] dependencies = [ "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_fdedup_transform_python==0.2.2.dev1", + "dpk_fdedup_transform_python==0.2.2.dev2", "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile index 7eb132fbd..ac2735e7d 100644 --- a/transforms/universal/fdedup/spark/Makefile +++ b/transforms/universal/fdedup/spark/Makefile @@ -36,7 +36,7 @@ publish: publish-image publish-image:: .transforms.publish-image-spark set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index 548f350c0..cc66fc044 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.2.2.dev1" -requires-python = ">=3.10" +version = "0.2.2.dev2" +requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -10,8 +10,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk_fdedup_transform_python==0.2.2.dev1", - "data-prep-toolkit-spark==0.2.2.dev1", + "dpk_fdedup_transform_python==0.2.2.dev2", + "data-prep-toolkit-spark==0.2.2.dev2", ] [project.optional-dependencies] diff --git a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile index dae3f30ea..d9dae01d7 100644 --- a/transforms/universal/fdedup/utils/Makefile +++ b/transforms/universal/fdedup/utils/Makefile @@ -14,3 +14,5 @@ venv: requirements.txt $(PIP) install --upgrade pip; \ $(PIP) install -r requirements.txt; \ fi +set-versions: + @: \ No newline at end of file From cd5eb05f82d1145a620a03d0094aac96846d5d55 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Wed, 13 Nov 2024 15:45:37 -0500 Subject: [PATCH 072/105] Change the name of the utils Makefile Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/utils/{Makefile => Makefile.local} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename transforms/universal/fdedup/utils/{Makefile => Makefile.local} (100%) diff --git a/transforms/universal/fdedup/utils/Makefile b/transforms/universal/fdedup/utils/Makefile.local similarity index 100% rename from transforms/universal/fdedup/utils/Makefile rename to transforms/universal/fdedup/utils/Makefile.local From 6cc18cd8eaba2fb12a31f49af52aba188a9f6ac4 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 08:36:45 -0500 Subject: [PATCH 073/105] Copy whl file to the context folder Signed-off-by: 
Constantin M Adam
---
 transforms/universal/fdedup/python/Dockerfile |  5 +++--
 transforms/universal/fdedup/spark/Dockerfile  | 19 +++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile
index f8c41791e..a6724e6e7 100644
--- a/transforms/universal/fdedup/python/Dockerfile
+++ b/transforms/universal/fdedup/python/Dockerfile
@@ -4,6 +4,7 @@ RUN pip install --upgrade --no-cache-dir pip

 # install pytest
 RUN pip install --no-cache-dir pytest
+ARG DPK_WHEEL_FILE_NAME

 # Create a user and use it to run the transform
 RUN useradd -ms /bin/bash dpk
 WORKDIR /home/dpk
@@ -12,8 +13,8 @@ WORKDIR /home/dpk

 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
-RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
+COPY --chown=dpk:root data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

 COPY --chown=dpk:root src/ src/
 COPY --chown=dpk:root pyproject.toml pyproject.toml
diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile
index a36a7cef7..772dfef79 100644
--- a/transforms/universal/fdedup/spark/Dockerfile
+++ b/transforms/universal/fdedup/spark/Dockerfile
@@ -1,35 +1,34 @@
 ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0
-
 FROM ${BASE_IMAGE}

-# USER root
 # install pytest
 RUN pip install --no-cache-dir pytest
+ARG DPK_WHEEL_FILE_NAME

 WORKDIR ${SPARK_HOME}/work-dir

 # Copy in the data processing framework source/project and install it
 # This is expected to be placed in the docker context before this is run (see the make image).
-COPY --chown=spark:root data-processing-lib-python/ data-processing-lib-python/
-RUN cd data-processing-lib-python && pip install --no-cache-dir -e .
-COPY --chown=spark:root data-processing-lib-spark/ data-processing-lib-spark/
-RUN cd data-processing-lib-spark && pip install --no-cache-dir -e .
+COPY --chown=spark:root data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]
+
+## Copy the python version of the transform
 COPY --chown=spark:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .

-# Install project source
+# Install spark project source
 COPY --chown=spark:root src/ src/
 COPY --chown=spark:root pyproject.toml pyproject.toml
+COPY --chown=spark:root README.md README.md

 RUN mkdir -p /opt/spark/work-dir/src/templates && \
     mkdir -p /opt/spark/work-dir/config
+COPY --chown=spark:root deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/
+COPY --chown=spark:root deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/

 # install requirements from requirements.txt
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt
-COPY deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/
-COPY deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/

 RUN pip install --no-cache-dir -e .
# copy the main() entry point to the image From 9f336203571b07e8486292793599406b87abf830 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 08:38:49 -0500 Subject: [PATCH 074/105] Use keyword args in compute_common_params Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 139a0f919..0a0a4d9bf 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -221,10 +221,10 @@ def fuzzydedup( with dsl.ExitHandler(clean_up_task): # compute execution params compute_common_exec_params = compute_common_params_op( - ray_worker_options, - data_s3_config, - fdedup_num_permutations, - fdedup_n_samples, + worker_options=ray_worker_options, + data_s3_config=data_s3_config, + num_permutations=fdedup_num_permutations, + n_samples=fdedup_n_samples, ) ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) From 528457c5cc91dad1439c72258be92e8030f45015 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 10:42:20 -0500 Subject: [PATCH 075/105] Use dynamic dependencies Signed-off-by: Constantin M Adam --- data-processing-lib/spark/pyproject.toml | 55 ------------------- transforms/universal/fdedup/python/Dockerfile | 2 +- .../universal/fdedup/python/pyproject.toml | 16 +----- .../universal/fdedup/python/requirements.txt | 10 ++++ transforms/universal/fdedup/ray/Dockerfile | 1 + .../universal/fdedup/ray/pyproject.toml | 11 +--- .../universal/fdedup/ray/requirements.txt | 6 ++ .../universal/fdedup/spark/pyproject.toml | 11 ++-- .../universal/fdedup/spark/requirements.txt | 3 +- 9 files changed, 33 insertions(+), 82 deletions(-) delete mode 100644 data-processing-lib/spark/pyproject.toml create mode 100644 transforms/universal/fdedup/python/requirements.txt create mode 100644 transforms/universal/fdedup/ray/requirements.txt diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml deleted file mode 100644 index 89b4d9bf8..000000000 --- a/data-processing-lib/spark/pyproject.toml +++ /dev/null @@ -1,55 +0,0 @@ -[project] -name = "data_prep_toolkit_spark" -version = "0.2.2.dev2" -keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -requires-python = ">=3.10,<3.13" -description = "Data Preparation Toolkit Library for Spark" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyspark>=3.5.2", - "psutil>=6.0.0", - "PyYAML>=6.0.2" -] - -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", 
- "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/data_processing_spark"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index a6724e6e7..280063863 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -19,7 +19,7 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md -#COPY --chown=dpk:root requirements.txt requirements.txt +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index dd58d41d4..97be33d54 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -9,23 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyarrow==16.1.0", - "pyyaml>=6.0.2", - "boto3>=1.34.69", - "kubernetes>=30.1.0", - "polars==1.9.0", - "disjoint-set>=0.8.0", - "scipy>=1.14.1, <2.0.0", - "numpy<1.29.0", - "sentencepiece>=0.2.0", - "mmh3>=4.1.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..4e69a72e4 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +data-prep-toolkit==0.2.2.dev2 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +scipy>=1.14.1, <2.0.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index af32f0fb3..71287ced7 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -20,6 +20,7 @@ RUN cd python-transform && pip install --no-cache-dir -e . COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md +COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source files needed by test-image diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 037525126..cb8c6306a 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -9,18 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_fdedup_transform_python==0.2.2.dev2", - "mmh3>=4.1.0", - "xxhash==3.4.1", - "tqdm==4.66.3", - "scipy>=1.12.0, <2.0.0" -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt new file mode 100644 index 000000000..6ee40ef7f --- /dev/null +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit[ray]==0.2.2.dev2 +dpk_fdedup_transform_python==0.2.2.dev2 +mmh3>=4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index cc66fc044..f77df2010 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -9,10 +9,13 @@ authors = [ { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "dpk_fdedup_transform_python==0.2.2.dev2", - "data-prep-toolkit-spark==0.2.2.dev2", -] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index 576c028a8..c373ffbb7 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,4 +1,5 @@ -pyarrow==16.1.0 +dpk_fdedup_transform_python==0.2.2.dev2 +data-prep-toolkit[spark]==0.2.2.dev2 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 From fffb6305e7dbd018c343fde736b396db18a3d3d3 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 12:38:00 -0500 Subject: [PATCH 076/105] Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 0a0a4d9bf..fabc4e084 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -279,10 +279,12 @@ def fuzzydedup( prefix="scdata", ) ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component( - execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" - ) + # FIXME: see 
https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) execute_signature_calc_job.after(compute_signature_calc_exec_params) # Get the parameters for the cluster analysis job @@ -311,7 +313,9 @@ def fuzzydedup( server_url=server_url, ) ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( @@ -338,7 +342,9 @@ def fuzzydedup( server_url=server_url, ) ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( @@ -368,10 +374,12 @@ def fuzzydedup( prefix="dcdata", ) ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) - ComponentUtils.set_s3_env_vars_to_component( - execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" - ) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") == "1": + ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" + ) execute_data_cleaning_job.after(compute_data_cleaning_exec_params) From 5547d7fb574b8ebe2f8a98d6656f16faf9537808 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 13:03:02 -0500 Subject: [PATCH 077/105] Add FIXME for https://github.com/kubeflow/pipelines/issues/10914 Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index fabc4e084..683f93210 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -280,7 +280,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) ComponentUtils.set_s3_env_vars_to_component( execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" @@ -314,7 +314,7 @@ def fuzzydedup( ) 
ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) @@ -343,7 +343,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) @@ -375,7 +375,7 @@ def fuzzydedup( ) ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 - if os.getenv("KFPv2", "0") == "1": + if os.getenv("KFPv2", "0") != "1": ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) ComponentUtils.set_s3_env_vars_to_component( execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" From 09e56e05dea66de01a023c53978a23497723b698 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 14 Nov 2024 13:06:24 -0500 Subject: [PATCH 078/105] Remove pyproject.toml dependencies Signed-off-by: Constantin M Adam --- data-processing-lib/spark/Makefile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/data-processing-lib/spark/Makefile b/data-processing-lib/spark/Makefile index d4769187b..5fde2bb07 100644 --- a/data-processing-lib/spark/Makefile +++ b/data-processing-lib/spark/Makefile @@ -11,9 +11,14 @@ setup:: set-versions: .check-env $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml - sed -e 's/"pyspark...*",/"pyspark>=${SPARK_VERSION}",/' \ - pyproject.toml > tt.toml - mv tt.toml pyproject.toml + if [ -e pyproject.toml ]; then \ + cat pyproject.toml | sed -e 's/"spark[default]==.*",/"spark[default]==$(SPARK_VERSION)",/' > tt.toml; \ + mv tt.toml pyproject.toml; \ + fi + if [ -e requirements.txt ]; then \ + cat requirements.txt | sed -e 's/ray[default]==.*/ray[default]==$(SPARK_VERSION)/' > tt.txt; \ + mv tt.txt requirements.txt; \ + fi build:: build-dist @@ -26,7 +31,7 @@ publish-dist :: .check-env .defaults.publish-dist publish-image:: .defaults.publish-image -venv:: pyproject.toml +venv:: $(MAKE) .defaults.spark-lib-src-venv pip install pytest pytest-cov From d3eac50704aa8bf032f212a0604430a3f0764cc2 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:24:30 -0500 Subject: [PATCH 079/105] Fix bug in number of actors calculation Signed-off-by: Constantin M Adam --- .../fdedup/kfp_ray/src/fdedup_compute_execution_params.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 65b7ac2f6..cd3a58b99 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -57,16 +57,18 @@ def compute_common_params( # Also, to keep S3 utilization in check, limit the number of actors to 2000 num_nodes = worker_options["replicas"] cpu_per_node = worker_options["cpu"] - 1 - 
memory_per_node = 0.85 * worker_options["memory"] + memory_per_node = worker_options["memory"] memory_per_actor = 16 # GB max_num_actors = 2000 num_actors_per_node: int = int(memory_per_node / memory_per_actor) if num_actors_per_node == 0: num_actors_per_node = 1 - num_actors = num_nodes * num_actors_per_node + # never run actors on the head node, so (n - 1) nodes to run actors + num_actors = (num_nodes - 1) * num_actors_per_node + while num_actors > max_num_actors: - num_actors -= num_nodes + num_actors -= num_nodes - 1 num_actors_per_node -= 1 print(f"Number of actors per node = {num_actors_per_node}") cpus_per_actor = cpu_per_node / num_actors_per_node From fa5959b5f90ce90e97a52288be6aee18c06b9068 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:28:39 -0500 Subject: [PATCH 080/105] Cleanup main entry point and local implementation of python transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 4 ++-- .../python/src/cluster_analysis_local_python.py | 5 +++-- .../python/src/cluster_analysis_transform.py | 10 +++++----- .../python/src/data_cleaning_local_python.py | 12 ++++++++---- ...dup_python.py => fdedup_transform_python.py} | 0 ...get_duplicate_list_transform_local_python.py | 6 ++++-- .../python/src/signature_calc_local_python.py | 17 +---------------- 7 files changed, 23 insertions(+), 31 deletions(-) rename transforms/universal/fdedup/python/src/{fuzzy_dedup_python.py => fdedup_transform_python.py} (100%) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index 280063863..071478870 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -27,8 +27,8 @@ RUN pip install --no-cache-dir -e . 
COPY src/ src/ # copy source data -COPY ./src/signature_calc_transform_python.py fdedup_transform_python.py -COPY ./src/signature_calc_local_python.py local/ +COPY ./src/fdedup_transform_python.py fdedup_transform_python.py +COPY ./src/fdedup_transform_python.py local/ # copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py index 915cdcd1e..bb785021c 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -21,7 +21,9 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, @@ -42,7 +44,6 @@ if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) # Launch python to process the input diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 412fc1fa8..a9822babe 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -140,7 +140,7 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str # Generate the docs_to_remove dataframe docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) - self.logger.info(f"{len(docs_to_remove_dataframe)} documents marked to remove") + self.logger.debug(f"{len(docs_to_remove_dataframe)} documents marked to remove") metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} return [(output_data, output_path)], metadata @@ -187,8 +187,8 @@ def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFra max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") - self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + self.logger.debug(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") cluster_stats = { "groupby_clusters": num_clusters, "cluster_duplicate_docs": sum_cdocs, @@ -226,8 +226,8 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An max_cdocs = 0 min_cdocs = 0 avg_cdocs = 0 - self.logger.info(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") - self.logger.info(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + self.logger.debug(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") jaccard_stats = { "jaccard_clusters": num_clusters, "jaccard_duplicate_docs": sum_cdocs, diff --git 
a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py index 4295e4e82..aa4aabb90 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -23,15 +23,20 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } duplicate_location = os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", ) ) code_location = {"github": "github", "commit_hash": "12345", "path": "path"} @@ -49,7 +54,6 @@ if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) # create launcher launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) # Launch the ray actor(s) to process the input diff --git a/transforms/universal/fdedup/python/src/fuzzy_dedup_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py similarity index 100% rename from transforms/universal/fdedup/python/src/fuzzy_dedup_python.py rename to transforms/universal/fdedup/python/src/fdedup_transform_python.py diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py index be90b3073..34b18ab04 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -21,8 +21,10 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected/cluster_analysis")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "expected")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "cluster_analysis") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py index 2800c70cd..be395ed4d 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -23,18 +23,9 @@ # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "test_scdata")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = {"input_folder": input_folder, "output_folder": output_folder} 
code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), -} -s3_config = { - "input_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_data/", - "output_folder": "s3://cos-optimal-llm-pile/spark_test/fuzzy_dedup_test_output_data/s3_test_3/", -} params = { # Data access. Only required parameters are specified @@ -47,18 +38,12 @@ "minhash_num_permutations": 112, "minhash_num_bands": 14, "minhash_num_segments": 2, - # "scdata_s3_cred": ParamsUtils.convert_to_ast(s3_creds), - # "scdata_s3_config": ParamsUtils.convert_to_ast(s3_config), } if __name__ == "__main__": # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) - print(sys.argv) - - sys.argv.append("--data_s3_cred") - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) # create launcher launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) From c4f889b37e165e9c0f6243e7cf47d19b1185c521 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:30:40 -0500 Subject: [PATCH 081/105] Cleanup main entry point and local implementation of ray transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/ray/Dockerfile | 2 +- .../fdedup/ray/src/cluster_analysis_local_ray.py | 4 +++- .../fdedup/ray/src/data_cleaning_local_ray.py | 11 ++++++++--- .../{fuzzy_dedup_ray.py => fdedup_transform_ray.py} | 2 +- .../fdedup/ray/src/signature_calc_local_ray.py | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) rename transforms/universal/fdedup/ray/src/{fuzzy_dedup_ray.py => fdedup_transform_ray.py} (97%) diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 71287ced7..4bfe32a9e 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -24,7 +24,7 @@ COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source files needed by test-image -COPY --chown=ray:users ./src/signature_calc_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/fdedup_transform_ray.py fdedup_transform_ray.py COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py index c078746ce..c54ba85c2 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -19,7 +19,9 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "bands")) +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) local_conf = { "input_folder": input_folder, diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index 54fa2ccac..b951e2fc8 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -23,15 +23,20 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "cleaned")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } duplicate_location = os.path.abspath( os.path.join( - os.path.dirname(__file__), "..", "output", "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", ) ) worker_options = {"num_cpus": 0.8} diff --git a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py similarity index 97% rename from transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py rename to transforms/universal/fdedup/ray/src/fdedup_transform_ray.py index 987369714..be1bf5fcb 100644 --- a/transforms/universal/fdedup/ray/src/fuzzy_dedup_ray.py +++ b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py @@ -19,7 +19,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from fdedup_transform_python import ServiceOrchestrator, parse_args from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py index 64f492584..cb87b56af 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py +++ 
b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -19,7 +19,7 @@ # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "data_1")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, From f3c5be0c276c228710d753b377d539aba634f95c Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:32:18 -0500 Subject: [PATCH 082/105] Cleanup main entry point and local implementation of spark transforms Signed-off-by: Constantin M Adam --- ...ark.py => cluster_analysis_local_spark.py} | 30 +++++++++++---- ..._spark.py => data_cleaning_local_spark.py} | 38 +++++++++++++++---- ...dup_spark.py => fdedup_transform_spark.py} | 2 +- ...spark.py => signature_calc_local_spark.py} | 29 ++++++++++---- 4 files changed, 77 insertions(+), 22 deletions(-) rename transforms/universal/fdedup/spark/src/{cluster_analysis_spark.py => cluster_analysis_local_spark.py} (54%) rename transforms/universal/fdedup/spark/src/{data_cleaning_spark.py => data_cleaning_local_spark.py} (50%) rename transforms/universal/fdedup/spark/src/{fuzzy_dedup_spark.py => fdedup_transform_spark.py} (97%) rename transforms/universal/fdedup/spark/src/{signature_calc_spark.py => signature_calc_local_spark.py} (56%) diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py similarity index 54% rename from transforms/universal/fdedup/spark/src/cluster_analysis_spark.py rename to transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py index 83498f59e..c9950657c 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_spark.py +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py @@ -19,14 +19,30 @@ from data_processing_spark.runtime.spark import SparkTransformLauncher +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) # Launch the spark worker(s) to process the input diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py similarity index 50% rename from transforms/universal/fdedup/spark/src/data_cleaning_spark.py rename to transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py index 7b6bd626d..9c14c67d8 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -19,14 +19,38 @@ from data_processing_spark.runtime.spark import SparkTransformLauncher +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) # Launch the spark worker(s) to process the input diff --git a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py similarity index 97% rename from transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py rename to transforms/universal/fdedup/spark/src/fdedup_transform_spark.py index 58688de42..82767f849 100644 --- a/transforms/universal/fdedup/spark/src/fuzzy_dedup_spark.py +++ b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py @@ -18,7 +18,7 @@ from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing_spark.runtime.spark import SparkTransformLauncher -from fuzzy_dedup_python import ServiceOrchestrator, parse_args +from fdedup_transform_python import ServiceOrchestrator, parse_args from get_duplicate_list_transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/spark/src/signature_calc_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py similarity index 56% rename from transforms/universal/fdedup/spark/src/signature_calc_spark.py rename to transforms/universal/fdedup/spark/src/signature_calc_local_spark.py index 0e7046549..2db884346 100644 --- a/transforms/universal/fdedup/spark/src/signature_calc_spark.py +++ b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py @@ -21,14 +21,29 @@ ) +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + if __name__ == "__main__": - sys.argv.append("--data_s3_cred") - s3_creds = { - "access_key": os.getenv("AWS_ACCESS_KEY_ID"), - "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), - "url": os.getenv("AWS_ENDPOINT_URL"), - } - sys.argv.append(ParamsUtils.convert_to_ast(s3_creds)) + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) # Launch the spark worker(s) to process the input From 4941d5bab37a0bdc1e5873ce8e7288483703751f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Fri, 15 Nov 2024 10:46:43 -0500 Subject: [PATCH 083/105] Cleanup main entry point and local implementation of spark transforms Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/spark/Dockerfile | 6 +----- .../universal/fdedup/spark/src/data_cleaning_local_spark.py | 4 ++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile index 772dfef79..b04994d46 100644 --- a/transforms/universal/fdedup/spark/Dockerfile +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -32,11 +32,7 @@ RUN pip3 install -r requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/signature_calc_spark.py . - -# copy some of the samples in -COPY src/signature_calc_transform_spark.py fdedup_transform_spark.py -COPY src/signature_calc_spark.py local/fdedup_local_spark.py +COPY ./src/fdedup_transform_spark.py . 
# copy test COPY test/ test/ diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py index 9c14c67d8..eb1e61845 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -14,6 +14,10 @@ import sys import polars as pl +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher From 9c82fe0fb9734fb317ad8f18bfd940fe8fe361cb Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 18 Nov 2024 13:58:38 -0500 Subject: [PATCH 084/105] Added documentation for python, ray, spark and kfp Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/README.md | 19 +- transforms/universal/fdedup/kfp_ray/README.md | 14 +- transforms/universal/fdedup/python/README.md | 239 +++++++++++++++++- transforms/universal/fdedup/ray/README.md | 211 ++++------------ transforms/universal/fdedup/spark/README.md | 150 ++++------- 5 files changed, 348 insertions(+), 285 deletions(-) diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index e128566d2..fed3c1370 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -1,10 +1,11 @@ -# Fuzzy Deduplification Transform -The fdedup transforms removes documents that are very similar to each other within a set of parquet files, -per the set of -[transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: +# Fuzzy Deduplication Transform +The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a +set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. +Per the set of [transform project conventions](../../README.md#transform-project-conventions) the following runtimes are available: -* [ray](ray/README.md) - enables the running of the base python transformation -in a Ray runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image -in a kubernetes cluster using a generated `yaml` file. +* [python](python/README.md) - enables running the base transform in a pure python environment +* [ray](ray/README.md) - enables running the base python transform in a Ray runtime +* [spark](spark/README.md) - enables running the base python transform in a spark runtime +* [kfp](kfp_ray/README.md) - enables running the ray docker image in a kubernetes cluster using a generated `yaml` file. + +Please check [here](python/README.md) for a more detailed description of this transform. 
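To make the "highly similar, but not necessarily identical" criterion above concrete, here is a minimal illustrative sketch (not part of the patch) of the word-shingle/Jaccard comparison that fuzzy dedup approximates at scale with minhashes; the shingle size and the sample documents are invented for illustration:

```python
# Illustrative only: fuzzy dedup approximates this pairwise comparison at
# scale using minhashes and band signatures instead of exact shingle sets.

def word_shingles(text: str, k: int = 3) -> set[str]:
    # Build the set of k-word shingles of a document
    words = text.split()
    return {" ".join(words[i : i + k]) for i in range(len(words) - k + 1)}

def jaccard(a: set[str], b: set[str]) -> float:
    # Jaccard similarity: |A & B| / |A | B|
    return len(a & b) / len(a | b) if (a | b) else 1.0

doc1 = "the quick brown fox jumps over the lazy dog"
doc2 = "the quick brown fox leaps over the lazy dog"  # near-duplicate: one word differs
print(jaccard(word_shingles(doc1), word_shingles(doc2)))  # 0.4; identical docs yield 1.0
```

Exact pairwise comparison like this is quadratic in the number of documents, which is why the transform works with fixed-size minhash signatures and band hashes instead.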
diff --git a/transforms/universal/fdedup/kfp_ray/README.md b/transforms/universal/fdedup/kfp_ray/README.md
index 97fd45a69..75eb77a08 100644
--- a/transforms/universal/fdedup/kfp_ray/README.md
+++ b/transforms/universal/fdedup/kfp_ray/README.md
@@ -1,8 +1,8 @@
-# Fuzzy Deduplication Ray-base KubeFlow Pipeline Transformation
+# Fuzzy Deduplication Ray-based KubeFlow Pipeline Transformation

## Summary

-This project allows execution of the [noop Ray transform](../ray) as a
+This project allows execution of the [fuzzy dedup Ray transform](../ray) as a
[KubeFlow Pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/)

The detailed pipeline is presented in the
[Simplest Transform pipeline tutorial](../../../../kfp/doc/simple_transform_pipeline.md)

@@ -16,13 +16,9 @@ make workflow-build
from the directory. It creates a virtual environment (make workflow-venv) and after that compiles the pipeline
definitions in the folder. The virtual environment is created once for all transformers.

-Note: the pipelines definitions can be compiled and executed on KFPv1 and KFPv2. Meantime, KFPv1 is our default. If you
-prefer KFPv2, please do the following:
-```shell
-make clean
-export KFPv2=1
-make workflow-build
-```
+## Considerations
+Currently, fuzzy dedup KFP pipeline definitions can be compiled and executed only on KFPv1. KFPv2 is not
+yet supported, because of this issue: https://github.com/kubeflow/pipelines/issues/10914

The next steps are described in [Deploying a pipeline](../../../../kfp/doc/simple_transform_pipeline.md#deploying-a-pipeline-)
and [Executing pipeline and watching execution results](../../../../kfp/doc/simple_transform_pipeline.md#executing-pipeline-and-watching-execution-results-)
\ No newline at end of file
diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md
index 34f18c73b..d2d940344 100644
--- a/transforms/universal/fdedup/python/README.md
+++ b/transforms/universal/fdedup/python/README.md
@@ -5,7 +5,240 @@ Please see the set of
for details on general project conventions, transform configuration,
testing and IDE set up.

-## Summary
+## Contributors
+- Nelson Bore (kibnelson@gmail.com)
+- Constantin Adam (cmadam@us.ibm.com)

-The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see
-[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details.
\ No newline at end of file
+## Description
+The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a
+set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries.
+
+Fuzzy dedup is a complex process made up of a pipeline that performs four main steps:
+
+1. **Signature Calculation**: creates a set of minhashes for each document, and uses them to create band signatures for
+the document.
+2. **Cluster Analysis**: groups documents into clusters based on matching band signatures. Within each cluster, it
+retains only the documents that have a Jaccard similarity above a specified threshold, and it identifies which documents
+to keep as unique and which ones to mark as duplicates.
+3. **Duplicate List Generation**: combines the similarity clusters identified in each band to create a single, unified
+list of duplicate documents.
+4. **Data Cleaning**: processes the documents by either filtering out duplicates or adding annotations to distinguish
+duplicates from non-duplicates.
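The banding scheme in steps 1 and 2 trades off precision and recall through the number of bands and the number of minhashes per band; the Signature Calculation section below describes the probability function that a notebook in the `utils` folder plots. As a minimal sketch of that standard LSH estimate (see the Mining of Massive Datasets reference cited in this README), the values b = 14 and r = 8 below match the 112-permutation sample configurations used elsewhere in this patch and are only examples:

```python
# LSH banding estimate: probability that two documents with Jaccard
# similarity s share at least one of b band signatures, where each band
# consists of r minhashes. One band matches only if all r minhashes agree.

def candidate_probability(s: float, b: int = 14, r: int = 8) -> float:
    return 1.0 - (1.0 - s**r) ** b  # 1 - P(every band fails to match)

for s in (0.5, 0.7, 0.8, 0.9):
    print(f"similarity {s}: candidate probability {candidate_probability(s):.3f}")
```

Pairs that share at least one band signature only become candidates; the Cluster Analysis step still checks the actual Jaccard similarity against the configured threshold before marking anything as a duplicate.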
+ +Each one of these steps is described in more detail below. + +### Signature Calculation + +This transform computes `num_permutations` minhashes and `num_bands` signatures for each document in the dataset, by +following these processing steps: +1. **Shingle Generation**: create a set of character or word shingles, using a specified window length. Character +shingles are more effective at detecting similar documents, but require more computational resources compared to word +shingles. +2. **Minhash Calculation**: using the shingles as input, compute `num_permutations` minhashes for each document. +3. **Band Signature Calculation**: divide the minhashes into `num_bands`, where each band contains +`num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. + +The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard +similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils) folder generates a graph of this +probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact +the deduplication process. + +To help distribute the workload and speed up processing of the next steps, the hash space of each band is divided into +`num_segments` segments. The band signatures, the minhashes, the document ids, and lengths are stored in an organized +output folder structure `bands/band=b/segment=s`, where `b` is the band number and `s` is the segment number. + +### Cluster Analysis + +This transform leverages segmented processing to analyze the data generated by the **Signature Calculation** step +efficiently and in parallel. Each worker processes a specific segment `s` of a band `b` by loading and analyzing all +Parquet files from the folder `bands/band=b/segment=s`. Each row in the Parquet files contains, for a document: +* `band_hash`, the document's band signature, and +* `data`, a structure with three fields: the unique `document_id`, document's `minhashes`, and `document_size`. + +The transform runs the following processing steps: +1. **Data Loading**: combine into a single dataframe all Parquet files in `bands/band=b/segment=s`. +2. **Clustering**: run a `group_by` operation on the `band_hash` column that will group documents with the same band +signature into clusters. +3. **Similarity Analysis**: for each cluster, calculate Jaccard similarity between pairs of documents using their +minhashes, and move documents below the specified Jaccard similarity threshold into new clusters. +4. **Duplicate Identification**: in clusters with more than one document remaining, retain the largest document with the +smallest document id, and mark as duplicates all other documents in the cluster. +5. **Persist Results**: save the duplicate clusters in a file. + +### Duplicate List Generation + +The **Cluster Analysis** step identifies duplicates across multiple bands, meaning a document can be marked as a +duplicate in one or more bands (e.g., if two documents are identical, one will be marked as a duplicate in all bands). +This transform consolidates all duplicate information from each band segment into a single file, providing a unified +record of duplicates detected across the dataset. + +### Data Cleaning + +This transform processes the original dataset using the list of duplicate documents generated by the **Duplicate List +Generation** step. It imports each file in the original dataset into a table and produces a new dataset. 
The directory +structure of the input dataset is preserved, but the contents of the output files depend on the selected operating mode: +1. **Annotate** - add a new `duplicate` column to the dataset, that contains a `d` for documents marked as duplicates, +and is empty for non-duplicates +2. **Filter duplicates** - removes all documents identified as duplicates from the dataset. +3. **Filter non-duplicates** - removes from the dataset all documents that were not marked as duplicates, leaving only +the duplicates. + +The output dataset reflects the selected mode, providing flexibility for downstream processing. + +## Input Columns Used by This Transform + +| Input Column Name | Data Type | Description | +|---------------------------------------------------------------------|-----------|----------------------------------| +| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | +| Column specified by the _document_id_column_ configuration argument | int64 | Column that stores document ID | + +## Output Columns Annotated by This Transform +| Output Column Name | Data Type | Description | +|------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| duplicate | str | Column added if fuzzy dedup runs in 'annotate' mode. Value is 'd' for duplicate documents, empty for non-duplicates | + +## Configuration and Usage +### Fuzzy Deduplication Transform +The set of dictionary keys holding [Fuzzy Dedup](src/fdedup_transform_python.py) configuration for values are as +follows: +```text +--input_folder INPUT_FOLDER + Input folder path +--output_folder OUTPUT_FOLDER + Output folder path +--operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents +--contents_column CONTENTS_COLUMN + name of the column that stores document text +--document_id_column DOCUMENT_ID_COLUMN + name of the column that stores document ID +--seed SEED seed of the random number generator +--num_permutations NUM_PERMUTATIONS + number of permutations to use for minhash calculation +--num_bands NUM_BANDS + number of bands to use for band hash calculation +--num_minhashes_per_band NUM_MINHASHES_PER_BAND + number of minhashes to use in each band +--word_shingle_size WORD_SHINGLE_SIZE + number of words included in one shingle +--jaccard_similarity_threshold JACCARD_SIMILARITY_THRESHOLD + jaccard similarity threshold above which two documents are similar +--num_segments NUM_SEGMENTS + the number of segments dividing the hashing space for each band (for scalability) +--duplicate_list_location DUPLICATE_LIST_LOCATION + path to the file with all the duplicate document ids +--services SERVICES Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning) +--use_s3 USE_S3 use s3 +--s3_cred S3_CRED ast string of options for s3 credentials +--shingle_option SHINGLE_OPTION + Option used for shingling + +``` + +### Signature Calculation Transform +The set of dictionary keys holding [SignatureCalcTransform](src/signature_calc_transform.py) configuration for values +are as follows: +```text +--minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document +--minhash_contents_column MINHASH_CONTENTS_COLUMN + name of the column storing the contents of each document +--minhash_seed 
MINHASH_SEED + the seed used to instantiate the random number generator +--minhash_num_permutations MINHASH_NUM_PERMUTATIONS + number of permutations (minhashes) calculated for each document +--minhash_word_shingle_size MINHASH_WORD_SHINGLE_SIZE + the size of the word shingles calculated for each document +--minhash_num_bands MINHASH_NUM_BANDS + the number of bands to use in the banding technique +--minhash_num_minhashes_per_band MINHASH_NUM_MINHASHES_PER_BAND + the number of minhashes to use in each band +--minhash_num_segments MINHASH_NUM_SEGMENTS + the number of segments across which we divide the hashing space for each band +--minhash_shingle_option MINHASH_SHINGLE_OPTION + Shingling option ('word' or 'char') +``` + +### Cluster Analysis Transform +The set of dictionary keys holding [ClusterAnalysisTransform](src/cluster_analysis_transform.py) configuration for values +are as follows: +```text +--cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD + Jaccard similarity threshold above which two documents are duplicates +--cluster_num_bands CLUSTER_NUM_BANDS + The number of bands used in the banding technique +--cluster_num_segments CLUSTER_NUM_SEGMENTS + The number of segments dividing the hashing space for each band +``` + +### Get Duplicates List Transform +This transform currently has no configuration parameters. + +### Data Cleaning Transform +The set of dictionary keys holding [DataCleaningTransform](src/data_cleaning_transform.py) configuration for values +are as follows: +```text + --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document + --fdclean_operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents +``` + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_python.py +``` +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_python.py +python cluster_analysis_local_python.py +python get_duplicate_list_local_python.py +python data_cleaning_local_python.py +``` +After running the transforms, execute: +```shell +ls output +``` +To see results of the transform. + +### Code example + +TBD (link to the notebook will be provided) + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a pure python runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + +## Further Resources +The following is a list of references to research articles and github repositories that inspired the module's design: + +1. 
[Jure Leskovec, Anand Rajaraman, Jeff Ullman, Mining of Massive Datasets, Chapter 3: Finding Similar Items](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) +2. [G Penedo et al., The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/pdf/2406.17557) +3. [Datatrove github repo](https://github.com/huggingface/datatrove) diff --git a/transforms/universal/fdedup/ray/README.md b/transforms/universal/fdedup/ray/README.md index 41be44301..d93be3a4a 100644 --- a/transforms/universal/fdedup/ray/README.md +++ b/transforms/universal/fdedup/ray/README.md @@ -1,185 +1,45 @@ # Fuzzy Dedup -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. +Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +configuration, testing and IDE set up. ## Summary -The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see -[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details. The architecture of the implementation is presented here: +This project wraps the [Fuzzy Dedup transform](../python) with a Ray runtime. -![](images/fuzzy.png) +## Configuration and command line Options -The main components of implementation are driver, processors (implemented as actor pools) - table processor, table -filter and bucket hash processor, and hash actors - minhash, buckets and docs. - -The complication of mapping this model to transform model is the fact that in this model assumes a two pass processing, -while a transform model is a single pass. The solution to this mismatch is to use transform runtime to implement the -first path and use the native transform pipeline to implement filtering. - -## Transform runtime -The [transform runtime](src/fdedup_transform_ray.py) is implementing complete first path of the fuzzy deduping: -* creates bucket and minhash collectors -* implements initial file processing to populate bucket and minhash caches -* creates doc collectors -* implement bucket processing -* Clean up everything except for doc collectors in preparation to filter, that is implemented by the framework proper -The main components of runtime are described below - -### TableProcessor Actor - -[Table processing actor](src/fdedup_transform_ray.py) is implemented following framework itself is implemented as a pair - -`FdedupTransform` implementing the actual transformation and and -[transform table processor](../../../../data-processing-lib/src/data_processing/runtime/ray/transform_table_processor.py) -(from the framework itself). - -### DocsMinHash Actor - -This [actor](src/fdedup_support.py) stores MInHashes - -### BucketsHash Actor - -This actor [actor](src/fdedup_support.py) - -### BucketHashProcessor - -BucketHash [actor](src/fdedup_support.py) implement the actual buckets processing, removing duplicates. -Implementation of this actor allows to better manage this "expensive" process, by using Actor pool load balancing -thus minimizing overall time for this operation. Instead of pre partitioning buckets, it is using dynamic load -partitioning. We also are processing "longest" buckets first thus further improving performance. 
To further improve -the overall performance we can in future implement bucket splitting - its faster to process more smaller buckets -then the long ones - -### BucketHashProcessor - -This [actor](src/fdedup_support.py) is queueing up requests to the `BucketHashProcessor` actor pool, which load -balances their execution - -### DocCollector Actor - -This [actor](src/fdedup_support.py) is a collector for unique documents - -## Transformer - -In the fuzzy dedup implementation, the [transformer](src/fdedup_transform_ray.py) only implements filtering. For every -table, it checks document ids with the `DocumentsCollector` cache and removes all of the rows which do not have ids in -the hash - -## Snapshotting - -Fuzzy dedup often runs on very large data sets and implements three very distinct phases: -* Building buckets -* Processing buckets -* Filtering data -To improve recoverability of fuzzy dedup, current implementation includes snapshotting - at the end of the first two -phases we snapshot the current state of execution - bucket and minhash actors after the first phase and document actors -after the second. This snapshotting provide code with the ability to restart from the existing snapshot. You can use one -of two configuration flags (assuming snapshots exist): -* `use_bucket_snapshot` to start from the second phase -* `use_doc_snapshot` to start from the third phase - -## Building - -A [docker file](Dockerfile) that can be used for building docker image. You can use - -```shell -make build to build it -``` - -### Configuration and command line Options - -The set of dictionary keys holding [BlockListTransform](src/blocklist_transform.py) -configuration for values are as follows: - -* _bucket_cpu_ - specifies number of CPUs for bucket actor -* _doc_cpu_ - specifies number of CPUs for doc actor -* _mhash_cpu_ - specifies number of CPUs for minhash actor -* _num_doc_actors_ - specifies number of doc actors -* _num_bucket_actors_ - specifies number of bucket actors -* _num_minhash_actors_ - specifies number of minhash actors -* _num_preprocessors_ - specifies number of preprocessors -* _num_permutations_ - specifies number of permutations -* _threshold_ - specifies threshold -* _shingles_size_ - specifies shingles size -* _japanese_data_ - specifies whether to use japanese specific document splitting -* _delimiters_ - specifies delimiter for non japanese document splitting -* _snapshot_delay_ - delay between different actors reading/writing snapshot not to overwhelm storage -* -use_bucket_snapshot_ - run from the existing buckets snapshot (bypass building buckets) -* -use_doc_snapshot_ - run from the existing docs snapshot (bypass building and processing buckets) - -Above you see both parameters and their values for small runs (tens of files). We also provide an -[estimate](src/cluster_estimator.py) to roughly determine cluster size for running transformer. +Fuzzy Dedup configuration and command line options are the same as for the base python transform. ## Running - - -### Launched Command Line Options +### Launched Command Line Options When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). 
- -```shell - --fdedup_doc_column FDEDUP_DOC_COLUMN - document column name - --fdedup_id_column FDEDUP_ID_COLUMN - integer document id column name - --fdedup_cluster_column FDEDUP_CLUSTER_COLUMN - cluster column name - --fdedup_bucket_cpu FDEDUP_BUCKET_CPU - number of CPUs per bucket hash - --fdedup_mhash_cpu FDEDUP_MHASH_CPU - number of CPUs per minhash hash - --fdedup_doc_cpu FDEDUP_DOC_CPU - number of CPUs per doc hash - --fdedup_num_doc_actors FDEDUP_NUM_DOC_ACTORS - number of doc actors to use - --fdedup_num_minhash_actors FDEDUP_NUM_MINHASH_ACTORS - number of minhash actors to use - --fdedup_num_bucket_actors FDEDUP_NUM_BUCKET_ACTORS - number of bucket actors to use - --fdedup_num_preprocessors FDEDUP_NUM_PREPROCESSORS - number of preprocessors to use - --fdedup_num_permutations FDEDUP_NUM_PERMUTATIONS - number of permutations - --fdedup_threshold FDEDUP_THRESHOLD - threshold - --fdedup_shingles_size FDEDUP_SHINGLES_SIZE - number of words in shingle - --fdedup_delimiters FDEDUP_DELIMITERS - delimiter for splitting document - --fdedup_snapshot_delay FDEDUP_SNAPSHOT_DELAY - snapshot delay time - --fdedup_use_bucket_snapshot FDEDUP_USE_BUCKET_SNAPSHOT - flag to continue with bucket snapshot - --fdedup_use_doc_snapshot FDEDUP_USE_DOC_SNAPSHOT - flag to continue with doc snapshot - --fdedup_random_delay_limit FDEDUP_RANDOM_DELAY_LIMIT - maximum delay between read -``` - -These correspond to the configuration keys described above. +In addition to those available to the transform as defined in [here](../python/README.md), +the set of +[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. ### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/fdedup_transform_ray.py using command line args -* `run-local-sample` - runs src/fdedup_local_ray.py -* `run-s3-sample` - runs src/fdedup_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. +To run the samples, use the following `make` target to create a virtual environment: -For example, -```shell -make run-cli-sample -... +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_ray.py ``` -Then +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_ray.py +python cluster_analysis_local_ray.py +python get_duplicate_list_local_ray.py +python data_cleaning_local_ray.py +``` +After running the transforms, execute: ```shell ls output ``` @@ -190,3 +50,18 @@ To see results of the transform. To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a ray runtime, use the following `make` targets. 
To launch integration tests
+for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data
+cleaning) use:
+```commandline
+make test-src
+```
+
+To test the creation of the Docker image for the fuzzy dedup transform and the capability to run a local program inside that
+image, use:
+```commandline
+make test-image
+```
\ No newline at end of file
diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md
index 3bf9b3245..dd0294aed 100644
--- a/transforms/universal/fdedup/spark/README.md
+++ b/transforms/universal/fdedup/spark/README.md
@@ -1,109 +1,67 @@
-# Spark-GUF
+# Fuzzy Dedup

-This is an implementation of Spark data processing modules. At a high level, every Spark application consists of a driver program that runs the user’s main function and executes various parallel operations on a cluster.
+Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform
+configuration, testing and IDE set up.

-The modules can run locally or remotely in a Kubernetes cluster.
+## Summary

-## Running Transforms locally
+This project wraps the [Fuzzy Dedup transform](../python) with a Spark runtime.

-Start in the `spark-guf` directory. To run the modules locally, follow these steps:
-1. Create a virtual environment using this command
-   ```
-   make venv
-   ```
-2. Activate the virtual environment:
-   ```
-   source venv/bin/activate
-   ```
+## Configuration and command line Options

-3. Set the `PYTHONPATH` environment variable to include the `src` directory:
-   ```
-   export PYTHONPATH=${PYTHONPATH}:${PWD}/src
-   ```
-4. Invoke one of the transforms:
-   ```
-   python src/transforms/spark_pi/spark_transformer_pi.py
-   ```
-5. To find out which arguments a transform takes, run that transform with a `--help` flag:
-   ```
-   python src/transforms/spark_filter/spark_filter_transform.py --help
-   usage: spark_filter_transform.py [-h] --input_folder INPUT_FOLDER --output_folder OUTPUT_FOLDER [--data_type DATA_TYPE]
-                                    --filter_criteria_list FILTER_CRITERIA_LIST [--filter_columns_to_drop FILTER_COLUMNS_TO_DROP]
-                                    [--filter_logical_operator {AND,OR}]
+Fuzzy Dedup configuration and command line options are the same as for the base python transform.

-   optional arguments:
-     -h, --help            show this help message and exit
-     --input_folder INPUT_FOLDER
-                           path to read the input files (local fs or s3)
-     --output_folder OUTPUT_FOLDER
-                           path to write the output files (local fs or s3)
-     --data_type DATA_TYPE
-                           Type of files to filter (parquet, orc, csv, json, txt)
-     --filter_criteria_list FILTER_CRITERIA_LIST
-                           list of filter criteria (in SQL WHERE clause format), for example: [ "docq_total_words > 100 AND docq_total_words < 200", "docq_perplex_score < 230", "date_acquired BETWEEN '2023-07-04'
-                           AND '2023-07-08'", "title LIKE 'https://%'", "document_id IN ('doc-id-1', 'doc-id-2', 'doc-id-3')" ]
-     --filter_columns_to_drop FILTER_COLUMNS_TO_DROP
-                           list of columns to drop after filtering, for example: ["column1", "column2"]
-     --filter_logical_operator {AND,OR}
-                           logical operator (AND or OR) that joins filter criteria
-   ```
+## Running
+### Launched Command Line Options
+When running the transform with the Spark launcher (i.e. TransformLauncher),
+the command line options available are those defined [here](../python/README.md) for the transform,
+together with the set of
+[Spark launcher options](../../../../data-processing-lib/doc/spark-launcher-options.md).
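+
+For example, once the virtual environment described below has been created and activated, an invocation might
+look like this (an illustrative sketch only: the paths are placeholders, and the top-level arguments are assumed
+to mirror those of the base python orchestrator, such as `--num_permutations`):
+```commandline
+cd src
+python fdedup_transform_spark.py \
+    --input_folder ../test-data/input \
+    --output_folder ../output \
+    --contents_column contents \
+    --num_permutations 112
+```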
-## Running Transforms in Kubernetes/OpenShift
+### Running the samples
+To run the samples, use the following `make` target to create a virtual environment:

-Start in the `spark-guf` directory. To run the transforms in a Kubernetes or OpenShift cluster, follow these steps:
+```commandline
+make venv
+```
+Subsequently, the main orchestration program can be run with:
+```commandline
+source venv/bin/activate
+cd src
+python fdedup_transform_spark.py
+```
+Alternatively, the transforms included in fuzzy dedup can be launched independently:
+```commandline
+source venv/bin/activate
+cd src
+python signature_calc_local_spark.py
+python cluster_analysis_local_spark.py
+python get_duplicate_list_local_spark.py
+python data_cleaning_local_spark.py
+```
+After running the transforms, execute:
+```shell
+ls output
+```
+to see the results of the transform.

-1. Build and push a pyspark base docker image (this example assumes that images are pushed to the Docker hub, but same approach can be used to push images to icr.io, or quai.io:
-   ```
-   docker build -t my-docker-username/my-pyspark:3.5.1 .
-   docker push my-docker-username/my-pyspark:3.5.1
-   ```
-2. Build and push a specific transform image (this will use the pyspark built in the previous point as the base image):
-   ```
-   docker build -t my-docker-username/my-pyspark-filter:3.5.1 -f src/transforms/spark_filter/Dockerfile --build-arg BASE_IMAGE=my-docker-username/my-pyspark:3.5.1 .
-   docker push my-docker-username/my-pyspark-filter:3.5.1
-   ```
+### Transforming data using the transform image

-3. Configure the `spark` service account (note that you can use any other service account name, but you will need then to replace `spark` with `your-service-account-name` in all the yaml files listed below). This is a one-time process to perform for each namespace where you want to run spark apps:
-   ```
-   # create 'spark' service account
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-serviceaccount.yaml --namespace=my-namespace
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.

-   # create 'spark' role
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role.yaml --namespace=my-namespace
+## Testing

-   # bind the 'spark' service account to the 'spark' role
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-role-binding.yaml --namespace=my-namespace
+For testing fuzzy deduplication in a Spark runtime, use the following `make` targets. To launch integration tests
+for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data
+cleaning) use:
+```commandline
+make test-src
+```

-   # bind the 'spark' service account to the cluster roles
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-edit-role-binding.yaml --namespace=my-namespace
-   kubectl apply -f deployment/kubernetes/spark_sa_rb/spark-cluster-role-binding.yaml --namespace=my-namespace
-   ```
-
- 4. Create any secrets that are needed to access S3 folders used for input or output of the transforms. Follow [this link](https://github.com/aws-samples/machine-learning-using-k8s/blob/master/docs/aws-creds-secret.md) for more information on how to build the S3 secrets.
-
- 5. Edit a pod yaml file from the `deployment/kubernetes/pods` directory.
-    The steps below refer to the [yaml file used to build the filter pod](deployment/kubernetes/pods/spark-driver-pod-filter.yaml):
-    1. Give a name to the pod (`metadata/name`), the container launched inside the pod (`spec/containers/name`), and the Spark application (the `APP_NAME` variable in `spec/containers/env`).
-    2. Specify the namespace where the pod will be created (`metadata/namespace`). Use the same namespace for the `EXECUTOR_NAMESPACE` variable in `spec/containers/env`)
-    3. Specify the command to launch the Spark application (in `spec/containers/args`)
-    4. Specify the image used by the driver (`spec/containers/image` - usually this is the transform image built under point 2).
-    5. Specify the image used by the executors (`EXECUTOR_DOCKER_IMAGE` variable in `spec/containers/env`)
-    6. Specify the service account to use by the driver (`spec/containers/serviceAccount`) and by the executors(the `SERVICE_ACCOUNT` variable in `spec/containers/env`)
-    7. Configure S3:
-        1. Specify the input (`AWS_ENDPOINT_URL_IN`) and output (`AWS_ENDPOINT_URL_OUT`) endpoint URLs.
-        2. Specify the input and out access key ids and secret access keys.
-
-6. Launch the Spark application by creating the driver pod:
-   ```
-   kubectl apply -f deployment/kubernetes/pod/spark-driver-pod-filter.yaml
-   ```
-
-7. Monitor the creation of the executor pods:
-   ```
-   kubectl get pods -w
-   ```
-
-8. Monitor the driver logs:
-   ```
-   kubectl logs spark-driver-pod-filter -f
-   ```
-   ```
+To test the creation of the Docker image for the fuzzy dedup transform and the capability to run a local program inside that
+image, use:
+```commandline
+make test-image
+```
\ No newline at end of file
From ed4e9c1f8cfb77084d095d99200b68355cc059f4 Mon Sep 17 00:00:00 2001
From: Shahrokh Daijavad
Date: Mon, 18 Nov 2024 16:40:37 -0800
Subject: [PATCH 085/105] Update README.md

utils folder is one level up from the python folder
---
 transforms/universal/fdedup/python/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md
index d2d940344..295862221 100644
--- a/transforms/universal/fdedup/python/README.md
+++ b/transforms/universal/fdedup/python/README.md
@@ -39,7 +39,7 @@ shingles.
 `num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. The values for
 `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard
-similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils) folder generates a graph of this
+similarity will be marked as duplicates. A Jupyter notebook in the [utils](../utils) folder generates a graph of this
 probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact
 the deduplication process.
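+
+As a quick illustration (a minimal sketch of that probability function, not code from this repository), the
+probability that two documents with Jaccard similarity `s` share at least one band signature is
+`1 - (1 - s**r)**b`, where `b` is `num_bands` and `r` is `num_minhashes_per_band`:
+
+```python
+def duplicate_probability(s: float, b: int, r: int) -> float:
+    # A band matches only when all r of its minhashes agree, which happens with probability s**r;
+    # the pair becomes a duplicate candidate when at least one of the b bands matches.
+    return 1.0 - (1.0 - s**r) ** b
+
+# For example, with b=14 bands of r=8 minhashes each (112 permutations in total):
+print(duplicate_probability(0.8, 14, 8))  # ~0.92: near-duplicates are very likely flagged
+print(duplicate_probability(0.5, 14, 8))  # ~0.05: dissimilar documents rarely collide
+```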
From fb5601a7eefa66236b9d2b42edbebc476b509606 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Tue, 19 Nov 2024 14:28:42 -0500 Subject: [PATCH 086/105] Code cleanup and bug fixes Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/python/Dockerfile | 3 -- .../python/src/cluster_analysis_transform.py | 43 ++++++++--------- .../python/src/data_cleaning_transform.py | 12 ++--- .../python/src/fdedup_transform_python.py | 29 +++++++++-- .../src/get_duplicate_list_transform.py | 23 +++------ .../python/src/signature_calc_transform.py | 48 +++++++++---------- .../src/signature_calc_transform_python.py | 2 +- 7 files changed, 82 insertions(+), 78 deletions(-) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile index 071478870..79c85e4ac 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/python/Dockerfile @@ -23,9 +23,6 @@ COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . -# copy source data -COPY src/ src/ - # copy source data COPY ./src/fdedup_transform_python.py fdedup_transform_python.py COPY ./src/fdedup_transform_python.py local/ diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index a9822babe..16febc0dc 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -13,13 +13,17 @@ import os import re from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple +from typing import Any, List import numpy as np import polars as pl -import pyarrow as pa from data_processing.transform import AbstractFolderTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger +from data_processing.utils import ( + CLIArgumentProvider, + TransformUtils, + UnrecoverableException, + get_logger, +) from Murmur_MH import Murmur_MH @@ -86,7 +90,7 @@ class ClusterAnalysisTransform(AbstractFolderTransform): to keep (the largest size document), and mark the other documents as duplicates. The resulting clusters are saved in a file for further analysis. 
- Args: + The following internal variables are initialized from the config parameter: num_bands: number of bands used in the banding technique jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates num_segments: the number of segments dividing the hashing space for each band @@ -106,12 +110,14 @@ def __init__(self, config: dict[str, Any]): ) self.sort_output = config.get(sort_output_key, sort_output_default) self.data_access = config.get("data_access") + if self.data_access is None: + raise UnrecoverableException("Could not get a pointer to the data access object inside the transform.") self.logger = get_logger(__name__) def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: self.logger.info(f"Cluster analysis for folder {folder_name}") metadata = {} - input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( path=input_folder, extensions=[".parquet"], @@ -125,17 +131,17 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str segment = int(match.group(2)) else: raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") - output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_folder = TransformUtils.clean_path(self.data_access.output_folder) output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") # consolidate into a single data frame band hashes computed by workers - band_segment_dataframe, consolidation_stats = self.consolidate_band_segment_files(files) + band_segment_dataframe, consolidation_stats = self._consolidate_band_segment_files(files) metadata |= consolidation_stats # cluster grouping by band hashes - cluster_dataframe, cluster_stats = self.get_clusters(band_segment_dataframe) + cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe) metadata |= cluster_stats # cluster analysis using jaccard similarity - jaccard_cluster_dataframe, jaccard_stats = self.analyze_clusters(cluster_dataframe) + jaccard_cluster_dataframe, jaccard_stats = self._analyze_clusters(cluster_dataframe) metadata |= jaccard_stats # Generate the docs_to_remove dataframe docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") @@ -144,14 +150,7 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} return [(output_data, output_path)], metadata - def sanitize_folder_name(self, folder_name: str) -> str: - if "://" in folder_name: - _, folder_name = folder_name.split("://") - if folder_name[-1] != "/": - folder_name = f"{folder_name}/" - return folder_name - - def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + def _consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: band_segment_dataframe = pl.DataFrame() total_input_rows = 0 for fname, contents in files.items(): @@ -170,7 +169,7 @@ def consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.Da } return band_segment_dataframe, consolidation_stats - def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + def _get_clusters(self, band_segment_dataframe: pl.DataFrame) -> 
tuple[pl.DataFrame, dict[str, Any]]: groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( pl.col("cluster_length") > 1 @@ -195,14 +194,14 @@ def get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFra } return cluster_dataframe, cluster_stats - def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + def _analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: # Define the schema with specific data types schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} doc_ids_lists = [] docs_to_remove_lists = [] len_of_docs2remove_lists = [] for row in df.iter_rows(named=True): - doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self.jaccard_distance_calculation(row) + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self._jaccard_distance_calculation(row) doc_ids_lists += doc_ids_list docs_to_remove_lists += docs_to_remove_list len_of_docs2remove_lists += len_of_docs2remove_list @@ -236,7 +235,7 @@ def analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, An filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc") return filtered_jaccard_dataframe, jaccard_stats - def jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + def _jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: # Process row and return a new list of Series or a new row threshold = self.jaccard_similarity_threshold doc_ids_list = [] @@ -321,7 +320,7 @@ def add_input_params(self, parser: ArgumentParser) -> None: f"--{sort_output_cli_param}", type=bool, default=sort_output_default, - help="Sort", + help="Sort the similarity clusters by the document ID of the kept doc (used primarily for testing)", ) def apply_input_params(self, args: Namespace) -> bool: diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 74597068c..3403bfc42 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -12,14 +12,13 @@ import io import os from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple +from typing import Any -import numpy as np import polars as pl import pyarrow as pa from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger +from data_processing.utils import CLIArgumentProvider, get_logger short_name = "fdclean" @@ -69,8 +68,9 @@ class DataCleaningTransform(AbstractTableTransform): keeps the directory structure of the input dataset, but has all the fuzzy duplicates removed. 
- Args: - duplicate_location: location (local or s3) of the duplicate document list + The following internal variables are initialized from the config dictionary: + duplicate_list_location: location (local or s3) of the duplicate document list + operation_mode: one of annotate, filter_duplicates, or filter_non_duplicates """ def __init__(self, config: dict[str, Any]): @@ -90,7 +90,7 @@ def __init__(self, config: dict[str, Any]): self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug(f"Transforming table with {table.num_rows} rows from file {file_name}") input_df = pl.from_arrow(table) # handle the case when the doc_id columns in the input dataframe and the # docs_to_remove_df have different types, i.e. one is int32 and the diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index b77f44401..166e48e26 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -115,17 +115,38 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) sys_argv.append("--data_s3_cred") sys_argv.append(s3_cred_ast) + if service_name == "minhash": + sys_argv.append("--scdata_s3_cred") + sys_argv.append(s3_cred_ast) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_cred") + sys_argv.append(s3_cred_ast) elif ( s3_creds.get("access_key") is not None and s3_creds.get("secret_key") is not None and s3_creds.get("url") is not None ): + ast_s3_cred = ParamsUtils.convert_to_ast(s3_creds) sys_argv.append("--data_s3_cred") - sys_argv.append(ParamsUtils.convert_to_ast(s3_creds)) + sys_argv.append(ast_s3_cred) + if service_name == "minhash": + sys_argv.append("--scdata_s3_cred") + sys_argv.append(ast_s3_cred) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_cred") + sys_argv.append(ast_s3_cred) sys_argv.append("--data_s3_config") else: sys_argv.append("--data_local_config") - sys_argv.append(ParamsUtils.convert_to_ast(data_io)) + ast_data_io = ParamsUtils.convert_to_ast(data_io) + sys_argv.append(ast_data_io) + if in_args.use_s3: + if service_name == "minhash": + sys_argv.append("--scdata_s3_config") + sys_argv.append(ast_data_io) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_config") + sys_argv.append(ast_data_io) return sys_argv def execute_service(self, service_short_name: str, params: list) -> int: @@ -163,9 +184,9 @@ def parse_args() -> argparse.Namespace: "--contents_column", type=str, required=False, help="name of the column that stores document text" ) parser.add_argument( - "--document_id_column", type=str, required=False, help="name of the column that stores document text" + "--document_id_column", type=str, required=False, help="name of the column that stores document ID" ) - parser.add_argument("--seed", type=int, required=False, help="name of the column that stores document text") + parser.add_argument("--seed", type=int, required=False, help="seed of the random number generator") parser.add_argument( "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation" ) diff --git 
a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py index c49124cf1..c14c4bdce 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py @@ -11,16 +11,12 @@ ################################################################################ import io import os -import re from argparse import ArgumentParser, Namespace -from typing import Any, List, Tuple +from typing import Any -import numpy as np import polars as pl -import pyarrow as pa from data_processing.transform import AbstractFolderTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger -from Murmur_MH import Murmur_MH short_name = "fdlist" @@ -61,7 +57,7 @@ class GetDuplicateListTransform(AbstractFolderTransform): This is an intermediate step of the fuzzy dedup pipeline. It runs in a single location and consolidates in a single file all the duplicates found for each band segment. - Args: + These internal variables are initialized from the config dictionary: subfolder: name of the subfolder with the duplicate records consolidated_filename: name of the file with the consolidated list of duplicates """ @@ -82,7 +78,7 @@ def __init__(self, config: dict[str, Any]): def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: self.logger.info(f"Get Duplicate List for folder {folder_name}") metadata = {} - input_folder = self.sanitize_folder_name(os.path.join(self.data_access.input_folder, folder_name)) + input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( path=input_folder, extensions=[".parquet"], @@ -90,24 +86,17 @@ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str ) if retries > 0: metadata |= {"data_access_retries": retries} - output_folder = self.sanitize_folder_name(self.data_access.output_folder) + output_folder = TransformUtils.clean_path(self.data_access.output_folder) output_path = os.path.join(output_folder, self.consolidated_filename) # consolidate into a single data frame band hashes computed by workers - consolidated_dataframe, consolidation_stats = self.consolidate_docs_to_remove_files(files) + consolidated_dataframe, consolidation_stats = self._consolidate_docs_to_remove_files(files) self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") metadata |= consolidation_stats output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) return [(output_data, output_path)], metadata - def sanitize_folder_name(self, folder_name: str) -> str: - if "://" in folder_name: - _, folder_name = folder_name.split("://") - if folder_name[-1] != "/": - folder_name = f"{folder_name}/" - return folder_name - - def consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + def _consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: consolidated_dataframe = pl.DataFrame() total_input_rows = 0 for fname, contents in files.items(): diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py index 6b14e1ba0..4e64bcb5a 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ 
b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -14,7 +14,7 @@ import unicodedata from argparse import ArgumentParser, Namespace from pathlib import Path -from typing import Any, List +from typing import Any import mmh3 import numpy as np @@ -22,7 +22,7 @@ import pyarrow as pa from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider +from data_processing.utils import CLIArgumentProvider, UnrecoverableException from Murmur_MH import Murmur_MH @@ -129,16 +129,13 @@ class SignatureCalculationTransform(AbstractTableTransform): """ This is the first transform of the fuzzy dedup pipeline. First, it calculates, for each document in a dataset, `num_permutations` minhashes. It accepts as - input the number of bands and the length of each band. If those two parameters - are not specified, then, based on the values of `jaccard_similarity_threshold` - and `num_permutations`, it determines the optimal number of bands, and the - length of each band (how many minhashes will be used to get the signature for - each band). The band signatures, the minhashes and the document lengths are + input the number of bands and the length (number of minhashes used for) each + band. The band signatures, the minhashes and the document lengths are then saved in the output folder, under a folder structure `bands/band=b/segment=s`. To improve scalability of the next step of fuzzy dedup, the hash space of each band is divided into `num_segments` segments. - Args: + The following internal variables are retrieved from the config parameter: document_id_column: name of the column storing the unique ID assigned to each document contents_column_cli_param: name of the column storing the contents of each document seed: the seed used to instantiate the random number generator @@ -171,21 +168,22 @@ def __init__(self, config: dict[str, Any]): self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) self.shingle_option = config.get(shingle_option_key, shingle_option_default) # use this dataframe to store the minhashes and size for each document - self.all_minhashes: pl.DataFrame = None + self.all_minhashes = None # use this dataframe to store the band hashes for each document - self.all_band_hashes: pl.DataFrame = None + self.all_band_hashes = None # this variable keeps track of how many files were processed since last # data write to properly update metadata self.files_processed = 0 self.bytes_processed = 0 self.data_access = config.get("data_access") + if self.data_access is None: + raise UnrecoverableException("Could not get a pointer to the data access object inside the transform.") self.last_file_name = None + self.sc_data_access = config.get(sigcalc_data_access_key, None) - if self.sc_data_access is None: - self.sc_daf = config.get(sigcalc_data_factory_key, None) - if self.sc_daf is None: - raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") - self.sc_data_access = self.sc_daf.create_data_access() + self.sc_daf = config.get(sigcalc_data_factory_key, None) + if self.sc_daf is None: + raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: """ @@ -194,7 +192,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab This implementation makes no 
modifications so effectively implements a copy of the input parquet to the output folder, without modification. """ - self.logger.info(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug(f"Transforming table with {table.num_rows} rows from file {file_name}") self.logger.debug("----minhash---") self.last_file_name = file_name self.files_processed += 1 @@ -226,7 +224,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab self.all_minhashes = self.all_minhashes.vstack(minhashes) # Calculate band hashes - band_hashes_list = self.process_rows_into_bands( + band_hashes_list = self._process_rows_into_bands( minhashes, self.num_bands, self.num_rows, @@ -247,7 +245,7 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab self.all_band_hashes = self.all_band_hashes.vstack(band_hashes) if len(self.all_minhashes) > 750000: - tables, metadata = self.write_band_signatures() + tables, metadata = self._write_band_signatures() else: tables = [] metadata = {} @@ -266,14 +264,16 @@ def flush(self) -> tuple[list[pa.Table], dict[str, Any]]: """ self.logger.info(f"Starting flush()") if self.all_band_hashes is not None and self.all_minhashes is not None: - tables, metadata = self.write_band_signatures() + tables, metadata = self._write_band_signatures() else: tables = [] metadata = {} return tables, metadata - def write_band_signatures(self): + def _write_band_signatures(self): # define the upper and lower bounds of each band segment + if self.sc_data_access is None: + self.sc_data_access = self.sc_daf.create_data_access() segment_bounds_list = [] upper_bound = np.uint64(np.iinfo(np.uint64).max) segment_len = np.uint64(upper_bound // self.num_segments) @@ -325,7 +325,6 @@ def write_band_signatures(self): self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure") # append the table to the result list, and the path to metadata - common_path = os.path.commonpath([self.data_access.input_folder, self.last_file_name]) last_file_name_path = Path(self.last_file_name) suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) if self.sc_data_access.output_folder is None: @@ -389,7 +388,7 @@ def _generate_word_shingles( k_shingles.append(delimiter.join(words[i : i + window_size])) return k_shingles, doc_len, document_id - def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b: int, r: int, seed: int = 42): + def _emit_bands(self, int_id_column: str, minhashes: np.array, b: int, r: int, seed: int = 42): num_minhashes = len(minhashes) assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" results = [] @@ -403,13 +402,12 @@ def emit_bands(self, int_id_column: str, minhashes: np.array, doc_length: int, b return results # Apply the function - def process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + def _process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): result = [] for row in df.iter_rows(): - bands = self.emit_bands( + bands = self._emit_bands( row[0], # document id np.array(row[1], dtype=np.uint32), # minhashes - row[2], # document length minhashlsh_num_bands, minhashlsh_length_band, ) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py index 5ddc102eb..40e0e97e3 100644 --- 
a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py +++ b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py @@ -40,5 +40,5 @@ def __init__(self): if __name__ == "__main__": launcher = PythonTransformLauncher(SignatureCalculationTransformConfiguration()) - logger.info("Launching noop transform") + logger.info("Launching fuzzy dedup signature calculation transform") launcher.launch() From 0636d5f998c61d9169fbd5afb3d124aa6b1bad4f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 21 Nov 2024 00:44:28 -0500 Subject: [PATCH 087/105] Reduce the amount of logging Signed-off-by: Constantin M Adam --- .../universal/fdedup/python/src/cluster_analysis_transform.py | 2 +- .../universal/fdedup/python/src/data_cleaning_transform.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py index 16febc0dc..fa3ce6d28 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -115,7 +115,7 @@ def __init__(self, config: dict[str, Any]): self.logger = get_logger(__name__) def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: - self.logger.info(f"Cluster analysis for folder {folder_name}") + self.logger.debug(f"Cluster analysis for folder {folder_name}") metadata = {} input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) files, retries = self.data_access.get_folder_files( diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py index 3403bfc42..cb07923ae 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform.py +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -86,7 +86,7 @@ def __init__(self, config: dict[str, Any]): self.operation_mode = config.get(operation_mode_key, operation_mode_default) contents = config.get("df") self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) - self.logger.info(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.logger.debug(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: From d58518bfe9d52eacd0063909267cabafb1f546dc Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Thu, 21 Nov 2024 00:45:39 -0500 Subject: [PATCH 088/105] Cleanup KFP pipeline code Signed-off-by: Constantin M Adam --- .../universal/fdedup/kfp_ray/fdedup_wf.py | 40 +++--- .../src/fdedup_compute_execution_params.py | 134 ++++++++++-------- 2 files changed, 92 insertions(+), 82 deletions(-) diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 683f93210..ffc6f79bc 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -115,22 +115,23 @@ def fuzzydedup( ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = { - "cpu": 1, - "memory": 4, + "cpu": 8, + "memory": 64, "image": task_image, "image_pull_secret": image_pull_secret, 
"imagePullPolicy": "Always", }, ray_worker_options: dict = { - "replicas": 2, - "max_replicas": 2, - "min_replicas": 2, - "cpu": 2, - "memory": 4, + "replicas": 10, + "max_replicas": 10, + "min_replicas": 10, + "cpu": 16, + "memory": 128, "image": task_image, "image_pull_secret": image_pull_secret, "imagePullPolicy": "Always", }, + runtime_actor_options: dict = {"num_cpus": 0.8, "memory": 16}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. checkpointing is not supported by dedup data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", @@ -153,10 +154,6 @@ def fuzzydedup( fdedup_shingle_option: str = "word", fdedup_jaccard_similarity_threshold: float = 0.75, fdedup_seed: int = 42, - fdedup_docs_to_remove_folder: str = "docs_to_remove", - fdedup_duplicate_list_location: str = os.path.join( - "docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet" - ), fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, @@ -206,8 +203,6 @@ def fuzzydedup( :param fdedup_shingle_option - type of shingle, one of 'word', or 'char' :param fdedup_jaccard_similarity_threshold - similarity threshold :param fdedup_seed - seed for the random number generator - :param fdedup_docs_to_remove_folder - name of the subfolder holding the duplicate doc ids - :param fdedup_duplicate_list_location - name of the file holding the consolidated list of duplicates :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None @@ -222,6 +217,7 @@ def fuzzydedup( # compute execution params compute_common_exec_params = compute_common_params_op( worker_options=ray_worker_options, + actor_options=runtime_actor_options, data_s3_config=data_s3_config, num_permutations=fdedup_num_permutations, n_samples=fdedup_n_samples, @@ -229,8 +225,9 @@ def fuzzydedup( ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) fdedup_num_segments = compute_common_exec_params.outputs["num_segments"] - runtime_actor_cpus = compute_common_exec_params.outputs["cpus_per_actor"] runtime_num_actors = compute_common_exec_params.outputs["num_actors"] + runtime_actor_cpus = compute_common_exec_params.outputs["actor_cpu"] + runtime_actor_memory = compute_common_exec_params.outputs["actor_memory"] # start Ray cluster ray_cluster = create_ray_op( @@ -246,8 +243,9 @@ def fuzzydedup( # Get the parameters for the signature calculation job compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, @@ -289,8 +287,9 @@ def fuzzydedup( # Get the parameters for the cluster analysis job compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, 
data_num_samples=data_num_samples, @@ -319,16 +318,15 @@ def fuzzydedup( execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, - duplicate_docids_folder=fdedup_docs_to_remove_folder, - duplicate_list_location=fdedup_duplicate_list_location, ) ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) @@ -348,8 +346,9 @@ def fuzzydedup( execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( - runtime_actor_cpus=runtime_actor_cpus, runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, @@ -357,7 +356,6 @@ def fuzzydedup( runtime_job_id=run_id, runtime_code_location=runtime_code_location, id_column=fdedup_document_id_column, - duplicate_list_location=fdedup_duplicate_list_location, operation_mode=fdedup_operation_mode, ) ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index cd3a58b99..15722c164 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -10,15 +10,27 @@ # limitations under the License. ################################################################################ -from typing import Any, Dict, NamedTuple +from typing import Any, NamedTuple def compute_common_params( worker_options: dict, # ray worker configuration + actor_options: dict, # actor desired configuration data_s3_config: str, # S3 configuration num_permutations: int, # number of permutations (minhashes) per document n_samples: int, # files to sample for number of documents estimation -) -> NamedTuple("fdedup_params", [("num_segments", int), ("num_actors", int), ("cpus_per_actor", float)]): +) -> NamedTuple( + "fdedup_params", [("num_segments", int), ("num_actors", str), ("actor_cpu", float), ("actor_memory", int)] +): + """ + Compute fuzzy dedup execution parameters common to all the transforms + :param worker_options: worker group configuration + :param actor_options: desired actor configuration + :param data_s3_config: s3 configuration + :param num_permutations: number of permutations + :param n_samples: number of samples used to estimate the total number of documents in the dataset + :return: fdedup_params NamedTuple: num_segments - int, num_actors - str, cpus (float) and memory (int) per actor + """ import sys @@ -40,49 +52,45 @@ def compute_common_params( print(f"Estimated number of documents and documents size is zero. 
Please verify the input path.") sys.exit(1) print(f"Estimated number of docs: {number_of_docs}") + actor_cpu: float = actor_options.get("num_cpus", 1) # if num_cpus not specified, request 1 CPU per actor + actor_memory: int = int(actor_options.get("memory", 16)) * GB # if memory not specified, request 16 GB per actor + # Calculate the number of segments # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where: # 8 bytes are taken by the band hash # (num_permutations * 4) bytes are taken by the min hashes # 20 bytes to provide some extra space for storage in a table # The total amount of space needed by a band is number_of_docs * doc_bytes. - # To scale the handling of this data, divide each band into segments, where each segment size is below 3GB + # To scale band handling, divide each band into segments, each smaller than 1/6 of an actor's allocated memory doc_bytes = 8 + num_permutations * 4 + 20 band_bytes = number_of_docs * doc_bytes - num_segments = 1 + (band_bytes // (3 * GB)) + num_segments = 1 + (band_bytes // (actor_memory // 6)) print(f"Number of segments: {num_segments}") - # To process data efficiently, each actor needs 16GB of memory. - # The actor config controls CPU allocation, not memory; - # use CPU allocation s.t. the number of actors on a worker provides access to 16GB of memory for each actor. - # Also, to keep S3 utilization in check, limit the number of actors to 2000 - num_nodes = worker_options["replicas"] - cpu_per_node = worker_options["cpu"] - 1 - memory_per_node = worker_options["memory"] - - memory_per_actor = 16 # GB - max_num_actors = 2000 - num_actors_per_node: int = int(memory_per_node / memory_per_actor) - if num_actors_per_node == 0: - num_actors_per_node = 1 - # never run actors on the head node, so (n - 1) nodes to run actors - num_actors = (num_nodes - 1) * num_actors_per_node - - while num_actors > max_num_actors: - num_actors -= num_nodes - 1 - num_actors_per_node -= 1 - print(f"Number of actors per node = {num_actors_per_node}") - cpus_per_actor = cpu_per_node / num_actors_per_node - print(f"CPUs per actor = {cpus_per_actor}") + # Calculate number of actors, using KFPUtils.default_compute_execution_params() + # Create new dict with memory expressed in bytes, as expected by KFPUtils.default_compute_execution_params() + actor_config = { + "num_cpus": actor_cpu, + "memory": actor_memory, + } + num_actors = KFPUtils.default_compute_execution_params(str(worker_options), str(actor_config)) + print(f"num_actors = {num_actors}") from collections import namedtuple - fdedup_params = namedtuple("fdedup_params", ["num_segments", "num_actors", "cpus_per_actor"]) - return fdedup_params(num_segments, num_actors, cpus_per_actor) + fdedup_params = namedtuple( + typename="fdedup_params", + field_names=["num_segments", "num_actors", "actor_cpu", "actor_memory"], + ) + print( + f"num_segments = {num_segments}, num_actors = {num_actors}, actor_cpu = {actor_cpu}, actor_memory = {actor_memory}" + ) + return fdedup_params(num_segments, num_actors, actor_cpu, actor_memory) def signature_calc_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max 
files to process data_num_samples: int, # num samples to process @@ -103,8 +111,9 @@ def signature_calc_compute_execution_params( """ Compute fuzzy dedup execution parameters for signature calculation - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -116,23 +125,22 @@ def signature_calc_compute_execution_params( :param num_permutations: number of permutations :param num_bands: number of bands :param num_minhashes_per_band: band length - :param word_shingle_size: number of words in shingle + :param word_shingle_size: number of words/chars in shingle :param shingle_option: str: type of shingle, one of 'word' or 'char' :param threshold: threshold, :param num_segments: number of segments :param seed: seed for the random number generator - :return: a dictionary with a Ray Job execution parameters + :return: dictionary with Ray Job execution parameters """ # fuzzy parameters for signature calculation - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} - print(f"runtime_actor_options = {runtime_actor_options}") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -151,8 +159,9 @@ def signature_calc_compute_execution_params( def cluster_analysis_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -166,8 +175,9 @@ def cluster_analysis_compute_execution_params( """ Compute fuzzy dedup execution parameters for cluster analysis - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -189,13 +199,13 @@ def cluster_analysis_compute_execution_params( data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + actor_options = 
{"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -206,47 +216,48 @@ def cluster_analysis_compute_execution_params( def get_duplicate_list_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process runtime_pipeline_id: str, # pipeline id runtime_job_id: str, # job id runtime_code_location: dict, # code location - duplicate_docids_folder: str, # folder with the docs IDs to remove - duplicate_list_location: str, # location of the list of duplicate doc ids ) -> dict: """ Compute fuzzy dedup execution parameters for get duplicate list step - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process :param runtime_pipeline_id: pipeline id :param runtime_job_id: job id :param runtime_code_location: code location - :param duplicate_docids_folder: folder with the docs IDs to remove - :param duplicate_list_location: location of the list of duplicate doc ids :return: a dictionary with a Ray Job execution parameters """ import json + import os # fuzzy parameters + duplicate_docids_folder: str = "docs_to_remove" + duplicate_list_location: str = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") # Get cluster parameters data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) base_folder = data_s3_config_dict.get("output_folder") data_s3_config_dict["input_folder"] = base_folder data_s3_config_dict["output_folder"] = base_folder data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), @@ -256,8 +267,9 @@ def get_duplicate_list_compute_execution_params( def data_cleaning_compute_execution_params( - runtime_actor_cpus: float, # actor's CPU requirements - runtime_num_actors: int, # number of actors needed to run this step + runtime_num_actors: 
str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -265,13 +277,13 @@ def data_cleaning_compute_execution_params( runtime_job_id: str, # job id runtime_code_location: dict, # code location id_column: str, # integer document id column name - duplicate_list_location: str, # location of the list of duplicate doc ids operation_mode: str, # filter (non-)duplicates or annotate ) -> dict: """ Compute fuzzy dedup execution parameters - :param runtime_actor_cpus: actor's CPU requirements - :param runtime_num_actors: number of actors to run this step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -279,7 +291,6 @@ def data_cleaning_compute_execution_params( :param runtime_job_id: job id :param runtime_code_location: code location :param id_column: integer document id column name - :param duplicate_list_location: location of the list of duplicate doc ids :param operation_mode: filter (non-)duplicates or annotate :return: a dictionary with a Ray Job execution parameters """ @@ -298,13 +309,14 @@ def data_cleaning_compute_execution_params( output_subfolder = "annotated" data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") - runtime_actor_options: dict = {"num_cpus": runtime_actor_cpus} + duplicate_list_location: str = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, "runtime_num_workers": runtime_num_actors, - "runtime_worker_options": str(runtime_actor_options), + "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), From 170af4bb5c0e95ac31ea6971791b29d1f818cbb4 Mon Sep 17 00:00:00 2001 From: SHAHROKH DAIJAVAD Date: Thu, 21 Nov 2024 12:22:38 -0800 Subject: [PATCH 089/105] first version of a notebook Signed-off-by: SHAHROKH DAIJAVAD --- .../language/doc_quality/doc_quality.ipynb | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 transforms/language/doc_quality/doc_quality.ipynb diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb new file mode 100644 index 000000000..99bab8ff3 --- /dev/null +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The set of dictionary keys holding the DocQualityTransform configuration values is as follows: \n", + "* text_lang - specifies the language used in the text content. By default, \"en\" is used.\n", + "* doc_content_column - specifies the name of the column that contains the document text. By default, \"contents\" is used.\n", + "* bad_word_filepath - specifies a path to the bad word file: a local folder (file or directory) that points to the bad word file. You do not have to set this parameter if you do not need bad word filtering.\n", + "#####" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from doc_quality_transform import (bad_word_filepath_cli_param, doc_content_column_cli_param, text_lang_cli_param,)\n", + "from doc_quality_transform_python import DocQualityPythonTransformConfiguration" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# create parameters\n", + "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(\"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # execution info\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + " \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n", + " # doc_quality params\n", + " text_lang_cli_param: \"en\",\n", + " doc_content_column_cli_param: \"contents\",\n", + " bad_word_filepath_cli_param: os.path.join(\"python\", \"ldnoobw\", \"en\"),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=DocQualityPythonTransformConfiguration())\n", + "launcher.launch()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 10851f6643fcf96c2417b332118842660d225d3d Mon Sep 17 00:00:00 2001 From: SHAHROKH DAIJAVAD Date: Thu, 21 Nov 2024 13:34:47 -0800 Subject: [PATCH 090/105] fixed code_location Signed-off-by: SHAHROKH DAIJAVAD --- .../language/doc_quality/doc_quality.ipynb | 52 ++++++++++++++++--- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index 99bab8ff3..c6617b2bc 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -90,6 +90,7 @@ " \"input_folder\": input_folder,\n", " \"output_folder\": output_folder,\n", "}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", "params = {\n", " # Data access. 
Only required parameters are specified\n", " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", @@ -114,10 +115,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:32:09 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "13:32:09 INFO - pipeline id pipeline_id\n", + "13:32:09 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "13:32:09 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "13:32:09 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:09 INFO - orchestrator docq started at 2024-11-21 13:32:09\n", + "13:32:09 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", + "13:32:09 INFO - Load badwords found locally from python/ldnoobw/en\n", + "13:32:11 INFO - Completed 1 files (100.0%) in 0.025 min\n", + "13:32:11 INFO - Done processing 1 files, waiting for flush() completion.\n", + "13:32:11 INFO - done flushing in 0.0 sec\n", + "13:32:11 INFO - Completed execution in 0.025 min, execution result 0\n" + ] + } + ], "source": [ "%%capture\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -135,14 +156,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/metadata.json', 'python/output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", "glob.glob(\"python/output/*\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 7545872c6e059eb67f2f947418572d255bf66685 Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Fri, 22 Nov 2024 10:43:05 +0900 Subject: [PATCH 091/105] add link to jupyter notebook Signed-off-by: Daiki Tsuzuku --- .../language/doc_quality/doc_quality.ipynb | 30 +++++++++++++------ .../language/doc_quality/python/README.md | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index c6617b2bc..f3978dc96 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -23,9 +23,10 @@ "%%capture\n", "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", - "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" + "%pip install data-prep-toolkit\n", + "%pip 
install data-prep-toolkit-transforms\n", + "%pip install data-prep-connector\n", + "%pip install dpk-doc-quality-transform-python" ] }, { @@ -52,12 +53,23 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'doc_quality_transform'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdata_processing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mruntime\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpure_python\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PythonTransformLauncher\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdata_processing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ParamsUtils\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoc_quality_transform\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (bad_word_filepath_cli_param, doc_content_column_cli_param, text_lang_cli_param,)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoc_quality_transform_python\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DocQualityPythonTransformConfiguration\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'doc_quality_transform'" + ] + } + ], "source": [ - "import ast\n", "import os\n", "import sys\n", "\n", @@ -187,7 +199,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -201,7 +213,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/transforms/language/doc_quality/python/README.md b/transforms/language/doc_quality/python/README.md index 6a085ef05..c10bc4b88 100644 --- a/transforms/language/doc_quality/python/README.md +++ b/transforms/language/doc_quality/python/README.md @@ -90,7 +90,7 @@ To see results of the transform. 
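The notebook above drives the launcher entirely through `sys.argv` rather than a Python API. A rough sketch of that handoff, assuming `ParamsUtils.dict_to_req` renders one `--key=value` token per entry; `dict_to_req_sketch` below is a hypothetical stand-in, and the real helper in `data_processing.utils` is authoritative:

```python
# Hypothetical stand-in for ParamsUtils.dict_to_req, shown only to make the
# params -> argv handoff concrete; the toolkit's own helper defines the
# actual format.
def dict_to_req_sketch(d: dict) -> list[str]:
    argv = ["program"]  # placeholder program name for argv[0]
    for key, value in d.items():
        argv.append(f"--{key}={value}")
    return argv

params = {"runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id"}
print(dict_to_req_sketch(params))
# ['program', '--runtime_pipeline_id=pipeline_id', '--runtime_job_id=job_id']
```

Setting `sys.argv` this way lets the same argparse-based launcher serve both command-line runs and notebook runs without a second entry point.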
### Code example -TBD (link to the notebook will be provided) +[notebook](../doc_quality.ipynb) ### Transforming data using the transform image From 9ee506e341749765c59b8ab8430829fd442f4950 Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Fri, 22 Nov 2024 15:09:11 +0900 Subject: [PATCH 092/105] update notebook Signed-off-by: Daiki Tsuzuku --- .../language/doc_quality/doc_quality.ipynb | 52 +++++++------------ 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index f3978dc96..91aafd74d 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -53,22 +53,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'doc_quality_transform'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdata_processing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mruntime\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpure_python\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PythonTransformLauncher\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdata_processing\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ParamsUtils\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoc_quality_transform\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (bad_word_filepath_cli_param, doc_content_column_cli_param, text_lang_cli_param,)\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdoc_quality_transform_python\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DocQualityPythonTransformConfiguration\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'doc_quality_transform'" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import sys\n", @@ -89,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -127,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, "outputs": [ @@ -135,19 +123,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "13:32:09 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "13:32:09 INFO - pipeline id pipeline_id\n", - "13:32:09 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", - "13:32:09 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", - "13:32:09 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:09 INFO - data factory 
data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:09 INFO - orchestrator docq started at 2024-11-21 13:32:09\n", - "13:32:09 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", - "13:32:09 INFO - Load badwords found locally from python/ldnoobw/en\n", - "13:32:11 INFO - Completed 1 files (100.0%) in 0.025 min\n", - "13:32:11 INFO - Done processing 1 files, waiting for flush() completion.\n", - "13:32:11 INFO - done flushing in 0.0 sec\n", - "13:32:11 INFO - Completed execution in 0.025 min, execution result 0\n" + "10:38:40 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "10:38:40 INFO - pipeline id pipeline_id\n", + "10:38:40 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "10:38:40 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "10:38:40 INFO - data factory data_ max_files -1, n_sample -1\n", + "10:38:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "10:38:40 INFO - orchestrator docq started at 2024-11-22 10:38:40\n", + "10:38:40 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", + "10:38:40 INFO - Load badwords found locally from python/ldnoobw/en\n", + "10:38:49 INFO - Completed 1 files (100.0%) in 0.146 min\n", + "10:38:49 INFO - Done processing 1 files, waiting for flush() completion.\n", + "10:38:49 INFO - done flushing in 0.0 sec\n", + "10:38:49 INFO - Completed execution in 0.146 min, execution result 0\n" ] } ], @@ -168,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, "outputs": [ @@ -178,7 +166,7 @@ "['python/output/metadata.json', 'python/output/test1.parquet']" ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } From 280d105a1b5ced45ae4fc7d5bdf4123e86669022 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 22 Nov 2024 20:21:02 -0500 Subject: [PATCH 093/105] added fdedup to build package for all transforms Signed-off-by: Maroun Touma --- transforms/pyproject.toml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 2357553e4..badb8bbd9 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2.dev3" +version = "0.2.2.dev4" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -44,6 +44,7 @@ all = { file = [ "universal/hap/python/requirements.txt", "universal/tokenization/python/requirements.txt", "universal/ededup/python/requirements.txt", +"universal/fdedup/python/requirements.txt", "universal/profiler/python/requirements.txt", "universal/doc_id/python/requirements.txt", 
"universal/filter/python/requirements.txt", @@ -71,6 +72,7 @@ pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]} hap = { file = ["universal/hap/python/requirements.txt"]} tokenization = { file = ["universal/tokenization/python/requirements.txt"]} ededup = { file = ["universal/ededup/python/requirements.txt"]} +fdedup = { file = ["universal/fdedup/python/requirements.txt"]} profiler = { file = ["universal/profiler/python/requirements.txt"]} doc_id = { file = ["universal/doc_id/python/requirements.txt"]} filter = { file = ["universal/filter/python/requirements.txt"]} @@ -80,11 +82,8 @@ web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them -[tool.setuptools.package-data] -"*" = ["*.txt"] - -[tool.setuptools.packages.find] -where = ["src"] +#[tool.setuptools.package-data] +#"*" = ["*.txt"] #[tool.setuptools.package-dir] #dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" From cf133880deac097f18cc580dc9364c680f1a9623 Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Mon, 25 Nov 2024 09:55:49 +0900 Subject: [PATCH 094/105] stop installing data-prep-connector Signed-off-by: Daiki Tsuzuku --- transforms/language/doc_quality/doc_quality.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index 91aafd74d..5b87c91b8 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -25,7 +25,6 @@ "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", "%pip install data-prep-toolkit-transforms\n", - "%pip install data-prep-connector\n", "%pip install dpk-doc-quality-transform-python" ] }, From edb605bb681c57db1f9eb5d3fe9f425681f57c2b Mon Sep 17 00:00:00 2001 From: Daiki Tsuzuku Date: Mon, 25 Nov 2024 12:39:31 +0900 Subject: [PATCH 095/105] use data-prep-toolkit-transforms==0.2.2.dev3 Signed-off-by: Daiki Tsuzuku --- .../language/doc_quality/doc_quality.ipynb | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb index 5b87c91b8..bf91047b6 100644 --- a/transforms/language/doc_quality/doc_quality.ipynb +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -24,8 +24,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "%pip install data-prep-toolkit\n", - "%pip install data-prep-toolkit-transforms\n", - "%pip install dpk-doc-quality-transform-python" + "%pip install data-prep-toolkit-transforms==0.2.2.dev3" ] }, { @@ -52,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -76,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -114,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, "outputs": [ @@ -122,19 +121,19 @@ 
"name": "stderr", "output_type": "stream", "text": [ - "10:38:40 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", - "10:38:40 INFO - pipeline id pipeline_id\n", - "10:38:40 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", - "10:38:40 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", - "10:38:40 INFO - data factory data_ max_files -1, n_sample -1\n", - "10:38:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "10:38:40 INFO - orchestrator docq started at 2024-11-22 10:38:40\n", - "10:38:40 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", - "10:38:40 INFO - Load badwords found locally from python/ldnoobw/en\n", - "10:38:49 INFO - Completed 1 files (100.0%) in 0.146 min\n", - "10:38:49 INFO - Done processing 1 files, waiting for flush() completion.\n", - "10:38:49 INFO - done flushing in 0.0 sec\n", - "10:38:49 INFO - Completed execution in 0.146 min, execution result 0\n" + "12:39:07 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "12:39:07 INFO - pipeline id pipeline_id\n", + "12:39:07 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "12:39:07 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "12:39:07 INFO - data factory data_ max_files -1, n_sample -1\n", + "12:39:07 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "12:39:07 INFO - orchestrator docq started at 2024-11-25 12:39:07\n", + "12:39:07 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", + "12:39:07 INFO - Load badwords found locally from python/ldnoobw/en\n", + "12:39:09 INFO - Completed 1 files (100.0%) in 0.033 min\n", + "12:39:09 INFO - Done processing 1 files, waiting for flush() completion.\n", + "12:39:09 INFO - done flushing in 0.0 sec\n", + "12:39:09 INFO - Completed execution in 0.033 min, execution result 0\n" ] } ], @@ -155,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, "outputs": [ @@ -165,7 +164,7 @@ "['python/output/metadata.json', 'python/output/test1.parquet']" ] }, - "execution_count": 11, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } From 1a762e01d6cdd3af08c5983c6bcf81e175ab3627 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 11:53:18 -0500 Subject: [PATCH 096/105] First draft of fdedup notebook Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 152 +++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 transforms/universal/fdedup/fdedup.ipynb diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb new file mode 100644 
index 000000000..ee1d9b561 --- /dev/null +++ b/transforms/universal/fdedup/fdedup.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_ray import RayServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(\"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"ray\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"num_segments\": 1,\n", + " \"operation_mode\": \"annotate\",\n", + " # ray configuration parameters\n", + " \"run_locally\": True,\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = RayServiceOrchestrator(global_params=args)\n", + "# Launch ray fuzzy dedup execution\n", + "orchestrator.orchestrate()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ffebdc1c388440a1b03e4efe88178405b4c569dc Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 13:52:51 -0500 Subject: [PATCH 097/105] Added sample ray fuzzy dedup jupyter notebook Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 71 ++++++++++++++++--- .../python/src/fdedup_transform_python.py | 10 +++ 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb index ee1d9b561..88bcd87aa 100644 --- a/transforms/universal/fdedup/fdedup.ipynb +++ b/transforms/universal/fdedup/fdedup.ipynb @@ -67,8 +67,8 @@ "outputs": [], "source": [ "# create parameters\n", - "input_folder = os.path.join(\"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join( \"ray\", \"output\")\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -79,7 +79,7 @@ " \"num_bands\": 14,\n", " \"num_minhashes_per_band\": 8,\n", " \"num_segments\": 1,\n", - " \"operation_mode\": \"annotate\",\n", + " \"operation_mode\": \"filter_duplicates\",\n", " # ray configuration parameters\n", " \"run_locally\": True,\n", "}\n" @@ -90,7 +90,7 @@ "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", "metadata": {}, "source": [ - "##### ***** Use ray runtime to invoke the transform" + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" ] }, { @@ -100,12 +100,13 @@ "metadata": {}, "outputs": [], "source": [ - "%%capture\n", + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", "args = parse_args()\n", "# Initialize the orchestrator\n", "orchestrator = RayServiceOrchestrator(global_params=args)\n", "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()\n" + "orchestrator.orchestrate()" ] }, { @@ -124,15 +125,65 @@ "outputs": [], "source": [ "import glob\n", - "glob.glob(\"python/output/*\")" + "glob.glob(\"ray/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "fdedup_ray", "language": "python", - "name": "python3" + "name": "fdedup_ray" }, "language_info": { "codemirror_mode": { @@ -144,7 +195,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index 166e48e26..b200676da 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -147,6 +147,8 @@ def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: if service_name == "fdclean": sys_argv.append("--dcdata_s3_config") sys_argv.append(ast_data_io) + if in_args.run_locally: + sys_argv.append(f"--run_locally={in_args.run_locally}") return sys_argv def execute_service(self, service_short_name: str, params: list) -> int: @@ -240,6 +242,7 @@ def parse_args() -> argparse.Namespace: default=None, help="ast string of options for s3 credentials", ) + parser.add_argument( "--shingle_option", type=str, @@ -248,6 +251,13 @@ def parse_args() -> argparse.Namespace: help="Option used for shingling", ) + parser.add_argument( + "--run_locally", + type=lambda x: bool(str2bool(x)), + default=True, + help="run locally or connect to a remote machine", + ) + return parser.parse_args() From 75fc4d1464d4d8c83dc0a087528c46c873a46d2f Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 16:59:26 -0500 Subject: [PATCH 098/105] Add jupyter notebooks for python, ray and spark fuzzy dedup Signed-off-by: Constantin M Adam --- .../universal/fdedup/fdedup_python.ipynb | 215 ++++++++++++++++++ transforms/universal/fdedup/fdedup_ray.ipynb | 214 +++++++++++++++++ .../universal/fdedup/fdedup_spark.ipynb | 212 +++++++++++++++++ .../python/src/fdedup_transform_python.py | 2 +- 4 files changed, 642 insertions(+), 1 deletion(-) create mode 100644 transforms/universal/fdedup/fdedup_python.ipynb create mode 100644 transforms/universal/fdedup/fdedup_ray.ipynb create mode 100644 transforms/universal/fdedup/fdedup_spark.ipynb diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb new file mode 100644 index 000000000..83f9bd600 --- /dev/null +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args, ServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/python/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/python/output/ | folder that contains all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band:int | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", 
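The defaults in the parameter table follow the standard LSH banding trade-off: with b bands of r minhashes each (b × r = num_permutations, here 14 × 8 = 112), two documents with Jaccard similarity s collide in at least one band with probability 1 − (1 − s^r)^b. A quick sketch of what the chosen defaults imply:

```python
# Standard LSH S-curve: probability that two documents with Jaccard
# similarity s share at least one of b bands of r minhashes each.
def collision_probability(s: float, b: int = 14, r: int = 8) -> float:
    return 1.0 - (1.0 - s**r) ** b

# The curve's similarity threshold sits near (1/b) ** (1/r).
print(f"threshold ~ {(1.0 / 14) ** (1.0 / 8):.2f}")    # ~0.72
print(f"P(s=0.8) ~ {collision_probability(0.8):.2f}")  # ~0.92: likely flagged
print(f"P(s=0.6) ~ {collision_probability(0.6):.2f}")  # ~0.21: likely ignored
```

Raising r sharpens the curve toward near-exact duplicates, while raising b lowers the threshold and flags more distant near-duplicates.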
"metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = ServiceOrchestrator(global_params=args)\n", + "# Launch python fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"python/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", + "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", + "input_df = input_df_1.vstack(input_df_2)\n", + "\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", + "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", + "output_df = output_df_1.vstack(output_df_2)\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_ray", + "language": "python", + "name": "fdedup_ray" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb new file mode 100644 index 000000000..533ca019f --- /dev/null +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_ray import RayServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/ray/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/ray/output/ | folder that contains all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band:int | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |\n", + "| run_locally:bool | true | if true, launch a ray cluster locally, otherwise connect to an already existing cluster | \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + " # ray configuration parameters\n", + " \"run_locally\": True,\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", 
"metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = RayServiceOrchestrator(global_params=args)\n", + "# Launch ray fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"ray/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_ray", + "language": "python", + "name": "fdedup_ray" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/fdedup_spark.ipynb b/transforms/universal/fdedup/fdedup_spark.ipynb new file mode 100644 index 000000000..9f4bf1772 --- /dev/null +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -0,0 +1,212 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_spark import SparkServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/spark/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/spark/output/ | folder that contains all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band:int | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use spark runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = SparkServiceOrchestrator(global_params=args)\n", + "# Launch spark fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"spark/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_spark", + "language": "python", + "name": "fdedup_spark" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py index b200676da..def3590e4 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -254,7 +254,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--run_locally", type=lambda x: bool(str2bool(x)), - default=True, + default=False, help="run locally or connect to a remote machine", ) From edd5841bb199c974489a8f612968c587bdeebab3 Mon Sep 17 00:00:00 2001 From: Constantin M Adam Date: Mon, 25 Nov 2024 17:08:43 -0500 Subject: [PATCH 099/105] Add jupyter notebooks for python, ray and spark fuzzy dedup Signed-off-by: Constantin M Adam --- transforms/universal/fdedup/fdedup.ipynb | 203 ----------------------- 1 file changed, 203 deletions(-) delete mode 100644 transforms/universal/fdedup/fdedup.ipynb diff --git a/transforms/universal/fdedup/fdedup.ipynb b/transforms/universal/fdedup/fdedup.ipynb deleted file mode 
100644 index 88bcd87aa..000000000 --- a/transforms/universal/fdedup/fdedup.ipynb +++ /dev/null @@ -1,203 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "afd55886-5f5b-4794-838e-ef8179fb0394", - "metadata": {}, - "source": [ - "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", - "```\n", - "make venv\n", - "source venv/bin/activate && pip install jupyterlab\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "## This is here as a reference only\n", - "# Users and application developers must use the right tag for the latest from pypi\n", - "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" - ] - }, - { - "cell_type": "markdown", - "id": "ebf1f782-0e61-485c-8670-81066beb734c", - "metadata": {}, - "source": [ - "##### ***** Import required Classes and modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args\n", - "from fdedup_transform_ray import RayServiceOrchestrator" - ] - }, - { - "cell_type": "markdown", - "id": "7234563c-2924-4150-8a31-4aec98c1bf33", - "metadata": {}, - "source": [ - "##### ***** Setup runtime parameters for this transform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", - "params = {\n", - " # transform configuration parameters\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - " \"contents_column\": \"contents\",\n", - " \"document_id_column\": \"int_id_column\",\n", - " \"num_permutations\": 112,\n", - " \"num_bands\": 14,\n", - " \"num_minhashes_per_band\": 8,\n", - " \"num_segments\": 1,\n", - " \"operation_mode\": \"filter_duplicates\",\n", - " # ray configuration parameters\n", - " \"run_locally\": True,\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", - "metadata": {}, - "source": [ - "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "args = parse_args()\n", - "# Initialize the orchestrator\n", - "orchestrator = RayServiceOrchestrator(global_params=args)\n", - "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()" - ] - }, - { - "cell_type": "markdown", - "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", - "metadata": {}, - "source": [ - "##### **** The specified folder will include the transformed parquet files." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7276fe84-6512-4605-ab65-747351e13a7c", - "metadata": {}, - "outputs": [], - "source": [ - "import glob\n", - "glob.glob(\"ray/output/cleaned/*\")" - ] - }, - { - "cell_type": "markdown", - "id": "d30489d9-fc98-423e-90a8-e8f372787e88", - "metadata": {}, - "source": [ - "***** print the input data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", - "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", - " print(input_df)" - ] - }, - { - "cell_type": "markdown", - "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", - "metadata": {}, - "source": [ - "***** print the output result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", - "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", - " print(output_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d60e391d-cf58-47ae-9991-04c05d114edc", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "fdedup_ray", - "language": "python", - "name": "fdedup_ray" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From f61493cf1915638e0a4ff4f94c824b02c69833a7 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 25 Nov 2024 17:53:00 -0500 Subject: [PATCH 100/105] relax hap dependencies on torch Signed-off-by: Maroun Touma --- transforms/pyproject.toml | 8 ++++---- transforms/universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/requirements.txt | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 2357553e4..3c1f64c32 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -80,11 +80,11 @@ web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them -[tool.setuptools.package-data] -"*" = ["*.txt"] +#[tool.setuptools.package-data] +#"*" = ["*.txt"] -[tool.setuptools.packages.find] -where = ["src"] +#[tool.setuptools.packages.find] +#where = ["src"] #[tool.setuptools.package-dir] #dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 505dd9ceb..ba8948477 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,5 +1,5 @@ data-prep-toolkit==0.2.2.dev2 nltk==3.9.1 transformers==4.38.2 -torch==2.4.1 +torch>=2.2.2,<=2.4.1 pandas==2.2.2 diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 0ed65f625..34e1d6932 100644 ---
a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -2,5 +2,5 @@ data-prep-toolkit[ray]==0.2.2.dev2 dpk-hap-transform-python==0.2.2.dev2 nltk==3.9.1 transformers==4.38.2 -torch==2.4.1 +torch>=2.2.2,<=2.4.1 pandas==2.2.2 From 60dd4794b62876c30a12e72abd8395a9dcc24be8 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 25 Nov 2024 19:12:58 -0500 Subject: [PATCH 101/105] run make.version for dpk 0.2.2 and connector 0.2.3 Signed-off-by: Maroun Touma --- .make.versions | 12 ++++++++++-- data-connector-lib/pyproject.toml | 2 +- data-processing-lib/pyproject.toml | 2 +- data-processing-lib/spark/pyproject.toml | 4 ++-- .../createRayClusterComponent.yaml | 2 +- .../deleteRayClusterComponent.yaml | 2 +- kfp/kfp_ray_components/executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- .../executeSubWorkflowComponent.yaml | 2 +- .../kfp_v1_workflow_support/pyproject.toml | 4 ++-- .../kfp_v2_workflow_support/pyproject.toml | 4 ++-- .../shared_workflow_support/pyproject.toml | 4 ++-- .../code/code2parquet/kfp_ray/code2parquet_wf.py | 2 +- transforms/code/code2parquet/python/pyproject.toml | 2 +- transforms/code/code2parquet/python/requirements.txt | 2 +- transforms/code/code2parquet/ray/pyproject.toml | 6 +++--- transforms/code/code_profiler/python/pyproject.toml | 2 +- .../code/code_profiler/python/requirements.txt | 2 +- transforms/code/code_profiler/ray/pyproject.toml | 6 +++--- .../code/code_quality/kfp_ray/code_quality_wf.py | 2 +- transforms/code/code_quality/python/pyproject.toml | 2 +- transforms/code/code_quality/python/requirements.txt | 2 +- transforms/code/code_quality/ray/pyproject.toml | 6 +++--- .../header_cleanser/kfp_ray/header_cleanser_wf.py | 2 +- .../code/header_cleanser/python/pyproject.toml | 2 +- .../code/header_cleanser/python/requirements.txt | 2 +- transforms/code/header_cleanser/ray/pyproject.toml | 6 +++--- .../code/license_select/kfp_ray/license_select_wf.py | 2 +- transforms/code/license_select/python/pyproject.toml | 2 +- .../code/license_select/python/requirements.txt | 2 +- transforms/code/license_select/ray/pyproject.toml | 6 +++--- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- transforms/code/malware/python/pyproject.toml | 4 ++-- transforms/code/malware/ray/pyproject.toml | 6 +++--- .../proglang_select/kfp_ray/proglang_select_wf.py | 2 +- .../code/proglang_select/python/pyproject.toml | 2 +- .../code/proglang_select/python/requirements.txt | 2 +- transforms/code/proglang_select/ray/pyproject.toml | 6 +++--- .../kfp_ray/repo_level_order_wf.py | 2 +- .../code/repo_level_ordering/ray/pyproject.toml | 4 ++-- .../doc_chunk/kfp_ray/doc_chunk_multiple_wf.py | 2 +- .../language/doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- .../language/doc_chunk/python/requirements.txt | 2 +- transforms/language/doc_chunk/ray/pyproject.toml | 2 +- .../doc_quality/kfp_ray/doc_quality_multiple_wf.py | 2 +- .../language/doc_quality/kfp_ray/doc_quality_wf.py | 2 +- .../language/doc_quality/python/pyproject.toml | 2 +- .../language/doc_quality/python/requirements.txt | 2 +- transforms/language/doc_quality/ray/pyproject.toml | 6 +++--- .../language/html2parquet/kfp_ray/html2parquet_wf.py | 2 +- .../language/html2parquet/python/pyproject.toml | 2 +- .../language/html2parquet/python/requirements.txt | 2 +- transforms/language/html2parquet/ray/pyproject.toml | 2 +- .../language/html2parquet/ray/requirements.txt | 4 ++-- .../language/lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- 
transforms/language/lang_id/kfp_ray/lang_id_wf.py | 2 +- transforms/language/lang_id/python/pyproject.toml | 2 +- transforms/language/lang_id/python/requirements.txt | 2 +- transforms/language/lang_id/ray/pyproject.toml | 6 +++--- .../pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py | 2 +- .../language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- .../language/pdf2parquet/python/requirements.txt | 2 +- transforms/language/pdf2parquet/ray/requirements.txt | 2 +- .../language/pii_redactor/python/requirements.txt | 2 +- transforms/language/pii_redactor/ray/pyproject.toml | 2 +- .../text_encoder/kfp_ray/text_encoder_multiple_wf.py | 2 +- .../language/text_encoder/kfp_ray/text_encoder_wf.py | 2 +- .../language/text_encoder/python/pyproject.toml | 2 +- .../language/text_encoder/python/requirements.txt | 2 +- transforms/language/text_encoder/ray/pyproject.toml | 6 +++--- transforms/pyproject.toml | 2 +- transforms/requirements-ray.txt | 2 +- transforms/requirements.txt | 2 +- transforms/transform.config | 8 -------- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- transforms/universal/doc_id/python/pyproject.toml | 2 +- transforms/universal/doc_id/python/requirements.txt | 2 +- transforms/universal/doc_id/ray/pyproject.toml | 6 +++--- transforms/universal/doc_id/spark/pyproject.toml | 4 ++-- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- transforms/universal/ededup/python/pyproject.toml | 2 +- transforms/universal/ededup/python/requirements.txt | 2 +- transforms/universal/ededup/ray/pyproject.toml | 6 +++--- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- transforms/universal/fdedup/ray/pyproject.toml | 4 ++-- transforms/universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/filter/python/pyproject.toml | 2 +- transforms/universal/filter/python/requirements.txt | 2 +- transforms/universal/filter/ray/pyproject.toml | 6 +++--- transforms/universal/filter/spark/pyproject.toml | 4 ++-- transforms/universal/hap/kfp_ray.disable/hap_wf.py | 2 +- transforms/universal/hap/python/pyproject.toml | 2 +- transforms/universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/pyproject.toml | 2 +- transforms/universal/hap/ray/requirements.txt | 4 ++-- .../universal/noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- transforms/universal/noop/python/pyproject.toml | 4 ++-- transforms/universal/noop/ray/pyproject.toml | 6 +++--- transforms/universal/noop/spark/pyproject.toml | 6 +++--- transforms/universal/profiler/kfp_ray/profiler_wf.py | 2 +- transforms/universal/profiler/python/pyproject.toml | 2 +- .../universal/profiler/python/requirements.txt | 2 +- transforms/universal/profiler/ray/pyproject.toml | 6 +++--- transforms/universal/profiler/spark/pyproject.toml | 6 +++--- transforms/universal/resize/kfp_ray/resize_wf.py | 2 +- transforms/universal/resize/python/pyproject.toml | 2 +- transforms/universal/resize/python/requirements.txt | 2 +- transforms/universal/resize/ray/pyproject.toml | 6 +++--- transforms/universal/resize/spark/pyproject.toml | 6 +++--- .../tokenization/kfp_ray/tokenization_wf.py | 2 +- .../universal/tokenization/python/pyproject.toml | 2 +- .../universal/tokenization/python/requirements.txt | 2 +- transforms/universal/tokenization/ray/pyproject.toml | 6 +++--- transforms/universal/web2parquet/requirements.txt | 4 ++-- 115 files changed, 176 insertions(+), 176 deletions(-) diff --git a/.make.versions b/.make.versions index ed36fe8c8..e3a8e8239 100644 --- a/.make.versions +++ b/.make.versions @@ -19,7 +19,7 
@@ DPK_MINOR_VERSION=2 DPK_MICRO_VERSION=2 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. -DPK_VERSION_SUFFIX=.dev2 +DPK_VERSION_SUFFIX= DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) @@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION) KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION) KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION) -DPK_CONNECTOR_VERSION=0.2.3.dev0 +DPK_CONNECTOR_VERSION=0.2.3 ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. @@ -59,3 +59,11 @@ else WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support endif +################################################################################ +# This defines the transforms' package version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +TRANSFORMS_PKG_VERSION=0.2.2 diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml index 4fcc97ed9..d3d213946 100644 --- a/data-connector-lib/pyproject.toml +++ b/data-connector-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_connector" -version = "0.2.3.dev1" +version = "0.2.3" requires-python = ">=3.10,<3.13" keywords = [ "data", diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 2e827ea82..36e4e155f 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.2.dev2" +version = "0.2.2" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Ray and Python" diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml index 89b4d9bf8..c0be43920 100644 --- a/data-processing-lib/spark/pyproject.toml +++ b/data-processing-lib/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_spark" -version = "0.2.2.dev2" +version = "0.2.2" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Spark" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit==0.2.2", "pyspark>=3.5.2", "psutil>=6.0.0", "PyYAML>=6.0.2" diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml index 30b0b66d8..78976a97c 100644 --- a/kfp/kfp_ray_components/createRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them.
# Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index 44e199c47..c75554d5f 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -9,7 +9,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 7ab517bff..2e02c3adf 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 9b98912f0..37c0198bf 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index 6b261a003..ec82e9484 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index d7058f2ae..daa903aaf 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.2.dev2", + "data-prep-toolkit-kfp-shared==0.2.2", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 04b6bc7a2..61f54663f 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.2.dev2", + "data-prep-toolkit-kfp-shared==0.2.2", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index df27ad1cf..3ba7491bc 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index f3f491e4b..3e5f262b9 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index 5e6f41bb2..d4f8c11cf 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index bbb84b749..4a217ff8c 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index 15a4be4c1..98b2e3a65 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { 
name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk-code2parquet-transform-python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", + "dpk-code2parquet-transform-python==0.2.2", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index 492603d54..d3c2c2196 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 8608c6d6e..31509b291 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 933152e3f..0c9457efc 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-code-profiler-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 6a4ccec1b..7f5aa9768 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 5f201c8ae..d7b452d6b 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index 0bd936ef2..a50ddff5c 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml 
b/transforms/code/code_quality/ray/pyproject.toml index 290429f95..ea6aad8ae 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-code-quality-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 9bb315569..5049a9c11 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index ecaf4d7bb..2dadeaf02 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index c2d0d8793..fd3fc0de4 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index adff71cfc..471ce1d5e 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-header-cleanser-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 7dba0d9d1..9bdcc6e96 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = 
"quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 30f2f001e..b445c6b09 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 368287e5d..880c7c2c7 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.2 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index 815121787..b2c56e940 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-license-select-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index bede80b88..89eb9d730 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index 22d92fd8c..2a7d1a5b9 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit==0.2.2", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 791b8d253..36901b88c 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description 
= "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-malware-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 11f001bfa..bb114e3d6 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index 186198d83..e20a62f7c 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 368287e5d..880c7c2c7 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.2 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index bf3e5f9f4..d2e820d99 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-proglang-select-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 38a829fab..fa739bfd0 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "repo_level_order_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml 
b/transforms/code/repo_level_ordering/ray/pyproject.toml index 80440a362..5fb561d67 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 7e30ee8b8..1fd927356 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 387c3bda7..e128df8b0 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index c24d0c3e2..144688f63 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 29b594fac..ed8f5d60b 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index 436d93ff3..f103b7269 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git 
a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index f39fd7e39..0ca4fb865 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 72406e945..f3abe0337 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index 2993d6b12..de76cb006 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index dc13d5f94..c1433d29b 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-doc_quality-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 4eb8b9de1..4eaef2fea 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "html2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index dfd0c3928..af6b64763 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt 
index af6ffe1e5..432362451 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index 873883e49..859706621 100644 --- a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index 151d05a3e..7e543b153 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,3 @@ -dpk-html2parquet-transform-python==0.2.2.dev2 -data-prep-toolkit[ray]==0.2.2.dev2 +dpk-html2parquet-transform-python==0.2.2 +data-prep-toolkit[ray]==0.2.2 trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index a89c54ab3..e853c2328 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 2ac84645d..5aed719c5 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index c5de6826a..43650a50a 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index a405f7afc..2cd053cfb 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index ac45a167e..6347bda71 100644 --- 
a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-lang_id-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 8992f1145..56e881b5e 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index c9cdbf652..395918ac3 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 2912af252..4d09ff394 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 2b414c59e..abec5044d 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]==0.2.2.dev2 +data-prep-toolkit[ray]==0.2.2 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 958210865..1fb9c95b9 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index b96f16615..b98b2c9af 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -11,7 +11,7 @@ authors = 
[ ] dependencies = [ "dpk_pii_redactor_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index e522737a1..bad5e24cd 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index f88fe9eef..5c762c2a1 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 87dad3c1c..62182b27b 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 2eb79e69b..32bf83692 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index ef08f697a..c6d49701b 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-text_encoder-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 3c1f64c32..3b853cbe7 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2.dev3" +version = "0.2.2" requires-python = ">=3.10,<3.13" keywords = 
["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 9012f685b..11d0decf5 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.2.dev2 +data-prep-toolkit[ray]>=0.2.2 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 8b48a970f..7317d33e3 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2.dev2 +data-prep-toolkit>=0.2.2 diff --git a/transforms/transform.config b/transforms/transform.config index c226171c6..7bafba684 100644 --- a/transforms/transform.config +++ b/transforms/transform.config @@ -7,11 +7,3 @@ # expected files and is used to define the transform's image name. TRANSFORM_NAME=data-prep-kit-transforms -################################################################################ -# This defines the transforms' package version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -TRANSFORMS_PKG_VERSION=0.2.2.dev2 diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index f41231159..7e1bd0b8e 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 0e2658087..a9e69f0bf 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index 368287e5d..880c7c2c7 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.2 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index 5a5941155..fc6a37b19 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - 
"dpk_doc_id_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk_doc_id_transform_python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 36f345c09..f50d4f70d 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2.dev2", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index ab46daadb..d878bd3e2 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 735104f20..67fd0f758 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index 75baaef62..45b4cfd50 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 9e3885e50..d74fa0637 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_ededup_transform_python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", + "dpk_ededup_transform_python==0.2.2", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 3156ab6f1..da431d030 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component 
specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 923cbdf82..7c59dcff9 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index b856b1007..4b122d98f 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index 64f148799..8e9bb2366 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 9f1feff29..5e3e783c8 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index a794a1a0b..a8ec7bb4d 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-filter-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 7b60dba46..85403487a 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - 
"data-prep-toolkit[spark]==0.2.2.dev2", + "data-prep-toolkit[spark]==0.2.2", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/kfp_ray.disable/hap_wf.py b/transforms/universal/hap/kfp_ray.disable/hap_wf.py index 786011d4d..8069ec181 100644 --- a/transforms/universal/hap/kfp_ray.disable/hap_wf.py +++ b/transforms/universal/hap/kfp_ray.disable/hap_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index 389788363..7b30dd72e 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index ba8948477..07c5f854a 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index abbb1a30c..6518e5277 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 34e1d6932..119167ca2 100644 --- a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.2.dev2 -dpk-hap-transform-python==0.2.2.dev2 +data-prep-toolkit[ray]==0.2.2 +dpk-hap-transform-python==0.2.2 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 3b102d205..737b60121 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index e8125328b..9dbdaf3b0 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp 
component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index 998161e31..e8c089ef0 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit==0.2.2", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index 5d475fe12..19fe77560 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-noop-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index f867fb070..495d827a0 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-noop-transform-python==0.2.2", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 914637895..ee6323d74 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "profiler_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 95775e3a6..117be53c0 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index 89801e4ad..fee352d4a 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ 
b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index 6060653fa..c9f1b1da3 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_profiler_transform_python==0.2.2.dev2", + "data-prep-toolkit[ray]==0.2.2", + "dpk_profiler_transform_python==0.2.2", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 455684b4f..05602dc26 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-profiler-transform-python==0.2.2", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 0724ed731..0a9be8e95 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "resize_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 082f37f0c..836388694 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 368287e5d..880c7c2c7 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit==0.2.2 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 1490303bb..4f7603f6f 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = 
">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-resize-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml b/transforms/universal/resize/spark/pyproject.toml index 538c12d20..c8bb67111 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-resize-transform-python==0.2.2", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index c131d11ea..243cac6be 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -23,7 +23,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files # path to kfp component specifications files diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index bc352f0fd..021a1427f 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index 5e00dbaa1..afd567d8b 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit==0.2.2 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index 095cb63e0..3cc4bcf80 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-tokenization-transform-python==0.2.2", + 
"data-prep-toolkit[ray]==0.2.2", ] [build-system] diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index 5c989591d..dfb74a6ca 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2.dev2 -data_prep_connector>=0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 +data_prep_connector>=0.2.3 \ No newline at end of file From 0587637771e36cbb099465fc28e6d388c1bc9b8e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 25 Nov 2024 20:18:16 -0500 Subject: [PATCH 102/105] update release notes Signed-off-by: Maroun Touma --- release-notes.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/release-notes.md b/release-notes.md index 15f23c542..4b7b8d553 100644 --- a/release-notes.md +++ b/release-notes.md @@ -1,5 +1,42 @@ # Data Prep Kit Release notes +## Release 0.2.2 - 11/25/2024 + +### General +1. Update RAG example to use granite model +1. Updated transforms with Docling 2 +1. Added single package for dpk with extra for \[spark\] and \[ray\] +1. Added single package for transforms with extra for \[all\] or \[individual-transform-name\] + + +### data-prep-toolkit libraries (python, ray, spark) + +1. Fix metadata logging even when actors crash +1. Add multilock for ray workers downloads/cleanup +1. Multiple updates to spark runtime +1. Added support for python 3.12 +1. refactoring of data access code + + +### KFP Workloads + +1. Modify superpipeline params type Str/json +1. Set kuberay apiserver version +1. Add Super pipeline for code transforms + + +### Transforms + +1. Enhance pdf2parquet with docling2 support for extracting HTML, DOCS, etc. +1. Added web2parquet transform +1. Added HAP transform + +### HTTP Connector 0.2.3 + +1. Enhanced parameter/configuration allows the user to customize crawler settings +1. 
implement subdomain focus feature in data-prep-connector + + ## Release 0.2.2- HTTP Connector Module - 10/23/2024 ### General From a067e55e13fede9dbb30cccc2a74b4b441a961e7 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 25 Nov 2024 21:41:15 -0500 Subject: [PATCH 103/105] Setup dev after new release Signed-off-by: Maroun Touma --- .make.versions | 8 ++++---- data-connector-lib/pyproject.toml | 2 +- data-processing-lib/pyproject.toml | 2 +- data-processing-lib/spark/pyproject.toml | 4 ++-- kfp/kfp_ray_components/createRayClusterComponent.yaml | 2 +- kfp/kfp_ray_components/deleteRayClusterComponent.yaml | 2 +- kfp/kfp_ray_components/executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- kfp/kfp_ray_components/executeSubWorkflowComponent.yaml | 2 +- .../kfp_v1_workflow_support/pyproject.toml | 4 ++-- .../kfp_v2_workflow_support/pyproject.toml | 4 ++-- .../shared_workflow_support/pyproject.toml | 4 ++-- transforms/code/code2parquet/kfp_ray/code2parquet_wf.py | 2 +- transforms/code/code2parquet/python/pyproject.toml | 2 +- transforms/code/code2parquet/python/requirements.txt | 2 +- transforms/code/code2parquet/ray/pyproject.toml | 6 +++--- transforms/code/code_profiler/python/pyproject.toml | 2 +- transforms/code/code_profiler/python/requirements.txt | 2 +- transforms/code/code_profiler/ray/pyproject.toml | 6 +++--- transforms/code/code_quality/kfp_ray/code_quality_wf.py | 2 +- transforms/code/code_quality/python/pyproject.toml | 2 +- transforms/code/code_quality/python/requirements.txt | 2 +- transforms/code/code_quality/ray/pyproject.toml | 6 +++--- .../code/header_cleanser/kfp_ray/header_cleanser_wf.py | 2 +- transforms/code/header_cleanser/python/pyproject.toml | 2 +- transforms/code/header_cleanser/python/requirements.txt | 2 +- transforms/code/header_cleanser/ray/pyproject.toml | 6 +++--- .../code/license_select/kfp_ray/license_select_wf.py | 2 +- transforms/code/license_select/python/pyproject.toml | 2 +- transforms/code/license_select/python/requirements.txt | 2 +- transforms/code/license_select/ray/pyproject.toml | 6 +++--- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- transforms/code/malware/python/pyproject.toml | 4 ++-- transforms/code/malware/ray/pyproject.toml | 6 +++--- .../code/proglang_select/kfp_ray/proglang_select_wf.py | 2 +- transforms/code/proglang_select/python/pyproject.toml | 2 +- transforms/code/proglang_select/python/requirements.txt | 2 +- transforms/code/proglang_select/ray/pyproject.toml | 6 +++--- .../repo_level_ordering/kfp_ray/repo_level_order_wf.py | 2 +- transforms/code/repo_level_ordering/ray/pyproject.toml | 4 ++-- .../language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py | 2 +- transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- transforms/language/doc_chunk/python/requirements.txt | 2 +- transforms/language/doc_chunk/ray/pyproject.toml | 2 +- .../doc_quality/kfp_ray/doc_quality_multiple_wf.py | 2 +- transforms/language/doc_quality/kfp_ray/doc_quality_wf.py | 2 +- transforms/language/doc_quality/python/pyproject.toml | 2 +- transforms/language/doc_quality/python/requirements.txt | 2 +- transforms/language/doc_quality/ray/pyproject.toml | 6 +++--- .../language/html2parquet/kfp_ray/html2parquet_wf.py | 2 +- transforms/language/html2parquet/python/pyproject.toml | 2 +- transforms/language/html2parquet/python/requirements.txt | 2 +- transforms/language/html2parquet/ray/pyproject.toml | 2 +- transforms/language/html2parquet/ray/requirements.txt | 4 ++-- .../language/lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- 
transforms/language/lang_id/kfp_ray/lang_id_wf.py | 2 +- transforms/language/lang_id/python/pyproject.toml | 2 +- transforms/language/lang_id/python/requirements.txt | 2 +- transforms/language/lang_id/ray/pyproject.toml | 6 +++--- .../pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py | 2 +- transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- transforms/language/pdf2parquet/python/requirements.txt | 2 +- transforms/language/pdf2parquet/ray/requirements.txt | 2 +- transforms/language/pii_redactor/python/requirements.txt | 2 +- transforms/language/pii_redactor/ray/pyproject.toml | 6 +++--- .../text_encoder/kfp_ray/text_encoder_multiple_wf.py | 2 +- .../language/text_encoder/kfp_ray/text_encoder_wf.py | 2 +- transforms/language/text_encoder/python/pyproject.toml | 2 +- transforms/language/text_encoder/python/requirements.txt | 2 +- transforms/language/text_encoder/ray/pyproject.toml | 6 +++--- transforms/pyproject.toml | 2 +- transforms/requirements-ray.txt | 2 +- transforms/requirements.txt | 2 +- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- transforms/universal/doc_id/python/pyproject.toml | 2 +- transforms/universal/doc_id/python/requirements.txt | 2 +- transforms/universal/doc_id/ray/pyproject.toml | 6 +++--- transforms/universal/doc_id/spark/pyproject.toml | 4 ++-- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- transforms/universal/ededup/python/pyproject.toml | 2 +- transforms/universal/ededup/python/requirements.txt | 2 +- transforms/universal/ededup/ray/pyproject.toml | 6 +++--- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- transforms/universal/fdedup/ray/pyproject.toml | 4 ++-- transforms/universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/filter/python/pyproject.toml | 2 +- transforms/universal/filter/python/requirements.txt | 2 +- transforms/universal/filter/ray/pyproject.toml | 6 +++--- transforms/universal/filter/spark/pyproject.toml | 4 ++-- transforms/universal/hap/kfp_ray.disable/hap_wf.py | 2 +- transforms/universal/hap/python/pyproject.toml | 2 +- transforms/universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/pyproject.toml | 2 +- transforms/universal/hap/ray/requirements.txt | 4 ++-- transforms/universal/noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- transforms/universal/noop/python/pyproject.toml | 4 ++-- transforms/universal/noop/ray/pyproject.toml | 6 +++--- transforms/universal/noop/spark/pyproject.toml | 6 +++--- transforms/universal/profiler/kfp_ray/profiler_wf.py | 2 +- transforms/universal/profiler/python/pyproject.toml | 2 +- transforms/universal/profiler/python/requirements.txt | 2 +- transforms/universal/profiler/ray/pyproject.toml | 6 +++--- transforms/universal/profiler/spark/pyproject.toml | 6 +++--- transforms/universal/resize/kfp_ray/resize_wf.py | 2 +- transforms/universal/resize/python/pyproject.toml | 2 +- transforms/universal/resize/python/requirements.txt | 2 +- transforms/universal/resize/ray/pyproject.toml | 6 +++--- transforms/universal/resize/spark/pyproject.toml | 6 +++--- .../universal/tokenization/kfp_ray/tokenization_wf.py | 2 +- transforms/universal/tokenization/python/pyproject.toml | 2 +- transforms/universal/tokenization/python/requirements.txt | 2 +- transforms/universal/tokenization/ray/pyproject.toml | 6 +++--- transforms/universal/web2parquet/requirements.txt | 2 +- 114 files changed, 171 insertions(+), 171 deletions(-) diff --git a/.make.versions b/.make.versions index e3a8e8239..bd01a60d7 100644 --- a/.make.versions 
+++ b/.make.versions
@@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0
 # The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release.
 DPK_MINOR_VERSION=2
 # The minor version is incremented AUTOMATICALLY by the release.sh script when a new release is set.
-DPK_MICRO_VERSION=2
+DPK_MICRO_VERSION=3
 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches.
 # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi.
-DPK_VERSION_SUFFIX=
+DPK_VERSION_SUFFIX=.dev0
 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX)
@@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION)
 KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION)
 KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION)
-DPK_CONNECTOR_VERSION=0.2.3
+DPK_CONNECTOR_VERSION=0.2.3.dev0
 ################## ################## ################## ################## ################## ##################
 # Begin versions that the repo depends on.
@@ -66,4 +66,4 @@ endif
 #
 # If you change the versions numbers, be sure to run "make set-versions" to
 # update version numbers across the transform (e.g., pyproject.toml).
-TRANSFORMS_PKG_VERSION=0.2.2
+TRANSFORMS_PKG_VERSION=0.2.3.dev0
diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml
index d3d213946..eaf459a07 100644
--- a/data-connector-lib/pyproject.toml
+++ b/data-connector-lib/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_connector"
-version = "0.2.3"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 keywords = [
     "data",
diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml
index 36e4e155f..40bf6b2a1 100644
--- a/data-processing-lib/pyproject.toml
+++ b/data-processing-lib/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.2.2"
+version = "0.2.3.dev0"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 requires-python = ">=3.10,<3.13"
 description = "Data Preparation Toolkit Library for Ray and Python"
diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml
index c0be43920..55c5a5e9e 100644
--- a/data-processing-lib/spark/pyproject.toml
+++ b/data-processing-lib/spark/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_spark"
-version = "0.2.2"
+version = "0.2.3.dev0"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 requires-python = ">=3.10,<3.13"
 description = "Data Preparation Toolkit Library for Spark"
@@ -11,7 +11,7 @@ authors = [
     { name = "Boris Lublinsky", email = "blublinsk@ibm.com" },
 ]
 dependencies = [
-    "data-prep-toolkit==0.2.2",
+    "data-prep-toolkit==0.2.3.dev0",
     "pyspark>=3.5.2",
     "psutil>=6.0.0",
     "PyYAML>=6.0.2"
diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml
index 78976a97c..30b0b66d8 100644
--- a/kfp/kfp_ray_components/createRayClusterComponent.yaml
+++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml
@@ -11,7 +11,7 @@ inputs:
 
 implementation:
   container:
-    image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2"
+    image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
    # command is a list of strings (command-line arguments).
# The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index c75554d5f..44e199c47 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -9,7 +9,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 2e02c3adf..7ab517bff 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 37c0198bf..9b98912f0 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index ec82e9484..6b261a003 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index daa903aaf..f09b2f32a 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 61f54663f..01c5b3e17 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index 3ba7491bc..aa7a6dd3a 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index 3e5f262b9..f3f491e4b 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index d4f8c11cf..be84b2f20 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index 4a217ff8c..cec7f9c5f 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index 98b2e3a65..d56fed1e8 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { 
name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", - "dpk-code2parquet-transform-python==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-code2parquet-transform-python==0.2.3.dev0", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index d3c2c2196..334c86fed 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 31509b291..27706b467 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 0c9457efc..9b760c1c3 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-code-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 7f5aa9768..6a4ccec1b 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index d7b452d6b..17cbce67d 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index a50ddff5c..ef627d39f 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml 
b/transforms/code/code_quality/ray/pyproject.toml index ea6aad8ae..eceee32ed 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-code-quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 5049a9c11..9bb315569 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index 2dadeaf02..3703ec55f 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index fd3fc0de4..915a462dc 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index 471ce1d5e..5fb1bcf26 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-header-cleanser-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 9bdcc6e96..7dba0d9d1 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = 
"quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index b445c6b09..3345d3a5a 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 880c7c2c7..2f67f6a80 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index b2c56e940..ce5979d62 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-license-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 89eb9d730..bede80b88 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index 2a7d1a5b9..a1bc05ab4 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2", + "data-prep-toolkit==0.2.3.dev0", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 36901b88c..659ee62ef 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description 
= "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-malware-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index bb114e3d6..11f001bfa 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index e20a62f7c..e5736a9c7 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 880c7c2c7..2f67f6a80 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index d2e820d99..d8288d189 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-proglang-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index fa739bfd0..38a829fab 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "repo_level_order_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml 
b/transforms/code/repo_level_ordering/ray/pyproject.toml index 5fb561d67..9581c8941 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 1fd927356..7e30ee8b8 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index e128df8b0..387c3bda7 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index 144688f63..207ab9249 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index ed8f5d60b..4fb356038 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index f103b7269..436d93ff3 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git 
a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index 0ca4fb865..f39fd7e39 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index f3abe0337..23538b8c7 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index de76cb006..4aa2d8111 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index c1433d29b..ec56ac2c7 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-doc_quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 4eaef2fea..4eb8b9de1 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "html2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index af6b64763..3a7a6efbc 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt 
index 432362451..f21e65774 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index 859706621..5e888748c 100644 --- a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index 7e543b153..9aa193432 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,3 @@ -dpk-html2parquet-transform-python==0.2.2 -data-prep-toolkit[ray]==0.2.2 +dpk-html2parquet-transform-python==0.2.3.dev0 +data-prep-toolkit[ray]==0.2.3.dev0 trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index e853c2328..a89c54ab3 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 5aed719c5..2ac84645d 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index 43650a50a..a69724a2d 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index 2cd053cfb..06bec1ab9 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index 6347bda71..dba929905 100644 --- 
a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-lang_id-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 56e881b5e..8992f1145 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 395918ac3..c9cdbf652 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 4d09ff394..310909164 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index abec5044d..34831cde8 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]==0.2.2 +data-prep-toolkit[ray]==0.2.3.dev0 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 1fb9c95b9..0abcc1d96 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index b98b2c9af..4549851d0 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] 
name = "dpk_pii_redactor_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2", + "dpk_pii_redactor_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index bad5e24cd..e522737a1 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 5c762c2a1..f88fe9eef 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 62182b27b..dc15beb6e 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 32bf83692..3ac880bba 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index c6d49701b..f1b2c09d5 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-text_encoder-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", 
] [build-system] diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 3b853cbe7..57a2908c2 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 11d0decf5..b0527bdd6 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.2 +data-prep-toolkit[ray]>=0.2.3.dev0 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 7317d33e3..934c95182 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 7e1bd0b8e..f41231159 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index a9e69f0bf..1a962662d 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index 880c7c2c7..2f67f6a80 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index fc6a37b19..da34dded3 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk_doc_id_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index f50d4f70d..369a1bb72 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ 
b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index d878bd3e2..ab46daadb 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 67fd0f758..da28e715f 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index 45b4cfd50..aa73a106a 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index d74fa0637..424e220fd 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", - "dpk_ededup_transform_python==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_ededup_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index da431d030..3156ab6f1 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 7c59dcff9..ee69ac81b 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" 
-version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 4b122d98f..b856b1007 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index 8e9bb2366..fcf0f6419 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 5e3e783c8..100626f60 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index a8ec7bb4d..64776e0c1 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-filter-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 85403487a..ef46c9a1b 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/kfp_ray.disable/hap_wf.py b/transforms/universal/hap/kfp_ray.disable/hap_wf.py index 8069ec181..786011d4d 100644 --- a/transforms/universal/hap/kfp_ray.disable/hap_wf.py +++ b/transforms/universal/hap/kfp_ray.disable/hap_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: 
str = "hap_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index 7b30dd72e..bf7c85577 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 07c5f854a..1250d1f77 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index 6518e5277..38e78938b 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 119167ca2..7c4c8eb94 100644 --- a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.2 -dpk-hap-transform-python==0.2.2 +data-prep-toolkit[ray]==0.2.3.dev0 +dpk-hap-transform-python==0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 737b60121..3b102d205 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 9dbdaf3b0..e8125328b 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index e8c089ef0..ff9a24244 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version 
= "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2", + "data-prep-toolkit==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index 19fe77560..da9327917 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index 495d827a0..d3cd47bf6 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2", - "data-prep-toolkit[spark]==0.2.2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index ee6323d74..914637895 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "profiler_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 117be53c0..39d9788f8 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index fee352d4a..526140ada 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index c9f1b1da3..ac8d729ec 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] 
name = "dpk_profiler_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2", - "dpk_profiler_transform_python==0.2.2", + "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_profiler_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 05602dc26..6ba790301 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.2", - "data-prep-toolkit[spark]==0.2.2", + "dpk-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 0a9be8e95..0724ed731 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "resize_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 836388694..6fdad69d0 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 880c7c2c7..2f67f6a80 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2 \ No newline at end of file +data-prep-toolkit==0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 4f7603f6f..c266a39f4 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml 
b/transforms/universal/resize/spark/pyproject.toml index c8bb67111..7de14c673 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2", - "data-prep-toolkit[spark]==0.2.2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 243cac6be..c131d11ea 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -23,7 +23,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files # path to kfp component specifications files diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index 021a1427f..dbb8e84ba 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index afd567d8b..8a1920162 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2 +data-prep-toolkit==0.2.3.dev0 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index 3cc4bcf80..c094b9e7e 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.2", - "data-prep-toolkit[ray]==0.2.2", + "dpk-tokenization-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index dfb74a6ca..1af3f12a4 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 data_prep_connector>=0.2.3 \ No newline at end of 
file From 0509fb1ad50a84939d4635b5953d82069e9a4a36 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 26 Nov 2024 04:59:46 -0500 Subject: [PATCH 104/105] bump up connector version after code release Signed-off-by: Maroun Touma --- .make.versions | 2 +- data-connector-lib/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.make.versions b/.make.versions index bd01a60d7..564caa214 100644 --- a/.make.versions +++ b/.make.versions @@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION) KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION) KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION) -DPK_CONNECTOR_VERSION=0.2.3.dev0 +DPK_CONNECTOR_VERSION=0.2.4.dev0 ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml index eaf459a07..69e914f0c 100644 --- a/data-connector-lib/pyproject.toml +++ b/data-connector-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_connector" -version = "0.2.3.dev0" +version = "0.2.4.dev0" requires-python = ">=3.10,<3.13" keywords = [ "data", From 7ae1f135ccc3ba7bb4cc4ff500b0e070b7d30b7b Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 26 Nov 2024 07:01:52 -0500 Subject: [PATCH 105/105] remove reference to noop transform project Signed-off-by: Maroun Touma --- data-processing-lib/pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 40bf6b2a1..a347a14a1 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -16,7 +16,6 @@ dynamic = ["dependencies", "optional-dependencies"] Repository = "https://github.com/IBM/data-prep-kit" Issues = "https://github.com/IBM/data-prep-kit/issues" Documentation = "https://ibm.github.io/data-prep-kit/doc" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
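
Note: the bumps above must leave every transform's pyproject.toml and requirements.txt agreeing on the same data-prep-toolkit version. Below is a minimal sketch of a consistency check one could run after such a sweep; it is illustrative only and not part of any patch above. The EXPECTED constant, the verify_pins.py name, and the transforms/ layout are assumptions inferred from the paths visible in these diffs.

# verify_pins.py -- illustrative sketch, not part of the patch series above.
# Scans requirements.txt and pyproject.toml files under transforms/ and
# reports any data-prep-toolkit pin that does not match the expected version.
import re
import sys
from pathlib import Path

EXPECTED = "0.2.3.dev0"  # assumed target version for this patch series
# Matches pins like data-prep-toolkit==X, data-prep-toolkit[ray]>=X, etc.
PIN = re.compile(r"data-prep-toolkit(?:\[[\w,]+\])?\s*[>=]=\s*([\w.]+)")

def check(root: Path) -> int:
    mismatches = 0
    for pattern in ("requirements.txt", "pyproject.toml"):
        for path in root.rglob(pattern):
            for match in PIN.finditer(path.read_text(encoding="utf-8")):
                if match.group(1) != EXPECTED:
                    print(f"{path}: pinned to {match.group(1)}, expected {EXPECTED}")
                    mismatches += 1
    return mismatches

if __name__ == "__main__":
    # Usage: python verify_pins.py [root-dir]; defaults to transforms/
    root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("transforms")
    sys.exit(1 if check(root) else 0)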