From 2bd246d31ef22c4d5e6cdb998e400d8a18773e90 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 13:43:46 -0500 Subject: [PATCH 1/6] first cut at refactoring fdedup as its own named dpk_ module Signed-off-by: Maroun Touma --- .../{python/Dockerfile => Dockerfile.python} | 23 +- transforms/universal/fdedup/Dockerfile.ray | 31 ++ transforms/universal/fdedup/Dockerfile.spark | 40 ++ transforms/universal/fdedup/Makefile | 86 +--- transforms/universal/fdedup/README.md | 396 +++++++++++++++++- .../{python/src => dpk_fdedup}/Murmur_MH.py | 0 .../cluster_analysis/local_python.py} | 2 +- .../ray}/cluster_estimator.py | 0 .../cluster_analysis/ray/local.py} | 2 +- .../cluster_analysis/ray/transform.py} | 2 +- .../cluster_analysis/spark/local.py} | 2 +- .../cluster_analysis/spark/transform.py} | 2 +- .../cluster_analysis/transform.py} | 2 +- .../cluster_analysis/transform_python.py} | 2 +- .../data_cleaning/local_python.py} | 4 +- .../data_cleaning/ray/local.py} | 4 +- .../data_cleaning/ray/transform.py} | 2 +- .../data_cleaning/spark/local.py} | 2 +- .../data_cleaning/spark/transform.py} | 2 +- .../data_cleaning/transform.py} | 0 .../data_cleaning/transform_python.py} | 2 +- .../get_duplicate_list/ray/transform.py} | 2 +- .../get_duplicate_list/transform.py} | 0 .../transform_local_python.py} | 2 +- .../get_duplicate_list/transform_python.py} | 2 +- .../ray/transform.py} | 12 +- .../signature_calc/local_python.py} | 2 +- .../signature_calc/ray/local.py} | 2 +- .../signature_calc/ray/transform.py} | 2 +- .../signature_calc/spark/local.py} | 2 +- .../signature_calc/spark/transform.py} | 2 +- .../signature_calc/transform.py} | 2 +- .../signature_calc/transform_python.py} | 2 +- .../spark/transform.py} | 0 .../transform_python.py} | 8 +- .../fdedup/{ray => }/images/fuzzy.png | Bin .../universal/fdedup/python/.dockerignore | 1 - transforms/universal/fdedup/python/Makefile | 64 --- transforms/universal/fdedup/python/README.md | 244 ----------- .../universal/fdedup/python/pyproject.toml | 45 -- transforms/universal/fdedup/ray/.dockerignore | 1 - transforms/universal/fdedup/ray/.gitignore | 38 -- transforms/universal/fdedup/ray/Dockerfile | 51 --- transforms/universal/fdedup/ray/Makefile | 68 --- transforms/universal/fdedup/ray/README.md | 71 ---- .../universal/fdedup/ray/pyproject.toml | 45 -- .../universal/fdedup/ray/requirements.txt | 6 - .../fdedup/{python => }/requirements.txt | 1 - transforms/universal/fdedup/spark/Dockerfile | 51 --- transforms/universal/fdedup/spark/Makefile | 57 --- transforms/universal/fdedup/spark/README.md | 2 +- ...equirements.txt => requirements-spark.txt} | 0 .../universal/fdedup/spark/requirements.txt | 11 - .../expected/cleaned/data_1/df1.parquet | Bin .../expected/cleaned/data_2/df2.parquet | Bin .../test-data/expected/cleaned/metadata.json | 0 .../docs_to_remove/band_0_segment_0.parquet | Bin .../docs_to_remove/band_0_segment_1.parquet | Bin .../docs_to_remove/band_10_segment_0.parquet | Bin .../docs_to_remove/band_10_segment_1.parquet | Bin .../docs_to_remove/band_11_segment_0.parquet | Bin .../docs_to_remove/band_11_segment_1.parquet | Bin .../docs_to_remove/band_12_segment_0.parquet | Bin .../docs_to_remove/band_12_segment_1.parquet | Bin .../docs_to_remove/band_13_segment_0.parquet | Bin .../docs_to_remove/band_13_segment_1.parquet | Bin .../docs_to_remove/band_1_segment_0.parquet | Bin .../docs_to_remove/band_1_segment_1.parquet | Bin .../docs_to_remove/band_2_segment_0.parquet | Bin .../docs_to_remove/band_2_segment_1.parquet | Bin 
.../docs_to_remove/band_3_segment_0.parquet | Bin .../docs_to_remove/band_3_segment_1.parquet | Bin .../docs_to_remove/band_4_segment_0.parquet | Bin .../docs_to_remove/band_4_segment_1.parquet | Bin .../docs_to_remove/band_5_segment_0.parquet | Bin .../docs_to_remove/band_5_segment_1.parquet | Bin .../docs_to_remove/band_6_segment_0.parquet | Bin .../docs_to_remove/band_6_segment_1.parquet | Bin .../docs_to_remove/band_7_segment_0.parquet | Bin .../docs_to_remove/band_7_segment_1.parquet | Bin .../docs_to_remove/band_8_segment_0.parquet | Bin .../docs_to_remove/band_8_segment_1.parquet | Bin .../docs_to_remove/band_9_segment_0.parquet | Bin .../docs_to_remove/band_9_segment_1.parquet | Bin .../docs_to_remove/metadata.json | 0 .../data_cleaning/cleaned/data_1/df1.parquet | Bin .../data_cleaning/cleaned/data_2/df2.parquet | Bin .../data_cleaning/cleaned/metadata.json | 0 .../docs_to_remove_consolidated.parquet | Bin .../docs_to_remove_consolidated.parquet | Bin .../expected/get_list_transform/metadata.json | 0 .../test-data/expected/metadata.json | 0 .../bands/band=0/segment=0/data_2/df2.parquet | Bin .../bands/band=0/segment=1/data_2/df2.parquet | Bin .../bands/band=1/segment=0/data_2/df2.parquet | Bin .../bands/band=1/segment=1/data_2/df2.parquet | Bin .../band=10/segment=0/data_2/df2.parquet | Bin .../band=10/segment=1/data_2/df2.parquet | Bin .../band=11/segment=0/data_2/df2.parquet | Bin .../band=11/segment=1/data_2/df2.parquet | Bin .../band=12/segment=0/data_2/df2.parquet | Bin .../band=12/segment=1/data_2/df2.parquet | Bin .../band=13/segment=0/data_2/df2.parquet | Bin .../band=13/segment=1/data_2/df2.parquet | Bin .../bands/band=2/segment=0/data_2/df2.parquet | Bin .../bands/band=2/segment=1/data_2/df2.parquet | Bin .../bands/band=3/segment=0/data_2/df2.parquet | Bin .../bands/band=3/segment=1/data_2/df2.parquet | Bin .../bands/band=4/segment=0/data_2/df2.parquet | Bin .../bands/band=4/segment=1/data_2/df2.parquet | Bin .../bands/band=5/segment=0/data_2/df2.parquet | Bin .../bands/band=5/segment=1/data_2/df2.parquet | Bin .../bands/band=6/segment=0/data_2/df2.parquet | Bin .../bands/band=6/segment=1/data_2/df2.parquet | Bin .../bands/band=7/segment=0/data_2/df2.parquet | Bin .../bands/band=7/segment=1/data_2/df2.parquet | Bin .../bands/band=8/segment=0/data_2/df2.parquet | Bin .../bands/band=8/segment=1/data_2/df2.parquet | Bin .../bands/band=9/segment=0/data_2/df2.parquet | Bin .../bands/band=9/segment=1/data_2/df2.parquet | Bin .../expected/signature_calc/metadata.json | 0 .../test-data/input/data_1/df1.parquet | Bin .../test-data/input/data_2/df2.parquet | Bin .../test_cluster_analysis_transform_python.py | 4 +- .../test_cluster_analysis_transform_ray.py | 6 +- .../test_cluster_analysis_transform_spark.py | 6 +- .../test_data_cleaning_transform_python.py | 4 +- .../test/test_data_cleaning_transform_ray.py | 6 +- .../test_data_cleaning_transform_spark.py | 6 +- ...est_get_duplicate_list_transform_python.py | 4 +- .../test_get_duplicate_list_transform_ray.py | 6 +- ...test_get_duplicate_list_transform_spark.py | 6 +- .../test_signature_calc_transform_python.py | 2 +- .../test/test_signature_calc_transform_ray.py | 6 +- .../test_signature_calc_transform_spark.py | 4 +- 135 files changed, 546 insertions(+), 916 deletions(-) rename transforms/universal/fdedup/{python/Dockerfile => Dockerfile.python} (57%) create mode 100644 transforms/universal/fdedup/Dockerfile.ray create mode 100644 transforms/universal/fdedup/Dockerfile.spark rename transforms/universal/fdedup/{python/src => 
dpk_fdedup}/Murmur_MH.py (100%) rename transforms/universal/fdedup/{python/src/cluster_analysis_local_python.py => dpk_fdedup/cluster_analysis/local_python.py} (97%) rename transforms/universal/fdedup/{ray/src => dpk_fdedup/cluster_analysis/ray}/cluster_estimator.py (100%) rename transforms/universal/fdedup/{ray/src/cluster_analysis_local_ray.py => dpk_fdedup/cluster_analysis/ray/local.py} (95%) rename transforms/universal/fdedup/{ray/src/cluster_analysis_transform_ray.py => dpk_fdedup/cluster_analysis/ray/transform.py} (97%) rename transforms/universal/fdedup/{spark/src/cluster_analysis_local_spark.py => dpk_fdedup/cluster_analysis/spark/local.py} (95%) rename transforms/universal/fdedup/{spark/src/cluster_analysis_transform_spark.py => dpk_fdedup/cluster_analysis/spark/transform.py} (98%) rename transforms/universal/fdedup/{python/src/cluster_analysis_transform.py => dpk_fdedup/cluster_analysis/transform.py} (99%) rename transforms/universal/fdedup/{python/src/cluster_analysis_transform_python.py => dpk_fdedup/cluster_analysis/transform_python.py} (98%) rename transforms/universal/fdedup/{python/src/data_cleaning_local_python.py => dpk_fdedup/data_cleaning/local_python.py} (93%) rename transforms/universal/fdedup/{ray/src/data_cleaning_local_ray.py => dpk_fdedup/data_cleaning/ray/local.py} (93%) rename transforms/universal/fdedup/{ray/src/data_cleaning_transform_ray.py => dpk_fdedup/data_cleaning/ray/transform.py} (99%) rename transforms/universal/fdedup/{spark/src/data_cleaning_local_spark.py => dpk_fdedup/data_cleaning/spark/local.py} (94%) rename transforms/universal/fdedup/{spark/src/data_cleaning_transform_spark.py => dpk_fdedup/data_cleaning/spark/transform.py} (99%) rename transforms/universal/fdedup/{python/src/data_cleaning_transform.py => dpk_fdedup/data_cleaning/transform.py} (100%) rename transforms/universal/fdedup/{python/src/data_cleaning_transform_python.py => dpk_fdedup/data_cleaning/transform_python.py} (98%) rename transforms/universal/fdedup/{ray/src/get_duplicate_list_transform_ray.py => dpk_fdedup/get_duplicate_list/ray/transform.py} (97%) rename transforms/universal/fdedup/{python/src/get_duplicate_list_transform.py => dpk_fdedup/get_duplicate_list/transform.py} (100%) rename transforms/universal/fdedup/{python/src/get_duplicate_list_transform_local_python.py => dpk_fdedup/get_duplicate_list/transform_local_python.py} (96%) rename transforms/universal/fdedup/{python/src/get_duplicate_list_transform_python.py => dpk_fdedup/get_duplicate_list/transform_python.py} (97%) rename transforms/universal/fdedup/{ray/src/fdedup_transform_ray.py => dpk_fdedup/ray/transform.py} (85%) rename transforms/universal/fdedup/{python/src/signature_calc_local_python.py => dpk_fdedup/signature_calc/local_python.py} (97%) rename transforms/universal/fdedup/{ray/src/signature_calc_local_ray.py => dpk_fdedup/signature_calc/ray/local.py} (95%) rename transforms/universal/fdedup/{ray/src/signature_calc_transform_ray.py => dpk_fdedup/signature_calc/ray/transform.py} (94%) rename transforms/universal/fdedup/{spark/src/signature_calc_local_spark.py => dpk_fdedup/signature_calc/spark/local.py} (97%) rename transforms/universal/fdedup/{spark/src/signature_calc_transform_spark.py => dpk_fdedup/signature_calc/spark/transform.py} (94%) rename transforms/universal/fdedup/{python/src/signature_calc_transform.py => dpk_fdedup/signature_calc/transform.py} (99%) rename transforms/universal/fdedup/{python/src/signature_calc_transform_python.py => dpk_fdedup/signature_calc/transform_python.py} (94%) rename 
transforms/universal/fdedup/{spark/src/fdedup_transform_spark.py => dpk_fdedup/spark/transform.py} (100%) rename transforms/universal/fdedup/{python/src/fdedup_transform_python.py => dpk_fdedup/transform_python.py} (98%) rename transforms/universal/fdedup/{ray => }/images/fuzzy.png (100%) delete mode 100644 transforms/universal/fdedup/python/.dockerignore delete mode 100644 transforms/universal/fdedup/python/Makefile delete mode 100644 transforms/universal/fdedup/python/README.md delete mode 100644 transforms/universal/fdedup/python/pyproject.toml delete mode 100644 transforms/universal/fdedup/ray/.dockerignore delete mode 100644 transforms/universal/fdedup/ray/.gitignore delete mode 100644 transforms/universal/fdedup/ray/Dockerfile delete mode 100644 transforms/universal/fdedup/ray/Makefile delete mode 100644 transforms/universal/fdedup/ray/README.md delete mode 100644 transforms/universal/fdedup/ray/pyproject.toml delete mode 100644 transforms/universal/fdedup/ray/requirements.txt rename transforms/universal/fdedup/{python => }/requirements.txt (85%) delete mode 100644 transforms/universal/fdedup/spark/Dockerfile delete mode 100644 transforms/universal/fdedup/spark/Makefile rename transforms/universal/fdedup/spark/{src/requirements.txt => requirements-spark.txt} (100%) delete mode 100644 transforms/universal/fdedup/spark/requirements.txt rename transforms/universal/fdedup/{python => }/test-data/expected/cleaned/data_1/df1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cleaned/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cleaned/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => 
}/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/data_cleaning/cleaned/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/get_list_transform/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet (100%) rename 
transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/input/data_1/df1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/input/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test/test_cluster_analysis_transform_python.py (93%) rename transforms/universal/fdedup/{ray => }/test/test_cluster_analysis_transform_ray.py (91%) rename transforms/universal/fdedup/{spark => }/test/test_cluster_analysis_transform_spark.py (89%) rename transforms/universal/fdedup/{python => }/test/test_data_cleaning_transform_python.py 
(93%) rename transforms/universal/fdedup/{ray => }/test/test_data_cleaning_transform_ray.py (93%) rename transforms/universal/fdedup/{spark => }/test/test_data_cleaning_transform_spark.py (92%) rename transforms/universal/fdedup/{python => }/test/test_get_duplicate_list_transform_python.py (92%) rename transforms/universal/fdedup/{ray => }/test/test_get_duplicate_list_transform_ray.py (89%) rename transforms/universal/fdedup/{spark => }/test/test_get_duplicate_list_transform_spark.py (91%) rename transforms/universal/fdedup/{python => }/test/test_signature_calc_transform_python.py (96%) rename transforms/universal/fdedup/{ray => }/test/test_signature_calc_transform_ray.py (90%) rename transforms/universal/fdedup/{spark => }/test/test_signature_calc_transform_spark.py (95%) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/Dockerfile.python similarity index 57% rename from transforms/universal/fdedup/python/Dockerfile rename to transforms/universal/fdedup/Dockerfile.python index 79c85e4ac..1a53451d5 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/Dockerfile.python @@ -2,34 +2,21 @@ FROM docker.io/python:3.10.14-slim-bullseye RUN pip install --upgrade --no-cache-dir pip -# install pytest -RUN pip install --no-cache-dir pytest -ARG DPK_WHEEL_FILE_NAME - # Create a user and use it to run the transform RUN useradd -ms /bin/bash dpk USER dpk WORKDIR /home/dpk +ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). COPY --chown=dpk:root data-processing-dist data-processing-dist -RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml -COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root dpk_fdedup/ dpk_fdedup/ COPY --chown=dpk:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt -RUN pip install --no-cache-dir -e . - -# copy source data -COPY ./src/fdedup_transform_python.py fdedup_transform_python.py -COPY ./src/fdedup_transform_python.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ # Set environment ENV PYTHONPATH /home/dpk @@ -38,4 +25,4 @@ ENV PYTHONPATH /home/dpk ARG BUILD_DATE ARG GIT_COMMIT LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT +LABEL git-commit=$GIT_COMMIT \ No newline at end of file diff --git a/transforms/universal/fdedup/Dockerfile.ray b/transforms/universal/fdedup/Dockerfile.ray new file mode 100644 index 000000000..379e45bad --- /dev/null +++ b/transforms/universal/fdedup/Dockerfile.ray @@ -0,0 +1,31 @@ +ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 + +FROM ${BASE_IMAGE} + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). 
+COPY --chown=ray:users data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] + +## Copy the python version of the transform +COPY --chown=ray:users dpk_fdedup/ dpk_fdedup/ +COPY --chown=ray:users requirements.txt requirements.txt +RUN pip install -r requirements.txt + +# Grant non-root users the necessary permissions to the ray directory +RUN chmod 755 /home/ray + +# Set environment +ENV PYTHONPATH /home/ray + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT \ No newline at end of file diff --git a/transforms/universal/fdedup/Dockerfile.spark b/transforms/universal/fdedup/Dockerfile.spark new file mode 100644 index 000000000..d228b6c2d --- /dev/null +++ b/transforms/universal/fdedup/Dockerfile.spark @@ -0,0 +1,40 @@ +FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest + +USER root +# install pytest +RUN pip install --no-cache-dir pytest + +WORKDIR ${SPARK_HOME}/work-dir +ARG DPK_WHEEL_FILE_NAME + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=spark:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark] + + +# Install project source + +## Copy the python version of the transform +COPY --chown=spark:root dpk_fdedup/ dpk_fdedup/ +COPY --chown=spark:root requirements.txt requirements.txt +RUN pip install -r requirements.txt + +RUN mkdir -p /opt/spark/work-dir/src/templates && \ + mkdir -p /opt/spark/work-dir/config +COPY --chown=spark:root spark-deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY --chown=spark:root spark-deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ + + +USER spark + +# Set environment +ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} +ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH} + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT + diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index bca6f7e85..da70ab879 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -1,79 +1,23 @@ REPOROOT=../../.. # Use make help, to see the available rules -include $(REPOROOT)/.make.defaults +include $(REPOROOT)/transforms/.make.cicd.targets -setup:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`) -clean:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse +################################################################################ -build:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -venv:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse -set-versions: - @# Help: Recursively $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -publish:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-save-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -.PHONY: workflow-venv -workflow-venv: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-venv; \ - fi - -.PHONY: workflow-test -workflow-test: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-test; \ - fi - -.PHONY: workflow-upload -workflow-upload: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-upload; \ - fi - -.PHONY: workflow-build -workflow-build: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-build; \ - fi +run-cli-sample: + $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ + RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ + --fdedup_id_column int_id_column" \ + .transforms.run-src-file diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index fed3c1370..afcf3db08 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -1,11 +1,393 @@ -# Fuzzy Deduplication Transform +# Fuzzy Dedup + +Please see the set of +[transform project conventions](../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Contributors +- Nelson Bore (kibnelson@gmail.com) +- Constantin Adam (cmadam@us.ibm.com) + +## Description The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. -Per the set of [transform project conventions](../../README.md#transform-project-conventions) the following runtimes are available: -* [python](python/README.md) - enables running the base transform in a pure python environment -* [ray](ray/README.md) - enables running the base python transform in a Ray runtime -* [spark](spark/README.md) - enables running the base python transform in a spark runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image in a kubernetes cluster using a generated `yaml` file. +Fuzzy dedup is a complex process made up of a pipeline that performs four main steps: + +1. **Signature Calculation**: creates a set of minhashes for each document, and uses them to create band signatures for +the document. +2. **Cluster Analysis**: groups documents into clusters based on matching band signatures.
Within each cluster, it +retains only the documents that have a Jaccard similarity above a specified threshold, and it identifies which documents +to keep as unique and which ones to mark as duplicates. +3. **Duplicate List Generation**: combines the similarity clusters identified in each band to create a single, unified +list of duplicate documents. +4. **Data Cleaning**: processes the documents by either filtering out duplicates or adding annotations to distinguish +duplicates from non-duplicates. + +Each one of these steps is described in more detail below. + +### Signature Calculation + +This transform computes `num_permutations` minhashes and `num_bands` signatures for each document in the dataset, by +following these processing steps: +1. **Shingle Generation**: create a set of character or word shingles, using a specified window length. Character +shingles are more effective at detecting similar documents, but require more computational resources compared to word +shingles. +2. **Minhash Calculation**: using the shingles as input, compute `num_permutations` minhashes for each document. +3. **Band Signature Calculation**: divide the minhashes into `num_bands`, where each band contains +`num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. + +The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard +similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils) folder generates a graph of this +probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact +the deduplication process. A small numeric sketch of this probability function is also included at the end of the **Cluster Analysis** section below. + +To help distribute the workload and speed up processing of the next steps, the hash space of each band is divided into +`num_segments` segments. The band signatures, the minhashes, the document IDs, and the document lengths are stored in an organized +output folder structure `bands/band=b/segment=s`, where `b` is the band number and `s` is the segment number. + +### Cluster Analysis + +This transform leverages segmented processing to analyze the data generated by the **Signature Calculation** step +efficiently and in parallel. Each worker processes a specific segment `s` of a band `b` by loading and analyzing all +Parquet files from the folder `bands/band=b/segment=s`. Each row in the Parquet files contains, for a document: +* `band_hash`, the document's band signature, and +* `data`, a structure with three fields: the unique `document_id`, the document's `minhashes`, and the `document_size`. + +The transform runs the following processing steps: +1. **Data Loading**: combine into a single dataframe all Parquet files in `bands/band=b/segment=s`. +2. **Clustering**: run a `group_by` operation on the `band_hash` column to group documents with the same band +signature into clusters. +3. **Similarity Analysis**: for each cluster, calculate Jaccard similarity between pairs of documents using their +minhashes, and move documents below the specified Jaccard similarity threshold into new clusters. +4. **Duplicate Identification**: in clusters with more than one document remaining, retain the largest document (breaking ties by the +smallest document ID), and mark as duplicates all other documents in the cluster. +5. **Persist Results**: save the duplicate clusters in a file.
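+
+To make the role of these parameters concrete: if two documents have a true Jaccard similarity `s`, and signatures are built from
+`b = num_bands` bands of `r = num_minhashes_per_band` minhashes each, the documents share at least one band signature with
+probability `1 - (1 - s^r)^b`. The sketch below is plain, illustrative Python; it is not taken from the transform's source, and the
+function names are invented for this example. It shows that formula next to the minhash-based Jaccard estimate used when comparing
+documents within a cluster:
+
+```python
+# Illustrative only -- not part of dpk_fdedup.
+
+def band_match_probability(s: float, num_bands: int, num_minhashes_per_band: int) -> float:
+    """Probability that two documents with Jaccard similarity `s` share at
+    least one band signature (the LSH S-curve)."""
+    return 1.0 - (1.0 - s ** num_minhashes_per_band) ** num_bands
+
+
+def estimate_jaccard(minhashes_a: list[int], minhashes_b: list[int]) -> float:
+    """Estimate Jaccard similarity as the fraction of matching minhash slots
+    (both lists are assumed to hold the same number of minhashes)."""
+    matches = sum(a == b for a, b in zip(minhashes_a, minhashes_b))
+    return matches / len(minhashes_a)
+
+
+# With 14 bands of 8 minhashes each (112 permutations, the layout used by the
+# bundled test data), similar documents are very likely to collide in some band,
+# while dissimilar ones rarely do:
+print(band_match_probability(0.8, 14, 8))  # ~0.92
+print(band_match_probability(0.5, 14, 8))  # ~0.05
+```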
+ +### Duplicate List Generation + +The **Cluster Analysis** step identifies duplicates across multiple bands, meaning a document can be marked as a +duplicate in one or more bands (e.g., if two documents are identical, one will be marked as a duplicate in all bands). +This transform consolidates all duplicate information from each band segment into a single file, providing a unified +record of duplicates detected across the dataset. + +### Data Cleaning + +This transform processes the original dataset using the list of duplicate documents generated by the **Duplicate List +Generation** step. It imports each file in the original dataset into a table and produces a new dataset. The directory +structure of the input dataset is preserved, but the contents of the output files depend on the selected operating mode: +1. **Annotate** - adds a new `duplicate` column to the dataset that contains a `d` for documents marked as duplicates +and is empty for non-duplicates. +2. **Filter duplicates** - removes all documents identified as duplicates from the dataset. +3. **Filter non-duplicates** - removes from the dataset all documents that were not marked as duplicates, leaving only +the duplicates. + +The output dataset reflects the selected mode, providing flexibility for downstream processing. + +## Input Columns Used by This Transform + +| Input Column Name | Data Type | Description | +|---------------------------------------------------------------------|-----------|----------------------------------| +| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | +| Column specified by the _document_id_column_ configuration argument | int64 | Column that stores document ID | + +## Output Columns Annotated by This Transform +| Output Column Name | Data Type | Description | +|------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| duplicate | str | Column added if fuzzy dedup runs in 'annotate' mode.
Value is 'd' for duplicate documents, empty for non-duplicates | + +## Configuration and Usage +### Fuzzy Deduplication Transform +The set of dictionary keys holding [Fuzzy Dedup](dpk_fdedup/transform_python.py) configuration for values are as +follows: +```text +--input_folder INPUT_FOLDER + Input folder path +--output_folder OUTPUT_FOLDER + Output folder path +--operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents +--contents_column CONTENTS_COLUMN + name of the column that stores document text +--document_id_column DOCUMENT_ID_COLUMN + name of the column that stores document ID +--seed SEED seed of the random number generator +--num_permutations NUM_PERMUTATIONS + number of permutations to use for minhash calculation +--num_bands NUM_BANDS + number of bands to use for band hash calculation +--num_minhashes_per_band NUM_MINHASHES_PER_BAND + number of minhashes to use in each band +--word_shingle_size WORD_SHINGLE_SIZE + number of words included in one shingle +--jaccard_similarity_threshold JACCARD_SIMILARITY_THRESHOLD + jaccard similarity threshold above which two documents are similar +--num_segments NUM_SEGMENTS + the number of segments dividing the hashing space for each band (for scalability) +--duplicate_list_location DUPLICATE_LIST_LOCATION + path to the file with all the duplicate document ids +--services SERVICES Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning) +--use_s3 USE_S3 use s3 +--s3_cred S3_CRED ast string of options for s3 credentials +--shingle_option SHINGLE_OPTION + Option used for shingling + +``` + +### Signature Calculation Transform +The set of dictionary keys holding [SignatureCalcTransform](dpk_fdedup/signature_calc/transform.py) configuration for values +are as follows: +```text +--minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document +--minhash_contents_column MINHASH_CONTENTS_COLUMN + name of the column storing the contents of each document +--minhash_seed MINHASH_SEED + the seed used to instantiate the random number generator +--minhash_num_permutations MINHASH_NUM_PERMUTATIONS + number of permutations (minhashes) calculated for each document +--minhash_word_shingle_size MINHASH_WORD_SHINGLE_SIZE + the size of the word shingles calculated for each document +--minhash_num_bands MINHASH_NUM_BANDS + the number of bands to use in the banding technique +--minhash_num_minhashes_per_band MINHASH_NUM_MINHASHES_PER_BAND + the number of minhashes to use in each band +--minhash_num_segments MINHASH_NUM_SEGMENTS + the number of segments across which we divide the hashing space for each band +--minhash_shingle_option MINHASH_SHINGLE_OPTION + Shingling option ('word' or 'char') +``` + +### Cluster Analysis Transform +The set of dictionary keys holding [ClusterAnalysisTransform](dpk_fdedup/cluster_analysis/transform.py) configuration for values +are as follows: +```text +--cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD + Jaccard similarity threshold above which two documents are duplicates +--cluster_num_bands CLUSTER_NUM_BANDS + The number of bands used in the banding technique +--cluster_num_segments CLUSTER_NUM_SEGMENTS + The number of segments dividing the hashing space for each band +``` + +### Get Duplicates List Transform +This transform currently has no configuration parameters.
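+
+For reference, the options above combine into a single command line for the orchestrator. The invocation below is only
+illustrative: the entry point follows the `dpk_fdedup` layout introduced by this refactoring, the ID column matches the bundled
+test data, and the numeric values mirror the 14-band / 2-segment layout of the expected test output rather than recommended
+defaults:
+
+```commandline
+python dpk_fdedup/transform_python.py \
+    --input_folder test-data/input \
+    --output_folder output \
+    --document_id_column int_id_column \
+    --operation_mode annotate \
+    --num_permutations 112 \
+    --num_bands 14 \
+    --num_minhashes_per_band 8 \
+    --num_segments 2
+```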
+ +### Data Cleaning Transform +The set of dictionary keys holding [DataCleaningTransform](dpk_fdedup/data_cleaning/transform.py) configuration for values +are as follows: +```text + --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document + --fdclean_operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents +``` + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +python dpk_fdedup/transform_python.py +``` +Alternatively, the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +python dpk_fdedup/signature_calc/local_python.py +python dpk_fdedup/cluster_analysis/local_python.py +python dpk_fdedup/get_duplicate_list/transform_local_python.py +python dpk_fdedup/data_cleaning/local_python.py +``` +After running the transforms, execute: +```shell +ls output +``` +to see the results of the transform. + +### Code example + +This is a [sample notebook](../fdedup_python.ipynb) that shows how to invoke the python fuzzy dedup transform. + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a pure python runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + + +# Fuzzy Dedup - Ray implementation + +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform +configuration, testing and IDE set up. + +## Summary + +This project wraps the python Fuzzy Dedup transform described above with a Ray runtime. + +## Configuration and Command Line Options + +Fuzzy Dedup configuration and command line options are the same as for the base python transform. + +## Running +### Launched Command Line Options +When running the transform with the Ray launcher (i.e. `RayTransformLauncher`), the set of +[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) is available in addition to the transform options described above. + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_ray.py +``` +Alternatively, the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_ray.py +python cluster_analysis_local_ray.py +python get_duplicate_list_local_ray.py +python data_cleaning_local_ray.py +``` +After running the transforms, execute: +```shell +ls output +``` +to see the results of the transform.
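+
+The local scripts above are thin wrappers around the standard data-prep-kit launcher pattern. The fragment below is a rough
+sketch of driving just the signature calculation step programmatically with the Ray launcher; the module and class names come
+from this refactoring, the folder names and parameter values are placeholders, and the exact `ParamsUtils`/launcher API should
+be checked against the library version in use:
+
+```python
+import sys
+
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_fdedup.signature_calc.ray.transform import (
+    SignatureCalculationRayTransformConfiguration,
+)
+
+# Illustrative parameters only; these values are not recommended defaults.
+local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
+params = {
+    "run_locally": True,
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    "minhash_num_permutations": 112,
+    "minhash_num_bands": 14,
+    "minhash_num_minhashes_per_band": 8,
+    "minhash_num_segments": 2,
+}
+
+if __name__ == "__main__":
+    # Build a CLI-style argument list and hand it to the Ray launcher.
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    launcher.launch()
+```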
+ +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Code Example + +This is a [sample notebook](../fdedup_ray.ipynb) that shows how to invoke the ray fuzzy dedup transform. + +## Testing + +For testing fuzzy deduplication in a ray runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + + + +# Fuzzy Dedup -- Spark + +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform +configuration, testing and IDE set up. + +## Summary + +This project wraps the python Fuzzy Dedup transform described above with a Spark runtime. + +## Configuration and Command Line Options + +Fuzzy Dedup configuration and command line options are the same as for the base python transform. + +## Running +### Launched Command Line Options +When running the transform with the Spark launcher (i.e. `SparkTransformLauncher`), the set of +[spark launcher options](../../../data-processing-lib/doc/spark-launcher-options.md) is available in addition to the transform options described above. + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_spark.py +``` +Alternatively, the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_spark.py +python cluster_analysis_local_spark.py +python get_duplicate_list_local_spark.py +python data_cleaning_local_spark.py +``` +After running the transforms, execute: +```shell +ls output +``` +to see the results of the transform. + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Code Example + +This is a [sample notebook](../fdedup_spark.ipynb) that shows how to invoke the spark fuzzy dedup transform. + +## Testing + +For testing fuzzy deduplication in a spark runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + + + +## Further Resources +The following is a list of references to research articles and GitHub repositories that inspired the module's design: -Please check [here](python/README.md) for a more detailed description of this transform. +1.
[Jure Leskovec, Anand Rajaraman, Jeff Ullman, Mining of Massive Datasets, Chapter 3: Finding Similar Items](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) +2. [G Penedo et al., The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/pdf/2406.17557) +3. [Datatrove github repo](https://github.com/huggingface/datatrove) diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/dpk_fdedup/Murmur_MH.py similarity index 100% rename from transforms/universal/fdedup/python/src/Murmur_MH.py rename to transforms/universal/fdedup/dpk_fdedup/Murmur_MH.py diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py similarity index 97% rename from transforms/universal/fdedup/python/src/cluster_analysis_local_python.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py index bb785021c..61302b74a 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py @@ -13,7 +13,7 @@ import os import sys -from cluster_analysis_transform_python import ( +from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) from data_processing.runtime.pure_python import PythonTransformLauncher diff --git a/transforms/universal/fdedup/ray/src/cluster_estimator.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/cluster_estimator.py similarity index 100% rename from transforms/universal/fdedup/ray/src/cluster_estimator.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/cluster_estimator.py diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py similarity index 95% rename from transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py index c54ba85c2..a4ec84741 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py @@ -13,7 +13,7 @@ import os import sys -from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py similarity index 97% rename from transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py index a0e8e7de2..10b850192 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py @@ -13,7 +13,7 @@ import os from typing import Any -from cluster_analysis_transform import ( +from dpk_fdedup.cluster_analysis.transform import ( ClusterAnalysisTransformConfiguration, num_bands_key, num_segments_key, diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py similarity index 95% rename from 
transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py index c9950657c..408220b6b 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py @@ -14,7 +14,7 @@ import sys import polars as pl -from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py similarity index 98% rename from transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py index feeb3241e..97ab7a48f 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py @@ -13,7 +13,7 @@ import os from typing import Any -from cluster_analysis_transform import ( +from dpk_fdedup.cluster_analysis.transform import ( ClusterAnalysisTransformConfiguration, num_bands_key, num_segments_key, diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py similarity index 99% rename from transforms/universal/fdedup/python/src/cluster_analysis_transform.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py index fa3ce6d28..b414adaa6 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py @@ -24,7 +24,7 @@ UnrecoverableException, get_logger, ) -from Murmur_MH import Murmur_MH +from dpk_fdedup.Murmur_MH import Murmur_MH short_name = "cluster" diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py similarity index 98% rename from transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py index c35c5a711..e882ea6cc 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py @@ -14,7 +14,7 @@ import time from typing import Any -from cluster_analysis_transform import ( +from dpk_fdedup.cluster_analysis.transform import ( ClusterAnalysisTransformConfiguration, num_bands_key, num_segments_key, diff --git a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py similarity index 93% rename from transforms/universal/fdedup/python/src/data_cleaning_local_python.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py index aa4aabb90..d0976ec76 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py @@ -13,11 +13,11 @@ import os import sys -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform 
import ( document_id_column_cli_param, duplicate_list_location_cli_param, ) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py similarity index 93% rename from transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py index b951e2fc8..f72fc0902 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py @@ -13,11 +13,11 @@ import os import sys -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, ) -from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py similarity index 99% rename from transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py index 88171e260..4a4bd52f0 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py @@ -14,7 +14,7 @@ from typing import Any import ray -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( DataCleaningTransform, DataCleaningTransformConfiguration, dataclean_data_access_key, diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py similarity index 94% rename from transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py index eb1e61845..12c5ab244 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py @@ -18,7 +18,7 @@ document_id_column_cli_param, duplicate_list_location_cli_param, ) -from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py similarity index 99% rename from transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py index 2ff0df8bf..16d184a5e 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++
b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py @@ -13,7 +13,7 @@ import os from typing import Any -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( DataCleaningTransformConfiguration, dataclean_data_access_key, dataclean_data_factory_key, diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform.py similarity index 100% rename from transforms/universal/fdedup/python/src/data_cleaning_transform.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform.py diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py similarity index 98% rename from transforms/universal/fdedup/python/src/data_cleaning_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py index edef8b9c5..e29ef1218 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py @@ -13,7 +13,7 @@ import os from typing import Any -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( DataCleaningTransformConfiguration, dataclean_data_access_key, dataclean_data_factory_key, diff --git a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/ray/transform.py similarity index 97% rename from transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/ray/transform.py index 40081e658..b53891f66 100644 --- a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/ray/transform.py @@ -20,7 +20,7 @@ RayTransformLauncher, RayTransformRuntimeConfiguration, ) -from get_duplicate_list_transform import ( +from dpk_fdedup.get_duplicate_list.transform import ( GetDuplicateListTransformConfiguration, subfolder_key, ) diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform.py similarity index 100% rename from transforms/universal/fdedup/python/src/get_duplicate_list_transform.py rename to transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform.py diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_local_python.py similarity index 96% rename from transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py rename to transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_local_python.py index 34b18ab04..2ccdec931 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_local_python.py @@ -15,7 +15,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from get_duplicate_list_transform_python import ( +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py 
b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_python.py similarity index 97% rename from transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_python.py index 703ef630e..fe6f0bda6 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_python.py @@ -21,7 +21,7 @@ PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger -from get_duplicate_list_transform import ( +from dpk_fdedup.get_duplicate_list.transform import ( GetDuplicateListTransformConfiguration, subfolder_key, ) diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py similarity index 85% rename from transforms/universal/fdedup/ray/src/fdedup_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/ray/transform.py index be1bf5fcb..a59877b6f 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py @@ -14,20 +14,20 @@ import os import sys -from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration -from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration +from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_python import ServiceOrchestrator, parse_args -from get_duplicate_list_transform_python import ( +from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) -from get_duplicate_list_transform_ray import ( +from dpk_fdedup.get_duplicate_list.ray.transform import ( GetDuplicateListRayRuntime, GetDuplicateListRayTransformConfiguration, ) -from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration +from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration s3_creds = { diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/local_python.py similarity index 97% rename from transforms/universal/fdedup/python/src/signature_calc_local_python.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/local_python.py index be395ed4d..c68f32b71 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/local_python.py @@ -16,7 +16,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from signature_calc_transform_python import ( +from dpk_fdedup.signature_calc.transform_python import ( SignatureCalculationPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py similarity index 95% rename from transforms/universal/fdedup/ray/src/signature_calc_local_ray.py rename to 
transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py index cb87b56af..2e5b7e2ab 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py @@ -15,7 +15,7 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration +from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration # create parameters diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py similarity index 94% rename from transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py index 678d953f2..9a3b9f42f 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py @@ -15,7 +15,7 @@ RayTransformRuntimeConfiguration, ) from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher -from signature_calc_transform import SignatureCalculationTransformConfiguration +from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/local.py similarity index 97% rename from transforms/universal/fdedup/spark/src/signature_calc_local_spark.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/local.py index 2db884346..cf817eea4 100644 --- a/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/local.py @@ -16,7 +16,7 @@ import polars as pl from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher -from signature_calc_transform_spark import ( +from dpk_fdedup.signature_calc.spark.transform import ( SignatureCalculationSparkTransformConfiguration, ) diff --git a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py similarity index 94% rename from transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py index 4e39810c6..9b2de7f28 100644 --- a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py @@ -15,7 +15,7 @@ SparkTransformLauncher, SparkTransformRuntimeConfiguration, ) -from signature_calc_transform import SignatureCalculationTransformConfiguration +from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py similarity index 99% rename from transforms/universal/fdedup/python/src/signature_calc_transform.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py index 4e64bcb5a..d01ee7b85 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ 
b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py @@ -23,7 +23,7 @@ from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, UnrecoverableException -from Murmur_MH import Murmur_MH +from dpk_fdedup.Murmur_MH import Murmur_MH short_name = "minhash" diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py similarity index 94% rename from transforms/universal/fdedup/python/src/signature_calc_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py index 40e0e97e3..c5a0db954 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py @@ -17,7 +17,7 @@ PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger -from signature_calc_transform import SignatureCalculationTransformConfiguration +from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py b/transforms/universal/fdedup/dpk_fdedup/spark/transform.py similarity index 100% rename from transforms/universal/fdedup/spark/src/fdedup_transform_spark.py rename to transforms/universal/fdedup/dpk_fdedup/spark/transform.py diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/dpk_fdedup/transform_python.py similarity index 98% rename from transforms/universal/fdedup/python/src/fdedup_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/transform_python.py index def3590e4..7f7b71b82 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/transform_python.py @@ -19,16 +19,16 @@ import data_cleaning_transform import get_duplicate_list_transform import signature_calc_transform -from cluster_analysis_transform_python import ( +from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils, get_logger, str2bool -from get_duplicate_list_transform_python import ( +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) -from signature_calc_transform_python import ( +from dpk_fdedup.signature_calc.transform_python import ( SignatureCalculationPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/images/fuzzy.png b/transforms/universal/fdedup/images/fuzzy.png similarity index 100% rename from transforms/universal/fdedup/ray/images/fuzzy.png rename to transforms/universal/fdedup/images/fuzzy.png diff --git a/transforms/universal/fdedup/python/.dockerignore b/transforms/universal/fdedup/python/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/fdedup/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/fdedup/python/Makefile b/transforms/universal/fdedup/python/Makefile deleted file mode 100644 index
05f6bf5ca..000000000 --- a/transforms/universal/fdedup/python/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: .transforms.run-cli-python-sample - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - -#run-s3-ray-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md deleted file mode 100644 index 4c531476f..000000000 --- a/transforms/universal/fdedup/python/README.md +++ /dev/null @@ -1,244 +0,0 @@ -# Fuzzy Dedup - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Contributors -- Nelson Bore (kibnelson@gmail.com) -- Constantin Adam (cmadam@us.ibm.com) - -## Description -The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a -set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. - -Fuzzy dedup is a complex process made up of a pipeline that performs four main steps: - -1. **Signature Calculation**: creates a set of minhashes for each document, and uses them to create band signatures for -the document. -2. **Cluster Analysis**: groups documents into clusters based on matching band signatures. Within each cluster, it -retains only the documents that have a Jaccard similarity above a specified threshold, and it identifies which documents -to keep as unique and which ones to mark as duplicates. -3. **Duplicate List Generation**: combines the similarity clusters identified in each band to create a single, unified -list of duplicate documents. -4. **Data Cleaning**: processes the documents by either filtering out duplicates or adding annotations to distinguish -duplicates from non-duplicates. 
- -Each one of these steps is described in more detail below. - -### Signature Calculation - -This transform computes `num_permutations` minhashes and `num_bands` signatures for each document in the dataset, by -following these processing steps: -1. **Shingle Generation**: create a set of character or word shingles, using a specified window length. Character -shingles are more effective at detecting similar documents, but require more computational resources compared to word -shingles. -2. **Minhash Calculation**: using the shingles as input, compute `num_permutations` minhashes for each document. -3. **Band Signature Calculation**: divide the minhashes into `num_bands`, where each band contains -`num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. - -The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard -similarity will be marked as duplicates. A Jupyter notebook in the [utils](../utils) folder generates a graph of this -probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact -the deduplication process. - -To help distribute the workload and speed up processing of the next steps, the hash space of each band is divided into -`num_segments` segments. The band signatures, the minhashes, the document ids, and lengths are stored in an organized -output folder structure `bands/band=b/segment=s`, where `b` is the band number and `s` is the segment number. - -### Cluster Analysis - -This transform leverages segmented processing to analyze the data generated by the **Signature Calculation** step -efficiently and in parallel. Each worker processes a specific segment `s` of a band `b` by loading and analyzing all -Parquet files from the folder `bands/band=b/segment=s`. Each row in the Parquet files contains, for a document: -* `band_hash`, the document's band signature, and -* `data`, a structure with three fields: the unique `document_id`, document's `minhashes`, and `document_size`. - -The transform runs the following processing steps: -1. **Data Loading**: combine into a single dataframe all Parquet files in `bands/band=b/segment=s`. -2. **Clustering**: run a `group_by` operation on the `band_hash` column that will group documents with the same band -signature into clusters. -3. **Similarity Analysis**: for each cluster, calculate Jaccard similarity between pairs of documents using their -minhashes, and move documents below the specified Jaccard similarity threshold into new clusters. -4. **Duplicate Identification**: in clusters with more than one document remaining, retain the largest document with the -smallest document id, and mark as duplicates all other documents in the cluster. -5. **Persist Results**: save the duplicate clusters in a file. - -### Duplicate List Generation - -The **Cluster Analysis** step identifies duplicates across multiple bands, meaning a document can be marked as a -duplicate in one or more bands (e.g., if two documents are identical, one will be marked as a duplicate in all bands). -This transform consolidates all duplicate information from each band segment into a single file, providing a unified -record of duplicates detected across the dataset. - -### Data Cleaning - -This transform processes the original dataset using the list of duplicate documents generated by the **Duplicate List -Generation** step. It imports each file in the original dataset into a table and produces a new dataset. 
The directory -structure of the input dataset is preserved, but the contents of the output files depend on the selected operating mode: -1. **Annotate** - add a new `duplicate` column to the dataset, that contains a `d` for documents marked as duplicates, -and is empty for non-duplicates -2. **Filter duplicates** - removes all documents identified as duplicates from the dataset. -3. **Filter non-duplicates** - removes from the dataset all documents that were not marked as duplicates, leaving only -the duplicates. - -The output dataset reflects the selected mode, providing flexibility for downstream processing. - -## Input Columns Used by This Transform - -| Input Column Name | Data Type | Description | -|---------------------------------------------------------------------|-----------|----------------------------------| -| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | -| Column specified by the _document_id_column_ configuration argument | int64 | Column that stores document ID | - -## Output Columns Annotated by This Transform -| Output Column Name | Data Type | Description | -|------------|-----------|---------------------------------------------------------------------------------------------------------------------| -| duplicate | str | Column added if fuzzy dedup runs in 'annotate' mode. Value is 'd' for duplicate documents, empty for non-duplicates | - -## Configuration and Usage -### Fuzzy Deduplication Transform -The set of dictionary keys holding [Fuzzy Dedup](src/fdedup_transform_python.py) configuration for values are as -follows: -```text ---input_folder INPUT_FOLDER - Input folder path ---output_folder OUTPUT_FOLDER - Output folder path ---operation_mode {filter_duplicates,filter_non_duplicates,annotate} - operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents ---contents_column CONTENTS_COLUMN - name of the column that stores document text ---document_id_column DOCUMENT_ID_COLUMN - name of the column that stores document ID ---seed SEED seed of the random number generator ---num_permutations NUM_PERMUTATIONS - number of permutations to use for minhash calculation ---num_bands NUM_BANDS - number of bands to use for band hash calculation ---num_minhashes_per_band NUM_MINHASHES_PER_BAND - number of minhashes to use in each band ---word_shingle_size WORD_SHINGLE_SIZE - number of words included in one shingle ---jaccard_similarity_threshold JACCARD_SIMILARITY_THRESHOLD - jaccard similarity threshold above which two documents are similar ---num_segments NUM_SEGMENTS - the number of segments dividing the hashing space for each band (for scalability) ---duplicate_list_location DUPLICATE_LIST_LOCATION - path to the file with all the duplicate document ids ---services SERVICES Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning) ---use_s3 USE_S3 use s3 ---s3_cred S3_CRED ast string of options for s3 credentials ---shingle_option SHINGLE_OPTION - Option used for shingling - -``` - -### Signature Calculation Transform -The set of dictionary keys holding [SignatureCalcTransform](src/signature_calc_transform.py) configuration for values -are as follows: -```text ---minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN - name of the column storing the unique ID assigned to each document ---minhash_contents_column MINHASH_CONTENTS_COLUMN - name of the column storing the contents of each document ---minhash_seed 
MINHASH_SEED - the seed used to instantiate the random number generator ---minhash_num_permutations MINHASH_NUM_PERMUTATIONS - number of permutations (minhashes) calculated for each document ---minhash_word_shingle_size MINHASH_WORD_SHINGLE_SIZE - the size of the word shingles calculated for each document ---minhash_num_bands MINHASH_NUM_BANDS - the number of bands to use in the banding technique ---minhash_num_minhashes_per_band MINHASH_NUM_MINHASHES_PER_BAND - the number of minhashes to use in each band ---minhash_num_segments MINHASH_NUM_SEGMENTS - the number of segments across which we divide the hashing space for each band ---minhash_shingle_option MINHASH_SHINGLE_OPTION - Shingling option ('word' or 'char') -``` - -### Cluster Analysis Transform -The set of dictionary keys holding [ClusterAnalysisTransform](src/cluster_analysis_transform.py) configuration for values -are as follows: -```text ---cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD - Jaccard similarity threshold above which two documents are duplicates ---cluster_num_bands CLUSTER_NUM_BANDS - The number of bands used in the banding technique ---cluster_num_segments CLUSTER_NUM_SEGMENTS - The number of segments dividing the hashing space for each band -``` - -### Get Duplicates List Transform -This transform currently has no configuration parameters. - -### Data Cleaning Transform -The set of dictionary keys holding [DataCleaningTransform](src/data_cleaning_transform.py) configuration for values -are as follows: -```text - --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN - name of the column storing the unique ID assigned to each document - --fdclean_operation_mode {filter_duplicates,filter_non_duplicates,annotate} - operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents -``` - -### Running the samples -To run the samples, use the following `make` target to create a virtual environment: - -```commandline -make venv -``` -Subsequently, the main orchestration program can run with: -```commandline -source venv/bin/activate -cd src -python fdedup_transform_python.py -``` -Alternatively the transforms included in fuzzy dedup can be launched independently: -```commandline -source venv/bin/activate -cd src -python signature_calc_local_python.py -python cluster_analysis_local_python.py -python get_duplicate_list_local_python.py -python data_cleaning_local_python.py -``` -After running the transforms, execute: -```shell -ls output -``` -To see results of the transform. - -### Code example - -This is a [sample notebook](../fdedup_python.ipynb) that shows how to invoke the python fuzzy dedup transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. - -## Testing - -For testing fuzzy deduplication in a pure python runtime, use the following `make` targets. 
To launch integration tests -for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data -cleaning) use: -```commandline -make test-src -``` - -To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that -image, use: -```commandline -make test-image -``` - -## Further Resources -The following is a list of references to research articles and github repositories that inspired the module's design: - -1. [Jure Leskovec, Anand Rajaraman, Jeff Ullman, Mining of Massive Datasets, Chapter 3: Finding Similar Items](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) -2. [G Penedo et al., The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/pdf/2406.17557) -3. [Datatrove github repo](https://github.com/huggingface/datatrove) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml deleted file mode 100644 index 81f39ebb0..000000000 --- a/transforms/universal/fdedup/python/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_fdedup_transform_python" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "Fuzzy Dedup Transform for Python" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, - { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/fdedup/ray/.dockerignore b/transforms/universal/fdedup/ray/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/fdedup/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/fdedup/ray/.gitignore b/transforms/universal/fdedup/ray/.gitignore deleted file mode 100644 index 3ea7fd4ab..000000000 --- a/transforms/universal/fdedup/ray/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile deleted file mode 100644 index 9a447e2db..000000000 --- a/transforms/universal/fdedup/ray/Dockerfile +++ /dev/null 
@@ -1,51 +0,0 @@ -ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 -FROM ${BASE_IMAGE} - -RUN pip install --upgrade --no-cache-dir pip - -# install pytest -RUN pip install --no-cache-dir pytest -ARG DPK_WHEEL_FILE_NAME - -# Copy and install data processing libraries -# These are expected to be placed in the docker context before this is run (see the make image). -COPY --chown=ray:users data-processing-dist data-processing-dist -RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] - -## Copy the python version of the tansform -COPY --chown=ray:users python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -# Install ray project source -COPY --chown=ray:users src/ src/ -COPY --chown=ray:users pyproject.toml pyproject.toml -COPY --chown=ray:users README.md README.md -COPY --chown=ray:users requirements.txt requirements.txt -RUN pip install --no-cache-dir -e . - -# copy source files needed by test-image -COPY --chown=ray:users ./src/fdedup_transform_ray.py fdedup_transform_ray.py -COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py -COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py -COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py -COPY --chown=ray:users ./src/data_cleaning_transform_ray.py data_cleaning_transform_ray.py -COPY --chown=ray:users ./src/signature_calc_local_ray.py local/fdedup_local_ray.py - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ - -# Grant non-root users the necessary permissions to the ray directory -RUN chmod 755 /home/ray - -USER root -RUN chmod a+rwx /home/ray -USER ray -# Set environment -ENV PYTHONPATH /home/ray - -# Put these at the end since they seem to upset the docker cache. -ARG BUILD_DATE -ARG GIT_COMMIT -LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile deleted file mode 100644 index ec193b6c3..000000000 --- a/transforms/universal/fdedup/ray/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} - -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -# TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --fdedup_id_column int_id_column" \ - .transforms.run-src-file - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/ray/README.md b/transforms/universal/fdedup/ray/README.md deleted file mode 100644 index 298ac39ba..000000000 --- a/transforms/universal/fdedup/ray/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# Fuzzy Dedup - -Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform -configuration, testing and IDE set up. - -## Summary - -This project wraps the [Fuzzy Dedup transform](../python) with a Ray runtime. - -## Configuration and command line Options - -Fuzzy Dedup configuration and command line options are the same as for the base python transform. - -## Running -### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -In addition to those available to the transform as defined in [here](../python/README.md), -the set of -[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. - -### Running the samples -To run the samples, use the following `make` target to create a virtual environment: - -```commandline -make venv -``` -Subsequently, the main orchestration program can run with: -```commandline -source venv/bin/activate -cd src -python fdedup_transform_ray.py -``` -Alternatively the transforms included in fuzzy dedup can be launched independently: -```commandline -source venv/bin/activate -cd src -python signature_calc_local_ray.py -python cluster_analysis_local_ray.py -python get_duplicate_list_local_ray.py -python data_cleaning_local_ray.py -``` -After running the transforms, execute: -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. - -## Code Example - -This is a [sample notebook](../fdedup_ray.ipynb) that shows how to invoke the ray fuzzy dedup transform. 
- -## Testing - -For testing fuzzy deduplication in a ray runtime, use the following `make` targets. To launch integration tests -for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data -cleaning) use: -```commandline -make test-src -``` - -To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that -image, use: -```commandline -make test-image -``` \ No newline at end of file diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml deleted file mode 100644 index 19da8a690..000000000 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_fdedup_transform_ray" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "fdedup Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, - { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt deleted file mode 100644 index 782ef76e2..000000000 --- a/transforms/universal/fdedup/ray/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -data-prep-toolkit[ray]>=0.2.3 -dpk_fdedup_transform_python==0.2.4.dev0 -mmh3>=4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/requirements.txt similarity index 85% rename from transforms/universal/fdedup/python/requirements.txt rename to transforms/universal/fdedup/requirements.txt index 2c6bb5f36..b28fac859 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -1,4 +1,3 @@ -data-prep-toolkit>=0.2.3 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile deleted file mode 100644 index b04994d46..000000000 --- a/transforms/universal/fdedup/spark/Dockerfile +++ /dev/null @@ -1,51 +0,0 @@ -ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 -FROM ${BASE_IMAGE} - -# install pytest -RUN pip install --no-cache-dir pytest -ARG DPK_WHEEL_FILE_NAME - -WORKDIR ${SPARK_HOME}/work-dir - -# Copy in the data processing framework source/project and install it -# This is expected to be placed in the docker context before this is run (see the make image). 
-COPY --chown=spark:root data-processing-dist data-processing-dist -RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark] - -## Copy the python version of the tansform -COPY --chown=spark:root python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -# Install spark project source -COPY --chown=spark:root src/ src/ -COPY --chown=spark:root pyproject.toml pyproject.toml -COPY --chown=spark:root README.md README.md -RUN mkdir -p /opt/spark/work-dir/src/templates && \ - mkdir -p /opt/spark/work-dir/config -COPY --chown=spark:root deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ -COPY --chown=spark:root deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ - -# install requirements from requirements.txt -COPY requirements.txt . -RUN pip3 install -r requirements.txt - -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY ./src/fdedup_transform_spark.py . - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ - -USER spark - -# Set environment -ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} -ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH} - -# Put these at the end since they seem to upset the docker cache. -ARG BUILD_DATE -ARG GIT_COMMIT -LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile deleted file mode 100644 index ac2735e7d..000000000 --- a/transforms/universal/fdedup/spark/Makefile +++ /dev/null @@ -1,57 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.spark-venv - -test:: .transforms.spark-test - -clean:: .transforms.clean - -image:: .transforms.spark-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-spark - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.spark-test-image - -run-cli-sample: .transforms.run-cli-spark-sample - -run-local-sample: .transforms.run-local-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md index 1b02ddd00..f1cf31ff0 100644 --- a/transforms/universal/fdedup/spark/README.md +++ b/transforms/universal/fdedup/spark/README.md @@ -1,4 +1,4 @@ -# Fuzzy Dedup +# Fuzzy Dedup -- Spark Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform configuration, testing and IDE set up. diff --git a/transforms/universal/fdedup/spark/src/requirements.txt b/transforms/universal/fdedup/spark/requirements-spark.txt similarity index 100% rename from transforms/universal/fdedup/spark/src/requirements.txt rename to transforms/universal/fdedup/spark/requirements-spark.txt diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt deleted file mode 100644 index e12366dd6..000000000 --- a/transforms/universal/fdedup/spark/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -dpk_fdedup_transform_python==0.2.4.dev0 -data-prep-toolkit[spark]>=0.2.3 -pyyaml>=6.0.2 -boto3>=1.34.69 -kubernetes>=30.1.0 -polars==1.9.0 -disjoint-set>=0.8.0 -numpy<1.29.0 -sentencepiece>=0.2.0 -mmh3>=4.1.0 -scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/test-data/expected/cleaned/data_1/df1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet rename to transforms/universal/fdedup/test-data/expected/cleaned/data_1/df1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/cleaned/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/cleaned/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json b/transforms/universal/fdedup/test-data/expected/cleaned/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json rename to transforms/universal/fdedup/test-data/expected/cleaned/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet similarity index 100% rename 
from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet similarity index 100% rename from 
transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet rename to 
transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet diff --git 
a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet 
b/transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet rename to transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json rename to transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet rename to transforms/universal/fdedup/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet rename to transforms/universal/fdedup/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/test-data/expected/get_list_transform/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json rename to transforms/universal/fdedup/test-data/expected/get_list_transform/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/test-data/expected/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/metadata.json rename to transforms/universal/fdedup/test-data/expected/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet 
b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet similarity 
index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet similarity index 100% rename from 
transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/test-data/expected/signature_calc/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json rename to transforms/universal/fdedup/test-data/expected/signature_calc/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet b/transforms/universal/fdedup/test-data/input/data_1/df1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet rename to transforms/universal/fdedup/test-data/input/data_1/df1.parquet diff --git a/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet b/transforms/universal/fdedup/test-data/input/data_2/df2.parquet similarity index 100% rename from 
transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/input/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py similarity index 93% rename from transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py rename to transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py index cecd224fe..c14329703 100644 --- a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py @@ -12,8 +12,8 @@ import os -from cluster_analysis_transform import sort_output_cli_param -from cluster_analysis_transform_python import ( +from dpk_fdedup.cluster_analysis.transform import sort_output_cli_param +from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) from data_processing.runtime.pure_python import PythonTransformLauncher diff --git a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py similarity index 91% rename from transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py rename to transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py index a3771fbd8..5cfddfc65 100644 --- a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py +++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py @@ -12,13 +12,13 @@ import os -from cluster_analysis_transform import ( +from dpk_fdedup.cluster_analysis.transform import ( jaccard_similarity_threshold_cli_param, num_bands_cli_param, num_segments_cli_param, sort_output_cli_param, ) -from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -32,7 +32,7 @@ class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data")) config = { "run_locally": True, num_bands_cli_param: 14, diff --git a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py similarity index 89% rename from transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py rename to transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py index 294c86f25..990b0cf7b 100644 --- a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py @@ -12,8 +12,8 @@ import os -from cluster_analysis_transform import sort_output_cli_param -from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from dpk_fdedup.cluster_analysis.transform import sort_output_cli_param +from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration from data_processing.test_support.launch.transform_test import ( 
AbstractTransformLauncherTest, ) @@ -27,7 +27,7 @@ class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data")) config = { "cluster_num_bands": 14, "cluster_num_segments": 2, diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_python.py similarity index 93% rename from transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py rename to transforms/universal/fdedup/test/test_data_cleaning_transform_python.py index 8c4debed9..faa5e8924 100644 --- a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_python.py @@ -12,11 +12,11 @@ import os -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, ) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, diff --git a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py similarity index 93% rename from transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py rename to transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py index a62105b2c..960127e51 100644 --- a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py @@ -12,12 +12,12 @@ import os -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, operation_mode_cli_param, ) -from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -31,7 +31,7 @@ class TestRayDataCleaningTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data")) duplicate_location = os.path.abspath( os.path.join( os.path.dirname(__file__), diff --git a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py similarity index 92% rename from transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py rename to transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py index 919857e23..9639980b3 100644 --- a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py @@ -12,12 +12,12 @@ import os -from data_cleaning_transform import ( +from 
dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, operation_mode_cli_param, ) -from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -31,7 +31,7 @@ class TestSparkDataCleaningTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data")) duplicate_location = os.path.abspath( os.path.join( os.path.dirname(__file__), diff --git a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_python.py similarity index 92% rename from transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py rename to transforms/universal/fdedup/test/test_get_duplicate_list_transform_python.py index 4b59e3a7a..e5ab9e6a0 100644 --- a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py +++ b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_python.py @@ -16,8 +16,8 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_python import ( +from dpk_fdedup.get_duplicate_list.transform import sort_output_cli_param +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py similarity index 89% rename from transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py rename to transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py index 55869598c..017e560b0 100644 --- a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py +++ b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py @@ -16,8 +16,8 @@ AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_ray import GetDuplicateListRayTransformConfiguration +from dpk_fdedup.get_duplicate_list.transform import sort_output_cli_param +from dpk_fdedup.get_duplicate_list.ray.transform import GetDuplicateListRayTransformConfiguration class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): @@ -27,7 +27,7 @@ class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data")) config = { "run_locally": True, sort_output_cli_param: True, diff --git a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_spark.py similarity index 91% rename from 
transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py rename to transforms/universal/fdedup/test/test_get_duplicate_list_transform_spark.py index 4b59e3a7a..b64ebb116 100644 --- a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py +++ b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_spark.py @@ -16,8 +16,8 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_python import ( +from dpk_fdedup.get_duplicate_list.transform import sort_output_cli_param +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) @@ -29,7 +29,7 @@ class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data")) config = { sort_output_cli_param: True, } diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/test/test_signature_calc_transform_python.py similarity index 96% rename from transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py rename to transforms/universal/fdedup/test/test_signature_calc_transform_python.py index 9ad8a32d7..2b6c49e31 100644 --- a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py +++ b/transforms/universal/fdedup/test/test_signature_calc_transform_python.py @@ -17,7 +17,7 @@ AbstractTransformLauncherTest, ) from data_processing.utils import ParamsUtils -from signature_calc_transform_python import ( +from dpk_fdedup.signature_calc.transform_python import ( SignatureCalculationPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/test/test_signature_calc_transform_ray.py similarity index 90% rename from transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py rename to transforms/universal/fdedup/test/test_signature_calc_transform_ray.py index 34f3ee403..8c08eb938 100644 --- a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py +++ b/transforms/universal/fdedup/test/test_signature_calc_transform_ray.py @@ -17,12 +17,12 @@ ) from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from signature_calc_transform import ( +from dpk_fdedup.signature_calc.transform import ( num_bands_cli_param, num_permutations_cli_param, num_segments_cli_param, ) -from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration +from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): @@ -32,7 +32,7 @@ class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data")) config = { "run_locally": True, num_permutations_cli_param: 112, diff --git a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py 
b/transforms/universal/fdedup/test/test_signature_calc_transform_spark.py similarity index 95% rename from transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py rename to transforms/universal/fdedup/test/test_signature_calc_transform_spark.py index 6d93dc7a9..af8f36aa9 100644 --- a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py +++ b/transforms/universal/fdedup/test/test_signature_calc_transform_spark.py @@ -17,7 +17,7 @@ ) from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher -from signature_calc_transform_spark import ( +from dpk_fdedup.signature_calc.spark.transform import ( SignatureCalculationSparkTransformConfiguration, ) @@ -29,7 +29,7 @@ class TestSparkSignatureCalcTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data")) config = { "minhash_num_permutations": 112, "minhash_num_bands": 14, From 470152f7018bcfaaaf7374603f40cc73e0838de9 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 16:26:26 -0500 Subject: [PATCH 2/6] update from Shahrokh Signed-off-by: Maroun Touma --- transforms/universal/fdedup/README.md | 86 +++++++++++++-------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index afcf3db08..fb36adbc2 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -1,7 +1,7 @@ # Fuzzy Dedup Please see the set of -[transform project conventions](../../../README.md) +[transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. @@ -39,7 +39,7 @@ shingles. `num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard -similarity will be marked as duplicates. A Jupyter notebook in the [utils](../utils) folder generates a graph of this +similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils/calc_r_and_b.ipynb) folder generates a graph of this probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact the deduplication process. 
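For readers who want a quick feel for that probability function without opening the notebook, the sketch below (illustrative only, not part of this patch) evaluates the standard LSH banding formula: with `b = num_bands` bands of `r = num_minhashes_per_band` minhashes each, a pair of documents with Jaccard similarity `s` gets an identical band signature at least once with probability `1 - (1 - s^r)^b`.

```python
# Illustrative sketch of the banding probability discussed above (not part of the patch).
def detection_probability(s: float, num_bands: int = 14, num_minhashes_per_band: int = 8) -> float:
    """Probability that a pair with Jaccard similarity `s` shares a signature in at least one band."""
    return 1.0 - (1.0 - s**num_minhashes_per_band) ** num_bands


if __name__ == "__main__":
    # With the defaults above (14 bands of 8 minhashes), the curve rises steeply
    # around the 0.75 Jaccard similarity threshold used by this transform.
    for s in (0.5, 0.7, 0.75, 0.8, 0.9):
        print(f"similarity={s:.2f} -> P(flagged as near-duplicate)={detection_probability(s):.3f}")
```

Raising `num_minhashes_per_band` shifts the curve right (fewer false positives), while raising `num_bands` shifts it left (fewer false negatives), which is the trade-off the notebook referenced above lets users explore.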
@@ -99,7 +99,7 @@ The output dataset reflects the selected mode, providing flexibility for downstr ## Configuration and Usage ### Fuzzy Deduplication Transform -The set of dictionary keys holding [Fuzzy Dedup](src/fdedup_transform_python.py) configuration for values are as +The set of dictionary keys holding [Fuzzy Dedup](dpk_fdedup/transform_python.py) configuration for values are as follows: ```text --input_folder INPUT_FOLDER @@ -136,7 +136,7 @@ follows: ``` ### Signature Calculation Transform -The set of dictionary keys holding [SignatureCalcTransform](src/signature_calc_transform.py) configuration for values +The set of dictionary keys holding [SignatureCalcTransform](dpk_fdedup/signature_calc/transform.py) configuration for values are as follows: ```text --minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN @@ -160,7 +160,7 @@ are as follows: ``` ### Cluster Analysis Transform -The set of dictionary keys holding [ClusterAnalysisTransform](src/cluster_analysis_transform.py) configuration for values +The set of dictionary keys holding [ClusterAnalysisTransform](dpk_fdedup/cluster_analysis/transform.py) configuration for values are as follows: ```text --cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD @@ -175,7 +175,7 @@ are as follows: This transform currently has no configuration parameters. ### Data Cleaning Transform -The set of dictionary keys holding [DataCleaningTransform](src/data_cleaning_transform.py) configuration for values +The set of dictionary keys holding [DataCleaningTransform](dpk_fdedup/data_cleaning/transform.py) configuration for values are as follows: ```text --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN @@ -193,17 +193,17 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd src -python fdedup_transform_python.py +cd dpk_fdedup +python transform_python.py ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd src -python signature_calc_local_python.py -python cluster_analysis_local_python.py -python get_duplicate_list_local_python.py -python data_cleaning_local_python.py +cd dpk_fdedup +python signature_calc/local_python.py +python cluster_analysis/local_python.py +python get_duplicate_list/transform_local_python.py +python data_cleaning/local_python.py ``` After running the transforms, execute: ```shell @@ -213,12 +213,12 @@ To see results of the transform. ### Code example -This is a [sample notebook](../fdedup_python.ipynb) that shows how to invoke the python fuzzy dedup transform. +This is a [sample notebook](fdedup_python.ipynb) that shows how to invoke the python fuzzy dedup transform. ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. ## Testing @@ -239,12 +239,12 @@ make test-image # Fuzzy Dedup - Ray implementation -Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. 
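Before the Ray-specific notes below, here is a small, hypothetical sketch of driving the pure-Python orchestrator described above from code instead of the shell. It uses only the two names this patch exposes from `dpk_fdedup.transform_python` (`parse_args` and `ServiceOrchestrator`); the folder paths are placeholders, the flag list is an illustrative subset of the configuration keys listed earlier, and the `global_params=` keyword and `orchestrate()` call are assumptions that should be checked against `dpk_fdedup/transform_python.py`.

```python
# Hypothetical driver; mirrors the CLI invocation rather than defining a new API.
import sys

from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args

# Placeholder folders and an illustrative subset of the configuration keys above.
sys.argv = [
    "fdedup_transform_python",
    "--input_folder", "test-data/input",
    "--output_folder", "output",
    "--contents_column", "contents",
    "--document_id_column", "int_id_column",
    "--operation_mode", "filter_duplicates",
]

args = parse_args()  # assumed to read sys.argv like the command-line entry point
# Assumed constructor keyword and method name; verify against dpk_fdedup/transform_python.py.
ServiceOrchestrator(global_params=args).orchestrate()
```

This appears to be the same pattern the sample notebook follows before inspecting the `output/cleaned` folder.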
## Summary -This project wraps the [Fuzzy Dedup transform](../python) with a Ray runtime. +This project wraps the Fuzzy Dedup transform with a Ray runtime. ## Configuration and command line Options @@ -252,10 +252,10 @@ Fuzzy Dedup configuration and command line options are the same as for the base ## Running ### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -In addition to those available to the transform as defined in [here](../python/README.md), +When running the transform with the Ray launcher (i.e., TransformLauncher), +in addition to those available to the transform as defined in here, the set of -[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. +[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) are available. ### Running the samples To run the samples, use the following `make` target to create a virtual environment: @@ -266,17 +266,17 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd src -python fdedup_transform_ray.py +cd dpk_fdedup +python ray/transform.py ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd src -python signature_calc_local_ray.py -python cluster_analysis_local_ray.py -python get_duplicate_list_local_ray.py -python data_cleaning_local_ray.py +cd dpk_fdedup +python signature_calc/ray/local.py +python cluster_analysis/ray/local.py +python get_duplicate_list/ray/tarnsform.py +python data_cleaning/ray/local.py ``` After running the transforms, execute: ```shell @@ -287,12 +287,12 @@ To see results of the transform. ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. ## Code Example -This is a [sample notebook](../fdedup_ray.ipynb) that shows how to invoke the ray fuzzy dedup transform. +This is a [sample notebook](fdedup_ray.ipynb) that shows how to invoke the ray fuzzy dedup transform. ## Testing @@ -313,12 +313,12 @@ make test-image # Fuzzy Dedup -- Spark -Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. ## Summary -This project wraps the [Fuzzy Dedup transform](../python) with a Spark runtime. +This project wraps the Fuzzy Dedup transform with a Spark runtime. ## Configuration and command line Options @@ -326,10 +326,10 @@ Fuzzy Dedup configuration and command line options are the same as for the base ## Running ### Launched Command Line Options -When running the transform with the Spark launcher (i.e. TransformLauncher), -In addition to those available to the transform as defined in [here](../python/README.md), +When running the transform with the Spark launcher (i.e., TransformLauncher), +in addition to those available to the transform as defined in here, the set of -[spark launcher](../../../../data-processing-lib/doc/spark-launcher-options.md) are available. 
+[spark launcher options](../../../data-processing-lib/doc/spark-launcher-options.md) are available. ### Running the samples To run the samples, use the following `make` target to create a virtual environment: @@ -340,17 +340,17 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd src -python fdedup_transform_spark.py +cd dpk_fdedup +python spark/transform.py ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd src -python signature_calc_local_spark.py -python cluster_analysis_local_spark.py -python get_duplicate_list_local_spark.py -python data_cleaning_local_spark.py +cd dpk_fdedup +python signature_calc/spark/local.py +python cluster_analysis/spark/local.py +python get_duplicate_list/spark/transform.py +python data_cleanin/spark/local.py ``` After running the transforms, execute: ```shell @@ -361,12 +361,12 @@ To see results of the transform. ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. ## Code Example -This is a [sample notebook](../fdedup_spark.ipynb) that shows how to invoke the spark fuzzy dedup transform. +This is a [sample notebook](fdedup_spark.ipynb) that shows how to invoke the spark fuzzy dedup transform. ## Testing From b95e99e6d9e63de444fc50da666d64b36ce56d7a Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 18:03:00 -0500 Subject: [PATCH 3/6] more fixes Signed-off-by: Maroun Touma --- transforms/universal/fdedup/Makefile | 8 +- transforms/universal/fdedup/README.md | 36 +- .../fdedup/dpk_fdedup/spark/transform.py | 10 +- .../fdedup/dpk_fdedup/transform_python.py | 24 +- .../universal/fdedup/fdedup_python.ipynb | 464 ++++++++++++++---- transforms/universal/fdedup/fdedup_ray.ipynb | 4 +- .../universal/fdedup/fdedup_spark.ipynb | 4 +- transforms/universal/fdedup/kfp_ray/Makefile | 47 +- .../universal/fdedup/kfp_ray/fdedup_wf.py | 8 +- 9 files changed, 436 insertions(+), 169 deletions(-) diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index da70ab879..477d282e1 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -14,10 +14,4 @@ TRANSFORM_NAME=$(shell basename `pwd`) ################################################################################ - - -un-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --fdedup_id_column int_id_column" \ - .transforms.run-src-file + \ No newline at end of file diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index fb36adbc2..93d032e07 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -193,17 +193,15 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd dpk_fdedup -python transform_python.py +python -m dpk_fdedup.transform_python ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd dpk_fdedup -python signature_calc/local_python.py 
-python cluster_analysis/local_python.py -python get_duplicate_list/transform_local_python.py -python data_cleaning/local_python.py +python -m dpk_fdedup.signature_calc.local_python +python -m dpk_fdedup.cluster_analysis.local_python +python -m dpk_fdedup.get_duplicate_list.transform_local_python +python -m dpk_fdedup.data_cleaning.local_python ``` After running the transforms, execute: ```shell @@ -266,17 +264,15 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd dpk_fdedup -python ray/transform.py +python -m dpk_fdedup.ray.transform ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd dpk_fdedup -python signature_calc/ray/local.py -python cluster_analysis/ray/local.py -python get_duplicate_list/ray/tarnsform.py -python data_cleaning/ray/local.py +python -m dpk_fdedup.signature_calc.ray.local +python -m dpk_fdedup.cluster_analysis.ray.local +python -m dpk_fdedup.get_duplicate_list.ray.tarnsform +python -m dpk_fdedup.data_cleaning.ray.local ``` After running the transforms, execute: ```shell @@ -340,17 +336,15 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd dpk_fdedup -python spark/transform.py +python -m dpk_fdedup.spark.transform ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd dpk_fdedup -python signature_calc/spark/local.py -python cluster_analysis/spark/local.py -python get_duplicate_list/spark/transform.py -python data_cleanin/spark/local.py +python -m dpk_fdedup.signature_calc.spark.local +python -m dpk_fdedup.cluster_analysis.spark.local +python -m dpk_fdedup.get_duplicate_list.transform +python -m dpk_fdedup.data_cleaning.spark.local ``` After running the transforms, execute: ```shell diff --git a/transforms/universal/fdedup/dpk_fdedup/spark/transform.py b/transforms/universal/fdedup/dpk_fdedup/spark/transform.py index 82767f849..77eff4d74 100644 --- a/transforms/universal/fdedup/dpk_fdedup/spark/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/spark/transform.py @@ -14,15 +14,15 @@ import os import sys -from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration -from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration +from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing_spark.runtime.spark import SparkTransformLauncher -from fdedup_transform_python import ServiceOrchestrator, parse_args -from get_duplicate_list_transform_python import ( +from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) -from signature_calc_transform_spark import ( +from dpk_fdedup.signature_calc.spark.transform import ( SignatureCalculationSparkTransformConfiguration, ) diff --git a/transforms/universal/fdedup/dpk_fdedup/transform_python.py b/transforms/universal/fdedup/dpk_fdedup/transform_python.py index 7f7b71b82..dbbcf39e6 100644 --- a/transforms/universal/fdedup/dpk_fdedup/transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/transform_python.py @@ -15,20 +15,20 @@ import os import sys 
-import cluster_analysis_transform -import data_cleaning_transform -import get_duplicate_list_transform -import signature_calc_transform -from cluster_analysis.transform_python import ( +import dpk_fdedup.cluster_analysis.transform +import dpk_fdedup.data_cleaning.transform +import dpk_fdedup.get_duplicate_list.transform +import dpk_fdedup.signature_calc.transform +from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) -from data_cleaning.transform_python import DataCleaningPythonTransformConfiguration +from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils, get_logger, str2bool -from get_duplicate_list.transform_python import ( +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) -from signature_calc.transform_python import ( +from dpk_fdedup.signature_calc.transform_python import ( SignatureCalculationPythonTransformConfiguration, ) @@ -47,10 +47,10 @@ } ARGS_MAP = { - "minhash": signature_calc_transform.captured_arg_keys, - "cluster": cluster_analysis_transform.captured_arg_keys, - "fdlist": get_duplicate_list_transform.captured_arg_keys, - "fdclean": data_cleaning_transform.captured_arg_keys, + "minhash": dpk_fdedup.signature_calc.transform.captured_arg_keys, + "cluster": dpk_fdedup.cluster_analysis.transform.captured_arg_keys, + "fdlist": dpk_fdedup.get_duplicate_list.transform.captured_arg_keys, + "fdclean": dpk_fdedup.data_cleaning.transform.captured_arg_keys, } diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index 684583ffd..3ca0ec9e5 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ "import sys\n", "\n", "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args, ServiceOrchestrator" + "from dpk_fdedup.transform_python import parse_args, ServiceOrchestrator" ] }, { @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -102,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, "outputs": [ @@ -110,91 +110,377 @@ "name": "stderr", "output_type": "stream", "text": [ - "13:30:29 INFO - Starting SignatureCalculation step\n", - "13:30:29 INFO - Got parameters for SignatureCalculation\n", - "13:30:29 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "13:30:29 INFO - data factory scdata_ is using local configuration without input/output path\n", - "13:30:29 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "13:30:29 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:29 INFO - pipeline id 
pipeline_id\n", - "13:30:29 INFO - code location None\n", - "13:30:29 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", - "13:30:29 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:29 INFO - orchestrator minhash started at 2024-11-26 13:30:29\n", - "13:30:29 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", - "13:30:33 INFO - Completed 1 files (50.0%) in 0.074 min\n", - "13:30:33 INFO - Completed 2 files (100.0%) in 0.074 min\n", - "13:30:33 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:30:33 INFO - Starting flush()\n", - "13:30:34 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", - "13:30:34 INFO - done flushing in 0.063 sec\n", - "13:30:34 INFO - Completed execution in 0.075 min, execution result 0\n", - "13:30:34 INFO - SignatureCalculation completed successfully\n", - "13:30:34 INFO - Starting ClusterAnalysis step\n", - "13:30:34 INFO - Got parameters for ClusterAnalysis\n", - "13:30:34 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "13:30:34 INFO - pipeline id pipeline_id\n", - "13:30:34 INFO - code location None\n", - "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/docs_to_remove\n", - "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:34 INFO - orchestrator cluster started at 2024-11-26 13:30:34\n", - "13:30:34 INFO - Number of folders is 14\n", - "13:30:34 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "13:30:34 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "13:30:34 INFO - Completed 3 files (21.43%) in 0.001 min\n", - "13:30:34 INFO - Completed 4 files (28.57%) in 0.001 min\n", - "13:30:34 INFO - Completed 5 files (35.71%) in 0.001 min\n", - "13:30:34 INFO - Completed 6 files (42.86%) in 0.001 min\n", - "13:30:34 INFO - Completed 7 files (50.0%) in 0.001 min\n", - "13:30:34 INFO - Completed 8 files (57.14%) in 0.002 min\n", - "13:30:34 INFO - Completed 9 files (64.29%) in 0.002 min\n", - "13:30:34 INFO - Completed 10 files (71.43%) in 0.002 min\n", - "13:30:34 INFO - Completed 11 files (78.57%) in 0.002 min\n", - "13:30:34 INFO - Completed 12 files (85.71%) in 0.002 min\n", - "13:30:34 INFO - Completed 13 files (92.86%) in 0.002 min\n", - "13:30:34 INFO - Completed 14 files (100.0%) in 0.003 min\n", - "13:30:34 INFO - Done processing 14 files, waiting for flush() completion.\n", - "13:30:34 INFO - done flushing in 0.0 sec\n", - "13:30:34 INFO - Completed execution in 0.003 min, execution result 0\n", - "13:30:34 INFO - ClusterAnalysis completed successfully\n", - "13:30:34 INFO - Starting GetDuplicateList step\n", - "13:30:34 INFO - Got parameters for GetDuplicateList\n", - 
"13:30:34 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "13:30:34 INFO - pipeline id pipeline_id\n", - "13:30:34 INFO - code location None\n", - "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", - "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:34 INFO - orchestrator fdlist started at 2024-11-26 13:30:34\n", - "13:30:34 INFO - Number of folders is 1\n", - "13:30:34 INFO - Get Duplicate List for folder docs_to_remove\n", - "13:30:34 INFO - 8 documents marked as duplicates\n", - "13:30:34 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "13:30:34 INFO - Done processing 1 files, waiting for flush() completion.\n", - "13:30:34 INFO - done flushing in 0.0 sec\n", - "13:30:34 INFO - Completed execution in 0.001 min, execution result 0\n", - "13:30:34 INFO - GetDuplicateList completed successfully\n", - "13:30:34 INFO - Starting DataCleaning step\n", - "13:30:34 INFO - Got parameters for DataCleaning\n", - "13:30:34 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "13:30:34 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "13:30:34 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "13:30:34 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:34 INFO - pipeline id pipeline_id\n", - "13:30:34 INFO - code location None\n", - "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/cleaned\n", - "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:34 INFO - orchestrator fdclean started at 2024-11-26 13:30:34\n", - "13:30:34 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", - "13:30:34 INFO - Completed 1 files (50.0%) in 0.0 min\n", - "13:30:34 INFO - Completed 2 files (100.0%) in 0.0 min\n", - "13:30:34 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:30:34 INFO - done flushing in 0.0 sec\n", - "13:30:34 INFO - Completed execution in 0.0 min, execution result 0\n", - "13:30:34 INFO - DataCleaning completed successfully\n" + "17:55:52 INFO - Starting SignatureCalculation step\n", + "17:55:52 INFO - Got parameters for SignatureCalculation\n", + "17:55:52 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 
'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "17:55:52 INFO - data factory scdata_ is using local configuration without input/output path\n", + "17:55:52 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "17:55:52 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "17:55:52 INFO - pipeline id pipeline_id\n", + "17:55:52 INFO - code location None\n", + "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output\n", + "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "17:55:52 INFO - orchestrator minhash started at 2024-12-18 17:55:52\n", + "17:55:52 ERROR - No input files to process - exiting\n", + "17:55:52 INFO - Completed execution in 0.0 min, execution result 0\n", + "17:55:52 INFO - SignatureCalculation completed successfully\n", + "17:55:52 INFO - Starting ClusterAnalysis step\n", + "17:55:52 INFO - Got parameters for ClusterAnalysis\n", + "17:55:52 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "17:55:52 INFO - pipeline id pipeline_id\n", + "17:55:52 INFO - code location None\n", + "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/docs_to_remove\n", + "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "17:55:52 INFO - orchestrator cluster started at 2024-12-18 17:55:52\n", + "17:55:52 INFO - Number of folders is 14\n", + "17:55:52 WARNING - Exception processing file band=0/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=1/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=2/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 3 files (21.43%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=3/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=4/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=5/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=6/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=7/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 8 files (57.14%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=8/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 9 files (64.29%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=9/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 10 files (71.43%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=10/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 11 files (78.57%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=11/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 12 files (85.71%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=12/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 13 files (92.86%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=13/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 14 files (100.0%) in 0.0 min\n", + "17:55:52 INFO - Done processing 14 files, waiting for flush() completion.\n", + "17:55:52 INFO - done flushing in 0.0 sec\n", + "Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py\", line 131, in orchestrate\n", + " stats[\"processing_time\"] = round(stats[\"processing_time\"], 3)\n", + " ~~~~~^^^^^^^^^^^^^^^^^^^\n", + "KeyError: 'processing_time'\n", + "17:55:52 ERROR - Exception during execution 'processing_time': None\n", + "17:55:52 INFO - Completed execution in 0.0 min, execution result 1\n", + "17:55:52 ERROR - ClusterAnalysis failed with status 1, aborting ...\n" ] } ], diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index bb69579a9..8bfa98a3a 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -55,8 +55,8 @@ "import sys\n", "\n", "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args\n", - "from fdedup_transform_ray import RayServiceOrchestrator" + "from dpk_fdedup.transform_python import parse_args\n", + "from dpk_fdedup.ray.transform import RayServiceOrchestrator" ] }, { diff --git 
a/transforms/universal/fdedup/fdedup_spark.ipynb b/transforms/universal/fdedup/fdedup_spark.ipynb index 9f4bf1772..616543640 100644 --- a/transforms/universal/fdedup/fdedup_spark.ipynb +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -47,8 +47,8 @@ "import sys\n", "\n", "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args\n", - "from fdedup_transform_spark import SparkServiceOrchestrator" + "from dpk_fdedup.transform_python import parse_args\n", + "from dpk_fdedup.spark.transform import SparkServiceOrchestrator" ] }, { diff --git a/transforms/universal/fdedup/kfp_ray/Makefile b/transforms/universal/fdedup/kfp_ray/Makefile index 55f7851f6..5c1ae0778 100644 --- a/transforms/universal/fdedup/kfp_ray/Makefile +++ b/transforms/universal/fdedup/kfp_ray/Makefile @@ -2,10 +2,20 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows + # Include the common configuration for this transform -include ../transform.config +#include ../transform.config + +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -SRC_DIR=${CURDIR}/../ray/ + +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -17,38 +27,21 @@ clean: @# Help: Clean up the virtual environment. rm -rf ${REPOROOT}/transforms/venv -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -publish:: - -image:: - -test-image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: - .PHONY: workflow-build workflow-build: workflow-venv $(MAKE) $(YAML_WF) .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=fdedup_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload -workflow-upload: workflow-build +workflow-upload: @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done + done \ No newline at end of file diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index ffc6f79bc..6b1265cf8 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -28,10 +28,10 @@ image_pull_secret = os.getenv("FDEDUP_IMAGE_PULL_SECRET", "my_secret") # the name of the job script -SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "signature_calc_transform_ray.py" -CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "cluster_analysis_transform_ray.py" -GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "get_duplicate_list_transform_ray.py" -DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" +SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "-m dpk_fdedup.signature_calc.ray.transform" +CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "-m dpk_fdedup.cluster_analysis.ray.transform" +GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "-m 
dpk_fdedup.get_duplicate_list.ray.transform" +DATA_CLEANING_EXEC_SCRIPT_NAME: str = "-m dpk_fdedup.data_cleaning.ray.transform" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" From 52fb1a19c2c643e20eb9a2612fe5e5c192bafdd9 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 18:54:55 -0500 Subject: [PATCH 4/6] fix and test notebook Signed-off-by: Maroun Touma --- .../universal/fdedup/fdedup_python.ipynb | 679 +----------------- transforms/universal/fdedup/fdedup_ray.ipynb | 409 +---------- .../universal/fdedup/fdedup_spark.ipynb | 12 +- 3 files changed, 47 insertions(+), 1053 deletions(-) diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index 3ca0ec9e5..a64c48a54 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -71,14 +71,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], "source": [ "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"output\")\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -102,388 +102,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "17:55:52 INFO - Starting SignatureCalculation step\n", - "17:55:52 INFO - Got parameters for SignatureCalculation\n", - "17:55:52 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "17:55:52 INFO - data factory scdata_ is using local configuration without input/output path\n", - "17:55:52 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - pipeline id pipeline_id\n", - "17:55:52 INFO - code location None\n", - "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output\n", - "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - orchestrator minhash started at 
2024-12-18 17:55:52\n", - "17:55:52 ERROR - No input files to process - exiting\n", - "17:55:52 INFO - Completed execution in 0.0 min, execution result 0\n", - "17:55:52 INFO - SignatureCalculation completed successfully\n", - "17:55:52 INFO - Starting ClusterAnalysis step\n", - "17:55:52 INFO - Got parameters for ClusterAnalysis\n", - "17:55:52 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "17:55:52 INFO - pipeline id pipeline_id\n", - "17:55:52 INFO - code location None\n", - "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/docs_to_remove\n", - "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - orchestrator cluster started at 2024-12-18 17:55:52\n", - "17:55:52 INFO - Number of folders is 14\n", - "17:55:52 WARNING - Exception processing file band=0/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=1/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=2/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=3/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=4/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=5/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=6/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 7 files (50.0%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=7/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 8 files (57.14%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=8/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 9 files (64.29%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=9/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 10 files (71.43%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=10/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 11 files (78.57%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=11/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 12 files (85.71%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=12/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 13 files (92.86%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=13/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 14 files (100.0%) in 0.0 min\n", - "17:55:52 INFO - Done processing 14 files, waiting for flush() completion.\n", - "17:55:52 INFO - done flushing in 0.0 sec\n", - "Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py\", line 131, in orchestrate\n", - " stats[\"processing_time\"] = round(stats[\"processing_time\"], 3)\n", - " ~~~~~^^^^^^^^^^^^^^^^^^^\n", - "KeyError: 'processing_time'\n", - "17:55:52 ERROR - Exception during execution 'processing_time': None\n", - "17:55:52 INFO - Completed execution in 0.0 min, execution result 1\n", - "17:55:52 ERROR - ClusterAnalysis failed with status 1, aborting ...\n" - ] - } - ], + "outputs": [], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -504,26 +126,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['python/output/cleaned/metadata.json',\n", - " 'python/output/cleaned/data_1',\n", - " 'python/output/cleaned/data_2']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import glob\n", - "glob.glob(\"python/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -536,171 +145,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (12, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. 
│\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 3 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. 
Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 5 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ │\n", - "│ 6 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. 
│\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", - "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", - "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", - "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", - "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", - "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", - "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", - "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", - "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", - "│ ┆ another capy capably chewing on cantaloupe │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", - "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 17 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. 
│\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", - "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", + "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", + "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", "input_df = input_df_1.vstack(input_df_2)\n", "\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", @@ -717,101 +169,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (4, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. 
We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. 
│\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", - "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", + "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", + "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", "output_df = output_df_1.vstack(output_df_2)\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" @@ -824,6 +189,14 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "787c644e-2640-4c05-bdc2-8a261305a89f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index 8bfa98a3a..7c6740f88 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,18 +37,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-11-26 13:30:56,482\tINFO util.py:154 -- Missing packages: ['ipywidgets']. 
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" - ] - } - ], + "outputs": [], "source": [ "import ast\n", "import os\n", @@ -81,14 +73,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], "source": [ "# create parameters\n", "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -114,126 +106,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:30:57 INFO - Starting SignatureCalculation step\n", - "13:30:57 INFO - Got parameters for SignatureCalculation\n", - "13:30:57 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "13:30:57 INFO - data factory scdata_ is using local configuration without input/output path\n", - "13:30:57 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "13:30:57 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:57 INFO - pipeline id pipeline_id\n", - "13:30:57 INFO - code location None\n", - "13:30:57 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:30:57 INFO - actor creation delay 0\n", - "13:30:57 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:30:57 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", - "13:30:57 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:57 INFO - Running locally\n", - "2024-11-26 13:31:08,860\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - orchestrator started at 2024-11-26 13:31:12\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.162438202649355, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed processing 1 files in 0.002 min\n", - "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Starting flush()\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - done flushing in 0.045 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", - "13:31:24 INFO - Completed execution in 0.446 min, execution result 0\n", - "13:31:26 INFO - SignatureCalculation completed successfully\n", - "13:31:26 INFO - Starting ClusterAnalysis step\n", - "13:31:26 INFO - Got parameters for ClusterAnalysis\n", - "13:31:26 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "13:31:26 INFO - pipeline id pipeline_id\n", - "13:31:26 INFO - code location None\n", - "13:31:26 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:26 INFO - actor creation delay 0\n", - "13:31:26 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:26 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/docs_to_remove\n", - "13:31:26 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:26 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:26 INFO - Running locally\n", - "2024-11-26 13:31:28,318\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - orchestrator started at 2024-11-26 13:31:31\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of folders is 14\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.77626838721335, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 2 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 3 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 4 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 5 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 6 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 7 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 8 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 9 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 10 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files (78.571%) in 0.001 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed processing 14 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", - "13:31:43 INFO - Completed execution in 0.292 min, execution result 0\n", - "13:31:45 INFO - ClusterAnalysis completed successfully\n", - "13:31:45 INFO - Starting GetDuplicateList step\n", - "13:31:45 INFO - Got parameters for GetDuplicateList\n", - "13:31:45 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "13:31:45 INFO - pipeline id pipeline_id\n", - "13:31:45 INFO - code location None\n", - "13:31:45 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:45 INFO - actor creation delay 0\n", - "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:45 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", - "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:45 INFO - Running locally\n", - "2024-11-26 13:31:47,311\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - orchestrator started at 2024-11-26 13:31:50\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of folders is 1\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.749520111829042, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed processing 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - Get Duplicate List for folder docs_to_remove\n", - "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - 8 documents marked as duplicates\n", - "13:32:02 INFO - Completed execution in 0.295 min, execution result 0\n", - "13:32:04 INFO - GetDuplicateList completed successfully\n", - "13:32:04 INFO - Starting DataCleaning step\n", - "13:32:04 INFO - Got parameters for DataCleaning\n", - "13:32:04 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "13:32:04 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "13:32:04 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "13:32:04 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:04 INFO - pipeline id pipeline_id\n", - "13:32:04 INFO - code location None\n", - "13:32:04 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:04 INFO - actor creation delay 0\n", - "13:32:04 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:04 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/cleaned\n", - "13:32:04 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:04 INFO - Running locally\n", - "2024-11-26 13:32:07,526\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - orchestrator started at 2024-11-26 13:32:10\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.738976669497788, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed processing 1 files in 0.002 min\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - done flushing in 0.003 sec\n", - "13:32:23 INFO - Completed execution in 0.313 min, execution result 0\n", - "13:32:24 INFO - DataCleaning completed successfully\n" - ] - } - ], + "outputs": [], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -254,24 +130,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ray/output/cleaned/metadata.json', 'ray/output/cleaned/df1.parquet']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import glob\n", - "glob.glob(\"ray/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -284,167 +149,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (12, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. 
Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 3 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. 
│\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 5 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ │\n", - "│ 6 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", - "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", - "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", - "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. 
A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", - "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", - "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", - "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", - "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", - "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", - "│ ┆ another capy capably chewing on cantaloupe │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", - "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 17 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", @@ -462,100 +170,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (4, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 
12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. 
│\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. 
│\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"df1.parquet\"))\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" ] diff --git a/transforms/universal/fdedup/fdedup_spark.ipynb b/transforms/universal/fdedup/fdedup_spark.ipynb index 616543640..d605d726b 100644 --- a/transforms/universal/fdedup/fdedup_spark.ipynb +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -79,7 +79,7 @@ "source": [ "# create parameters\n", "input_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"output\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -133,7 +133,7 @@ "outputs": [], "source": [ "import glob\n", - "glob.glob(\"spark/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -174,7 +174,7 @@ "outputs": [], "source": [ "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"df1.parquet\"))\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" ] @@ -190,9 +190,9 @@ ], "metadata": { "kernelspec": { - "display_name": "fdedup_spark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "fdedup_spark" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -204,7 +204,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, From ad548bfca50c9bd98c05b6282ee950d52caaa116 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 19:08:05 -0500 Subject: [PATCH 5/6] fix spark dockerfile Signed-off-by: Maroun Touma --- transforms/universal/fdedup/Dockerfile.spark | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/Dockerfile.spark b/transforms/universal/fdedup/Dockerfile.spark index d228b6c2d..26bcf1da0 100644 --- a/transforms/universal/fdedup/Dockerfile.spark +++ b/transforms/universal/fdedup/Dockerfile.spark @@ -22,8 +22,8 @@ RUN pip install -r requirements.txt RUN mkdir -p /opt/spark/work-dir/src/templates && \ mkdir -p /opt/spark/work-dir/config -COPY --chown=spark:root spark-deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ -COPY --chown=spark:root spark-deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ +COPY --chown=spark:root spark/deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY --chown=spark:root spark/deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ USER spark From bc88085edf325628513e09034fb993ff417cc6c2 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 19 Dec 2024 17:11:34 -0500 Subject: [PATCH 6/6] Updated notebooks for python and ray Signed-off-by: Maroun Touma --- .../fdedup/dpk_fdedup/ray/transform.py | 16 +++++ .../fdedup/dpk_fdedup/transform_python.py | 18 +++++ 
.../universal/fdedup/fdedup_python.ipynb | 61 ++++------------ transforms/universal/fdedup/fdedup_ray.ipynb | 70 +++++-------------- 4 files changed, 63 insertions(+), 102 deletions(-) diff --git a/transforms/universal/fdedup/dpk_fdedup/ray/transform.py b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py index a59877b6f..76046ba4b 100644 --- a/transforms/universal/fdedup/dpk_fdedup/ray/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py @@ -67,6 +67,22 @@ def execute_service(self, service_short_name: str, params: list) -> int: return status +# Class used by the notebooks to run the full fuzzy dedup pipeline on the Ray runtime +class Fdedup: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + args = parse_args() + # Initialize the orchestrator + orchestrator = RayServiceOrchestrator(global_params=args) + # Launch ray fuzzy dedup execution + return orchestrator.orchestrate() + + if __name__ == "__main__": # Parse command line arguments args = parse_args() diff --git a/transforms/universal/fdedup/dpk_fdedup/transform_python.py b/transforms/universal/fdedup/dpk_fdedup/transform_python.py index dbbcf39e6..196affd93 100644 --- a/transforms/universal/fdedup/dpk_fdedup/transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/transform_python.py @@ -261,6 +261,24 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() + + +# Class used by the notebooks to run the full fuzzy dedup pipeline with the pure python runtime +class Fdedup: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + args = parse_args() + # Initialize the orchestrator + orchestrator = ServiceOrchestrator(global_params=args) + # Launch python fuzzy dedup execution + return orchestrator.orchestrate() + + if __name__ == "__main__": # Parse command line arguments diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index a64c48a54..b02f463eb 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -23,8 +23,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" + "#!pip install data-prep-toolkit-transforms" ] }, { @@ -38,16 +37,11 @@ { "cell_type": "code", "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "id": "bae63d15-4ce5-4f2a-a917-0f3161e9dd73", "metadata": {}, "outputs": [], "source": [ - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from dpk_fdedup.transform_python import parse_args, ServiceOrchestrator" + "from dpk_fdedup.transform_python import Fdedup" ] }, { @@ -72,48 +66,18 @@ { "cell_type": "code", "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", - "params = {\n", - " # transform configuration parameters\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": 
output_folder,\n", - " \"contents_column\": \"contents\",\n", - " \"document_id_column\": \"int_id_column\",\n", - " \"num_permutations\": 112,\n", - " \"num_bands\": 14,\n", - " \"num_minhashes_per_band\": 8,\n", - " \"operation_mode\": \"filter_duplicates\",\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", - "metadata": {}, - "source": [ - "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", + "id": "a54a78e9-d78b-4aeb-ac2b-806070a2dec0", "metadata": {}, "outputs": [], "source": [ - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "args = parse_args()\n", - "# Initialize the orchestrator\n", - "orchestrator = ServiceOrchestrator(global_params=args)\n", - "# Launch python fuzzy dedup execution\n", - "orchestrator.orchestrate()" + "Fdedup(input_folder='test-data/input',\n", + " output_folder='output',\n", + " contents_column= \"contents\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " operation_mode=\"filter_duplicates\").transform()\n" ] }, { @@ -151,6 +115,7 @@ "outputs": [], "source": [ "import polars as pl\n", + "import os\n", "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", "input_df = input_df_1.vstack(input_df_2)\n", diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index 7c6740f88..39bc1ba78 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -23,8 +23,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" + "#!pip install data-prep-toolkit-transforms" ] }, { @@ -38,17 +37,11 @@ { "cell_type": "code", "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "id": "bae63d15-4ce5-4f2a-a917-0f3161e9dd73", "metadata": {}, "outputs": [], "source": [ - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from dpk_fdedup.transform_python import parse_args\n", - "from dpk_fdedup.ray.transform import RayServiceOrchestrator" + "from dpk_fdedup.ray.transform import Fdedup" ] }, { @@ -67,57 +60,25 @@ "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", "| num_minhashes_per_band | 8 | number of minhashes to use in each band |\n", - "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |\n", - "| run_locally:bool | true | if true, launch a ray cluster locally, otherwise connect to an already existing cluster | \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", 
\"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", - "params = {\n", - " # transform configuration parameters\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - " \"contents_column\": \"contents\",\n", - " \"document_id_column\": \"int_id_column\",\n", - " \"num_permutations\": 112,\n", - " \"num_bands\": 14,\n", - " \"num_minhashes_per_band\": 8,\n", - " \"operation_mode\": \"filter_duplicates\",\n", - " # ray configuration parameters\n", - " \"run_locally\": True,\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", - "metadata": {}, - "source": [ - "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" ] }, { "cell_type": "code", "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", + "id": "a54a78e9-d78b-4aeb-ac2b-806070a2dec0", "metadata": {}, "outputs": [], "source": [ - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "args = parse_args()\n", - "# Initialize the orchestrator\n", - "orchestrator = RayServiceOrchestrator(global_params=args)\n", - "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()" + "Fdedup(input_folder='ray/test-data/input',\n", + " output_folder='output',\n", + " contents_column= \"contents\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " operation_mode= \"filter_duplicates\",\n", + " run_locally= True).transform()\n" ] }, { @@ -155,6 +116,7 @@ "outputs": [], "source": [ "import polars as pl\n", + "import os\n", "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(input_df)" @@ -192,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c11d3a4b-8ef9-417d-a8a2-f688db067a52", + "id": "787c644e-2640-4c05-bdc2-8a261305a89f", "metadata": {}, "outputs": [], "source": []