From 2bd246d31ef22c4d5e6cdb998e400d8a18773e90 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 13:43:46 -0500 Subject: [PATCH 1/6] first cut at refactoring fdedup as its own named dpk_ module Signed-off-by: Maroun Touma --- .../{python/Dockerfile => Dockerfile.python} | 23 +- transforms/universal/fdedup/Dockerfile.ray | 31 ++ transforms/universal/fdedup/Dockerfile.spark | 40 ++ transforms/universal/fdedup/Makefile | 86 +--- transforms/universal/fdedup/README.md | 396 +++++++++++++++++- .../{python/src => dpk_fdedup}/Murmur_MH.py | 0 .../cluster_analysis/local_python.py} | 2 +- .../ray}/cluster_estimator.py | 0 .../cluster_analysis/ray/local.py} | 2 +- .../cluster_analysis/ray/transform.py} | 2 +- .../cluster_analysis/spark/local.py} | 2 +- .../cluster_analysis/spark/transform.py} | 2 +- .../cluster_analysis/transform.py} | 2 +- .../cluster_analysis/transform_python.py} | 2 +- .../data_cleaning/local_python.py} | 4 +- .../data_cleaning/ray/local.py} | 4 +- .../data_cleaning/ray/transform.py} | 2 +- .../data_cleaning/spark/local.py} | 2 +- .../data_cleaning/spark/transform.py} | 2 +- .../data_cleaning/transform.py} | 0 .../data_cleaning/transform_python.py} | 2 +- .../get_duplicate_list/ray/transform.py} | 2 +- .../get_duplicate_list/transform.py} | 0 .../transform_local_python.py} | 2 +- .../get_duplicate_list/transform_python.py} | 2 +- .../ray/transform.py} | 12 +- .../signature_calc/local_python.py} | 2 +- .../signature_calc/ray/local.py} | 2 +- .../signature_calc/ray/transform.py} | 2 +- .../signature_calc/spark/local.py} | 2 +- .../signature_calc/spark/transform.py} | 2 +- .../signature_calc/transform.py} | 2 +- .../signature_calc/transform_python.py} | 2 +- .../spark/transform.py} | 0 .../transform_python.py} | 8 +- .../fdedup/{ray => }/images/fuzzy.png | Bin .../universal/fdedup/python/.dockerignore | 1 - transforms/universal/fdedup/python/Makefile | 64 --- transforms/universal/fdedup/python/README.md | 244 ----------- .../universal/fdedup/python/pyproject.toml | 45 -- transforms/universal/fdedup/ray/.dockerignore | 1 - transforms/universal/fdedup/ray/.gitignore | 38 -- transforms/universal/fdedup/ray/Dockerfile | 51 --- transforms/universal/fdedup/ray/Makefile | 68 --- transforms/universal/fdedup/ray/README.md | 71 ---- .../universal/fdedup/ray/pyproject.toml | 45 -- .../universal/fdedup/ray/requirements.txt | 6 - .../fdedup/{python => }/requirements.txt | 1 - transforms/universal/fdedup/spark/Dockerfile | 51 --- transforms/universal/fdedup/spark/Makefile | 57 --- transforms/universal/fdedup/spark/README.md | 2 +- ...equirements.txt => requirements-spark.txt} | 0 .../universal/fdedup/spark/requirements.txt | 11 - .../expected/cleaned/data_1/df1.parquet | Bin .../expected/cleaned/data_2/df2.parquet | Bin .../test-data/expected/cleaned/metadata.json | 0 .../docs_to_remove/band_0_segment_0.parquet | Bin .../docs_to_remove/band_0_segment_1.parquet | Bin .../docs_to_remove/band_10_segment_0.parquet | Bin .../docs_to_remove/band_10_segment_1.parquet | Bin .../docs_to_remove/band_11_segment_0.parquet | Bin .../docs_to_remove/band_11_segment_1.parquet | Bin .../docs_to_remove/band_12_segment_0.parquet | Bin .../docs_to_remove/band_12_segment_1.parquet | Bin .../docs_to_remove/band_13_segment_0.parquet | Bin .../docs_to_remove/band_13_segment_1.parquet | Bin .../docs_to_remove/band_1_segment_0.parquet | Bin .../docs_to_remove/band_1_segment_1.parquet | Bin .../docs_to_remove/band_2_segment_0.parquet | Bin .../docs_to_remove/band_2_segment_1.parquet | Bin 
.../docs_to_remove/band_3_segment_0.parquet | Bin .../docs_to_remove/band_3_segment_1.parquet | Bin .../docs_to_remove/band_4_segment_0.parquet | Bin .../docs_to_remove/band_4_segment_1.parquet | Bin .../docs_to_remove/band_5_segment_0.parquet | Bin .../docs_to_remove/band_5_segment_1.parquet | Bin .../docs_to_remove/band_6_segment_0.parquet | Bin .../docs_to_remove/band_6_segment_1.parquet | Bin .../docs_to_remove/band_7_segment_0.parquet | Bin .../docs_to_remove/band_7_segment_1.parquet | Bin .../docs_to_remove/band_8_segment_0.parquet | Bin .../docs_to_remove/band_8_segment_1.parquet | Bin .../docs_to_remove/band_9_segment_0.parquet | Bin .../docs_to_remove/band_9_segment_1.parquet | Bin .../docs_to_remove/metadata.json | 0 .../data_cleaning/cleaned/data_1/df1.parquet | Bin .../data_cleaning/cleaned/data_2/df2.parquet | Bin .../data_cleaning/cleaned/metadata.json | 0 .../docs_to_remove_consolidated.parquet | Bin .../docs_to_remove_consolidated.parquet | Bin .../expected/get_list_transform/metadata.json | 0 .../test-data/expected/metadata.json | 0 .../bands/band=0/segment=0/data_2/df2.parquet | Bin .../bands/band=0/segment=1/data_2/df2.parquet | Bin .../bands/band=1/segment=0/data_2/df2.parquet | Bin .../bands/band=1/segment=1/data_2/df2.parquet | Bin .../band=10/segment=0/data_2/df2.parquet | Bin .../band=10/segment=1/data_2/df2.parquet | Bin .../band=11/segment=0/data_2/df2.parquet | Bin .../band=11/segment=1/data_2/df2.parquet | Bin .../band=12/segment=0/data_2/df2.parquet | Bin .../band=12/segment=1/data_2/df2.parquet | Bin .../band=13/segment=0/data_2/df2.parquet | Bin .../band=13/segment=1/data_2/df2.parquet | Bin .../bands/band=2/segment=0/data_2/df2.parquet | Bin .../bands/band=2/segment=1/data_2/df2.parquet | Bin .../bands/band=3/segment=0/data_2/df2.parquet | Bin .../bands/band=3/segment=1/data_2/df2.parquet | Bin .../bands/band=4/segment=0/data_2/df2.parquet | Bin .../bands/band=4/segment=1/data_2/df2.parquet | Bin .../bands/band=5/segment=0/data_2/df2.parquet | Bin .../bands/band=5/segment=1/data_2/df2.parquet | Bin .../bands/band=6/segment=0/data_2/df2.parquet | Bin .../bands/band=6/segment=1/data_2/df2.parquet | Bin .../bands/band=7/segment=0/data_2/df2.parquet | Bin .../bands/band=7/segment=1/data_2/df2.parquet | Bin .../bands/band=8/segment=0/data_2/df2.parquet | Bin .../bands/band=8/segment=1/data_2/df2.parquet | Bin .../bands/band=9/segment=0/data_2/df2.parquet | Bin .../bands/band=9/segment=1/data_2/df2.parquet | Bin .../expected/signature_calc/metadata.json | 0 .../test-data/input/data_1/df1.parquet | Bin .../test-data/input/data_2/df2.parquet | Bin .../test_cluster_analysis_transform_python.py | 4 +- .../test_cluster_analysis_transform_ray.py | 6 +- .../test_cluster_analysis_transform_spark.py | 6 +- .../test_data_cleaning_transform_python.py | 4 +- .../test/test_data_cleaning_transform_ray.py | 6 +- .../test_data_cleaning_transform_spark.py | 6 +- ...est_get_duplicate_list_transform_python.py | 4 +- .../test_get_duplicate_list_transform_ray.py | 6 +- ...test_get_duplicate_list_transform_spark.py | 6 +- .../test_signature_calc_transform_python.py | 2 +- .../test/test_signature_calc_transform_ray.py | 6 +- .../test_signature_calc_transform_spark.py | 4 +- 135 files changed, 546 insertions(+), 916 deletions(-) rename transforms/universal/fdedup/{python/Dockerfile => Dockerfile.python} (57%) create mode 100644 transforms/universal/fdedup/Dockerfile.ray create mode 100644 transforms/universal/fdedup/Dockerfile.spark rename transforms/universal/fdedup/{python/src => 
dpk_fdedup}/Murmur_MH.py (100%) rename transforms/universal/fdedup/{python/src/cluster_analysis_local_python.py => dpk_fdedup/cluster_analysis/local_python.py} (97%) rename transforms/universal/fdedup/{ray/src => dpk_fdedup/cluster_analysis/ray}/cluster_estimator.py (100%) rename transforms/universal/fdedup/{ray/src/cluster_analysis_local_ray.py => dpk_fdedup/cluster_analysis/ray/local.py} (95%) rename transforms/universal/fdedup/{ray/src/cluster_analysis_transform_ray.py => dpk_fdedup/cluster_analysis/ray/transform.py} (97%) rename transforms/universal/fdedup/{spark/src/cluster_analysis_local_spark.py => dpk_fdedup/cluster_analysis/spark/local.py} (95%) rename transforms/universal/fdedup/{spark/src/cluster_analysis_transform_spark.py => dpk_fdedup/cluster_analysis/spark/transform.py} (98%) rename transforms/universal/fdedup/{python/src/cluster_analysis_transform.py => dpk_fdedup/cluster_analysis/transform.py} (99%) rename transforms/universal/fdedup/{python/src/cluster_analysis_transform_python.py => dpk_fdedup/cluster_analysis/transform_python.py} (98%) rename transforms/universal/fdedup/{python/src/data_cleaning_local_python.py => dpk_fdedup/data_cleaning/local_python.py} (93%) rename transforms/universal/fdedup/{ray/src/data_cleaning_local_ray.py => dpk_fdedup/data_cleaning/ray/local.py} (93%) rename transforms/universal/fdedup/{ray/src/data_cleaning_transform_ray.py => dpk_fdedup/data_cleaning/ray/transform.py} (99%) rename transforms/universal/fdedup/{spark/src/data_cleaning_local_spark.py => dpk_fdedup/data_cleaning/spark/local.py} (94%) rename transforms/universal/fdedup/{spark/src/data_cleaning_transform_spark.py => dpk_fdedup/data_cleaning/spark/transform.py} (99%) rename transforms/universal/fdedup/{python/src/data_cleaning_transform.py => dpk_fdedup/data_cleaning/transform.py} (100%) rename transforms/universal/fdedup/{python/src/data_cleaning_transform_python.py => dpk_fdedup/data_cleaning/transform_python.py} (98%) rename transforms/universal/fdedup/{ray/src/get_duplicate_list_transform_ray.py => dpk_fdedup/get_duplicate_list/ray/transform.py} (97%) rename transforms/universal/fdedup/{python/src/get_duplicate_list_transform.py => dpk_fdedup/get_duplicate_list/transform.py} (100%) rename transforms/universal/fdedup/{python/src/get_duplicate_list_transform_local_python.py => dpk_fdedup/get_duplicate_list/transform_local_python.py} (96%) rename transforms/universal/fdedup/{python/src/get_duplicate_list_transform_python.py => dpk_fdedup/get_duplicate_list/transform_python.py} (97%) rename transforms/universal/fdedup/{ray/src/fdedup_transform_ray.py => dpk_fdedup/ray/transform.py} (85%) rename transforms/universal/fdedup/{python/src/signature_calc_local_python.py => dpk_fdedup/signature_calc/local_python.py} (97%) rename transforms/universal/fdedup/{ray/src/signature_calc_local_ray.py => dpk_fdedup/signature_calc/ray/local.py} (95%) rename transforms/universal/fdedup/{ray/src/signature_calc_transform_ray.py => dpk_fdedup/signature_calc/ray/transform.py} (94%) rename transforms/universal/fdedup/{spark/src/signature_calc_local_spark.py => dpk_fdedup/signature_calc/spark/local.py} (97%) rename transforms/universal/fdedup/{spark/src/signature_calc_transform_spark.py => dpk_fdedup/signature_calc/spark/transform.py} (94%) rename transforms/universal/fdedup/{python/src/signature_calc_transform.py => dpk_fdedup/signature_calc/transform.py} (99%) rename transforms/universal/fdedup/{python/src/signature_calc_transform_python.py => dpk_fdedup/signature_calc/transform_python.py} (94%) rename 
transforms/universal/fdedup/{spark/src/fdedup_transform_spark.py => dpk_fdedup/spark/transform.py} (100%) rename transforms/universal/fdedup/{python/src/fdedup_transform_python.py => dpk_fdedup/transform_python.py} (98%) rename transforms/universal/fdedup/{ray => }/images/fuzzy.png (100%) delete mode 100644 transforms/universal/fdedup/python/.dockerignore delete mode 100644 transforms/universal/fdedup/python/Makefile delete mode 100644 transforms/universal/fdedup/python/README.md delete mode 100644 transforms/universal/fdedup/python/pyproject.toml delete mode 100644 transforms/universal/fdedup/ray/.dockerignore delete mode 100644 transforms/universal/fdedup/ray/.gitignore delete mode 100644 transforms/universal/fdedup/ray/Dockerfile delete mode 100644 transforms/universal/fdedup/ray/Makefile delete mode 100644 transforms/universal/fdedup/ray/README.md delete mode 100644 transforms/universal/fdedup/ray/pyproject.toml delete mode 100644 transforms/universal/fdedup/ray/requirements.txt rename transforms/universal/fdedup/{python => }/requirements.txt (85%) delete mode 100644 transforms/universal/fdedup/spark/Dockerfile delete mode 100644 transforms/universal/fdedup/spark/Makefile rename transforms/universal/fdedup/spark/{src/requirements.txt => requirements-spark.txt} (100%) delete mode 100644 transforms/universal/fdedup/spark/requirements.txt rename transforms/universal/fdedup/{python => }/test-data/expected/cleaned/data_1/df1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cleaned/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cleaned/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => 
}/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/cluster_analysis/docs_to_remove/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/data_cleaning/cleaned/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/get_list_transform/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet (100%) rename 
transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/expected/signature_calc/metadata.json (100%) rename transforms/universal/fdedup/{python => }/test-data/input/data_1/df1.parquet (100%) rename transforms/universal/fdedup/{python => }/test-data/input/data_2/df2.parquet (100%) rename transforms/universal/fdedup/{python => }/test/test_cluster_analysis_transform_python.py (93%) rename transforms/universal/fdedup/{ray => }/test/test_cluster_analysis_transform_ray.py (91%) rename transforms/universal/fdedup/{spark => }/test/test_cluster_analysis_transform_spark.py (89%) rename transforms/universal/fdedup/{python => }/test/test_data_cleaning_transform_python.py 
(93%) rename transforms/universal/fdedup/{ray => }/test/test_data_cleaning_transform_ray.py (93%) rename transforms/universal/fdedup/{spark => }/test/test_data_cleaning_transform_spark.py (92%) rename transforms/universal/fdedup/{python => }/test/test_get_duplicate_list_transform_python.py (92%) rename transforms/universal/fdedup/{ray => }/test/test_get_duplicate_list_transform_ray.py (89%) rename transforms/universal/fdedup/{spark => }/test/test_get_duplicate_list_transform_spark.py (91%) rename transforms/universal/fdedup/{python => }/test/test_signature_calc_transform_python.py (96%) rename transforms/universal/fdedup/{ray => }/test/test_signature_calc_transform_ray.py (90%) rename transforms/universal/fdedup/{spark => }/test/test_signature_calc_transform_spark.py (95%) diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/Dockerfile.python similarity index 57% rename from transforms/universal/fdedup/python/Dockerfile rename to transforms/universal/fdedup/Dockerfile.python index 79c85e4ac..1a53451d5 100644 --- a/transforms/universal/fdedup/python/Dockerfile +++ b/transforms/universal/fdedup/Dockerfile.python @@ -2,34 +2,21 @@ FROM docker.io/python:3.10.14-slim-bullseye RUN pip install --upgrade --no-cache-dir pip -# install pytest -RUN pip install --no-cache-dir pytest -ARG DPK_WHEEL_FILE_NAME - # Create a user and use it to run the transform RUN useradd -ms /bin/bash dpk USER dpk WORKDIR /home/dpk +ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). COPY --chown=dpk:root data-processing-dist data-processing-dist -RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml -COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root dpk_fdedup/ dpk_fdedup/ COPY --chown=dpk:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt -RUN pip install --no-cache-dir -e . - -# copy source data -COPY ./src/fdedup_transform_python.py fdedup_transform_python.py -COPY ./src/fdedup_transform_python.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ # Set environment ENV PYTHONPATH /home/dpk @@ -38,4 +25,4 @@ ENV PYTHONPATH /home/dpk ARG BUILD_DATE ARG GIT_COMMIT LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT +LABEL git-commit=$GIT_COMMIT \ No newline at end of file diff --git a/transforms/universal/fdedup/Dockerfile.ray b/transforms/universal/fdedup/Dockerfile.ray new file mode 100644 index 000000000..379e45bad --- /dev/null +++ b/transforms/universal/fdedup/Dockerfile.ray @@ -0,0 +1,31 @@ +ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 + +FROM ${BASE_IMAGE} + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). 
+COPY --chown=ray:users data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] + +## Copy the python version of the transform +COPY --chown=ray:users dpk_fdedup/ dpk_fdedup/ +COPY --chown=ray:users requirements.txt requirements.txt +RUN pip install -r requirements.txt + +# Grant non-root users the necessary permissions to the ray directory +RUN chmod 755 /home/ray + +# Set environment +ENV PYTHONPATH /home/ray + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT \ No newline at end of file diff --git a/transforms/universal/fdedup/Dockerfile.spark b/transforms/universal/fdedup/Dockerfile.spark new file mode 100644 index 000000000..d228b6c2d --- /dev/null +++ b/transforms/universal/fdedup/Dockerfile.spark @@ -0,0 +1,40 @@ +FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest + +USER root +# install pytest +RUN pip install --no-cache-dir pytest + +WORKDIR ${SPARK_HOME}/work-dir +ARG DPK_WHEEL_FILE_NAME + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=spark:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark] + + +# Install project source + +## Copy the python version of the transform +COPY --chown=spark:root dpk_fdedup/ dpk_fdedup/ +COPY --chown=spark:root requirements.txt requirements.txt +RUN pip install -r requirements.txt + +RUN mkdir -p /opt/spark/work-dir/src/templates && \ + mkdir -p /opt/spark/work-dir/config +COPY --chown=spark:root spark-deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY --chown=spark:root spark-deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ + + +USER spark + +# Set environment +ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} +ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH} + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT + diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index bca6f7e85..da70ab879 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -1,79 +1,23 @@ REPOROOT=../../.. # Use make help, to see the available rules -include $(REPOROOT)/.make.defaults +include $(REPOROOT)/transforms/.make.cicd.targets -setup:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`) -clean:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse +################################################################################ -build:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -venv:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse -set-versions: - @# Help: Recursively $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -publish:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-save-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -.PHONY: workflow-venv -workflow-venv: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-venv; \ - fi - -.PHONY: workflow-test -workflow-test: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-test; \ - fi - -.PHONY: workflow-upload -workflow-upload: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-upload; \ - fi - -.PHONY: workflow-build -workflow-build: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-build; \ - fi +run-cli-sample: + $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ + RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ + --fdedup_id_column int_id_column" \ + .transforms.run-src-file diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index fed3c1370..afcf3db08 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -1,11 +1,393 @@ -# Fuzzy Deduplication Transform +# Fuzzy Dedup + +Please see the set of +[transform project conventions](../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Contributors +- Nelson Bore (kibnelson@gmail.com) +- Constantin Adam (cmadam@us.ibm.com) + +## Description The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. -Per the set of [transform project conventions](../../README.md#transform-project-conventions) the following runtimes are available: -* [python](python/README.md) - enables running the base transform in a pure python environment -* [ray](ray/README.md) - enables running the base python transform in a Ray runtime -* [spark](spark/README.md) - enables running the base python transform in a spark runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image in a kubernetes cluster using a generated `yaml` file. +Fuzzy dedup is a complex process made up of a pipeline that performs four main steps: + +1. **Signature Calculation**: creates a set of minhashes for each document, and uses them to create band signatures for +the document. +2. **Cluster Analysis**: groups documents into clusters based on matching band signatures.
Within each cluster, it +retains only the documents that have a Jaccard similarity above a specified threshold, and it identifies which documents +to keep as unique and which ones to mark as duplicates. +3. **Duplicate List Generation**: combines the similarity clusters identified in each band to create a single, unified +list of duplicate documents. +4. **Data Cleaning**: processes the documents by either filtering out duplicates or adding annotations to distinguish +duplicates from non-duplicates. + +Each one of these steps is described in more detail below. + +### Signature Calculation + +This transform computes `num_permutations` minhashes and `num_bands` signatures for each document in the dataset, by +following these processing steps: +1. **Shingle Generation**: create a set of character or word shingles, using a specified window length. Character +shingles are more effective at detecting similar documents, but require more computational resources compared to word +shingles. +2. **Minhash Calculation**: using the shingles as input, compute `num_permutations` minhashes for each document. +3. **Band Signature Calculation**: divide the minhashes into `num_bands`, where each band contains +`num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. + +The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard +similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils) folder generates a graph of this +probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact +the deduplication process. A small numeric sketch of this probability function is also included at the end of the **Cluster Analysis** section below. + +To help distribute the workload and speed up processing of the next steps, the hash space of each band is divided into +`num_segments` segments. The band signatures, the minhashes, the document IDs, and the document lengths are stored in an organized +output folder structure `bands/band=b/segment=s`, where `b` is the band number and `s` is the segment number. + +### Cluster Analysis + +This transform leverages segmented processing to analyze the data generated by the **Signature Calculation** step +efficiently and in parallel. Each worker processes a specific segment `s` of a band `b` by loading and analyzing all +Parquet files from the folder `bands/band=b/segment=s`. Each row in the Parquet files contains, for a document: +* `band_hash`, the document's band signature, and +* `data`, a structure with three fields: the unique `document_id`, the document's `minhashes`, and the `document_size`. + +The transform runs the following processing steps: +1. **Data Loading**: combine into a single dataframe all Parquet files in `bands/band=b/segment=s`. +2. **Clustering**: run a `group_by` operation on the `band_hash` column to group documents with the same band +signature into clusters. +3. **Similarity Analysis**: for each cluster, calculate Jaccard similarity between pairs of documents using their +minhashes, and move documents below the specified Jaccard similarity threshold into new clusters. +4. **Duplicate Identification**: in clusters with more than one document remaining, retain the largest document (breaking ties by the +smallest document ID), and mark as duplicates all other documents in the cluster. +5. **Persist Results**: save the duplicate clusters in a file.
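+
+To make the role of these parameters concrete: if two documents have a true Jaccard similarity `s`, and signatures are built from
+`b = num_bands` bands of `r = num_minhashes_per_band` minhashes each, the documents share at least one band signature with
+probability `1 - (1 - s^r)^b`. The sketch below is plain, illustrative Python; it is not taken from the transform's source, and the
+function names are invented for this example. It shows that formula next to the minhash-based Jaccard estimate used when comparing
+documents within a cluster:
+
+```python
+# Illustrative only -- not part of dpk_fdedup.
+
+def band_match_probability(s: float, num_bands: int, num_minhashes_per_band: int) -> float:
+    """Probability that two documents with Jaccard similarity `s` share at
+    least one band signature (the LSH S-curve)."""
+    return 1.0 - (1.0 - s ** num_minhashes_per_band) ** num_bands
+
+
+def estimate_jaccard(minhashes_a: list[int], minhashes_b: list[int]) -> float:
+    """Estimate Jaccard similarity as the fraction of matching minhash slots
+    (both lists are assumed to hold the same number of minhashes)."""
+    matches = sum(a == b for a, b in zip(minhashes_a, minhashes_b))
+    return matches / len(minhashes_a)
+
+
+# With 14 bands of 8 minhashes each (112 permutations, the layout used by the
+# bundled test data), similar documents are very likely to collide in some band,
+# while dissimilar ones rarely do:
+print(band_match_probability(0.8, 14, 8))  # ~0.92
+print(band_match_probability(0.5, 14, 8))  # ~0.05
+```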
+ +### Duplicate List Generation + +The **Cluster Analysis** step identifies duplicates across multiple bands, meaning a document can be marked as a +duplicate in one or more bands (e.g., if two documents are identical, one will be marked as a duplicate in all bands). +This transform consolidates all duplicate information from each band segment into a single file, providing a unified +record of duplicates detected across the dataset. + +### Data Cleaning + +This transform processes the original dataset using the list of duplicate documents generated by the **Duplicate List +Generation** step. It imports each file in the original dataset into a table and produces a new dataset. The directory +structure of the input dataset is preserved, but the contents of the output files depend on the selected operating mode: +1. **Annotate** - adds a new `duplicate` column to the dataset that contains a `d` for documents marked as duplicates +and is empty for non-duplicates. +2. **Filter duplicates** - removes all documents identified as duplicates from the dataset. +3. **Filter non-duplicates** - removes from the dataset all documents that were not marked as duplicates, leaving only +the duplicates. + +The output dataset reflects the selected mode, providing flexibility for downstream processing. + +## Input Columns Used by This Transform + +| Input Column Name | Data Type | Description | +|---------------------------------------------------------------------|-----------|----------------------------------| +| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | +| Column specified by the _document_id_column_ configuration argument | int64 | Column that stores document ID | + +## Output Columns Annotated by This Transform +| Output Column Name | Data Type | Description | +|------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| duplicate | str | Column added if fuzzy dedup runs in 'annotate' mode.
Value is 'd' for duplicate documents, empty for non-duplicates | + +## Configuration and Usage +### Fuzzy Deduplication Transform +The set of dictionary keys holding [Fuzzy Dedup](dpk_fdedup/transform_python.py) configuration for values are as +follows: +```text +--input_folder INPUT_FOLDER + Input folder path +--output_folder OUTPUT_FOLDER + Output folder path +--operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents +--contents_column CONTENTS_COLUMN + name of the column that stores document text +--document_id_column DOCUMENT_ID_COLUMN + name of the column that stores document ID +--seed SEED seed of the random number generator +--num_permutations NUM_PERMUTATIONS + number of permutations to use for minhash calculation +--num_bands NUM_BANDS + number of bands to use for band hash calculation +--num_minhashes_per_band NUM_MINHASHES_PER_BAND + number of minhashes to use in each band +--word_shingle_size WORD_SHINGLE_SIZE + number of words included in one shingle +--jaccard_similarity_threshold JACCARD_SIMILARITY_THRESHOLD + jaccard similarity threshold above which two documents are similar +--num_segments NUM_SEGMENTS + the number of segments dividing the hashing space for each band (for scalability) +--duplicate_list_location DUPLICATE_LIST_LOCATION + path to the file with all the duplicate document ids +--services SERVICES Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning) +--use_s3 USE_S3 use s3 +--s3_cred S3_CRED ast string of options for s3 credentials +--shingle_option SHINGLE_OPTION + Option used for shingling + +``` + +### Signature Calculation Transform +The set of dictionary keys holding [SignatureCalcTransform](dpk_fdedup/signature_calc/transform.py) configuration for values +are as follows: +```text +--minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document +--minhash_contents_column MINHASH_CONTENTS_COLUMN + name of the column storing the contents of each document +--minhash_seed MINHASH_SEED + the seed used to instantiate the random number generator +--minhash_num_permutations MINHASH_NUM_PERMUTATIONS + number of permutations (minhashes) calculated for each document +--minhash_word_shingle_size MINHASH_WORD_SHINGLE_SIZE + the size of the word shingles calculated for each document +--minhash_num_bands MINHASH_NUM_BANDS + the number of bands to use in the banding technique +--minhash_num_minhashes_per_band MINHASH_NUM_MINHASHES_PER_BAND + the number of minhashes to use in each band +--minhash_num_segments MINHASH_NUM_SEGMENTS + the number of segments across which we divide the hashing space for each band +--minhash_shingle_option MINHASH_SHINGLE_OPTION + Shingling option ('word' or 'char') +``` + +### Cluster Analysis Transform +The set of dictionary keys holding [ClusterAnalysisTransform](dpk_fdedup/cluster_analysis/transform.py) configuration for values +are as follows: +```text +--cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD + Jaccard similarity threshold above which two documents are duplicates +--cluster_num_bands CLUSTER_NUM_BANDS + The number of bands used in the banding technique +--cluster_num_segments CLUSTER_NUM_SEGMENTS + The number of segments dividing the hashing space for each band +``` + +### Get Duplicates List Transform +This transform currently has no configuration parameters.
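+
+For reference, the options above combine into a single command line for the orchestrator. The invocation below is only
+illustrative: the entry point follows the `dpk_fdedup` layout introduced by this refactoring, the ID column matches the bundled
+test data, and the numeric values mirror the 14-band / 2-segment layout of the expected test output rather than recommended
+defaults:
+
+```commandline
+python dpk_fdedup/transform_python.py \
+    --input_folder test-data/input \
+    --output_folder output \
+    --document_id_column int_id_column \
+    --operation_mode annotate \
+    --num_permutations 112 \
+    --num_bands 14 \
+    --num_minhashes_per_band 8 \
+    --num_segments 2
+```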
+ +### Data Cleaning Transform +The set of dictionary keys holding [DataCleaningTransform](dpk_fdedup/data_cleaning/transform.py) configuration for values +are as follows: +```text + --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document + --fdclean_operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents +``` + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +python dpk_fdedup/transform_python.py +``` +Alternatively, the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +python dpk_fdedup/signature_calc/local_python.py +python dpk_fdedup/cluster_analysis/local_python.py +python dpk_fdedup/get_duplicate_list/transform_local_python.py +python dpk_fdedup/data_cleaning/local_python.py +``` +After running the transforms, execute: +```shell +ls output +``` +to see the results of the transform. + +### Code example + +This is a [sample notebook](../fdedup_python.ipynb) that shows how to invoke the python fuzzy dedup transform. + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a pure python runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + + +# Fuzzy Dedup - Ray implementation + +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform +configuration, testing and IDE set up. + +## Summary + +This project wraps the python Fuzzy Dedup transform described above with a Ray runtime. + +## Configuration and Command Line Options + +Fuzzy Dedup configuration and command line options are the same as for the base python transform. + +## Running +### Launched Command Line Options +When running the transform with the Ray launcher (i.e. `RayTransformLauncher`), the set of +[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) is available in addition to the transform options described above. + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_ray.py +``` +Alternatively, the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_ray.py +python cluster_analysis_local_ray.py +python get_duplicate_list_local_ray.py +python data_cleaning_local_ray.py +``` +After running the transforms, execute: +```shell +ls output +``` +to see the results of the transform.
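+
+The local scripts above are thin wrappers around the standard data-prep-kit launcher pattern. The fragment below is a rough
+sketch of driving just the signature calculation step programmatically with the Ray launcher; the module and class names come
+from this refactoring, the folder names and parameter values are placeholders, and the exact `ParamsUtils`/launcher API should
+be checked against the library version in use:
+
+```python
+import sys
+
+from data_processing.utils import ParamsUtils
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_fdedup.signature_calc.ray.transform import (
+    SignatureCalculationRayTransformConfiguration,
+)
+
+# Illustrative parameters only; these values are not recommended defaults.
+local_conf = {"input_folder": "test-data/input", "output_folder": "output"}
+params = {
+    "run_locally": True,
+    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
+    "minhash_num_permutations": 112,
+    "minhash_num_bands": 14,
+    "minhash_num_minhashes_per_band": 8,
+    "minhash_num_segments": 2,
+}
+
+if __name__ == "__main__":
+    # Build a CLI-style argument list and hand it to the Ray launcher.
+    sys.argv = ParamsUtils.dict_to_req(d=params)
+    launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration())
+    launcher.launch()
+```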
+ +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Code Example + +This is a [sample notebook](../fdedup_ray.ipynb) that shows how to invoke the ray fuzzy dedup transform. + +## Testing + +For testing fuzzy deduplication in a ray runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + + + +# Fuzzy Dedup -- Spark + +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform +configuration, testing and IDE set up. + +## Summary + +This project wraps the python Fuzzy Dedup transform described above with a Spark runtime. + +## Configuration and Command Line Options + +Fuzzy Dedup configuration and command line options are the same as for the base python transform. + +## Running +### Launched Command Line Options +When running the transform with the Spark launcher (i.e. `SparkTransformLauncher`), the set of +[spark launcher options](../../../data-processing-lib/doc/spark-launcher-options.md) is available in addition to the transform options described above. + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_spark.py +``` +Alternatively, the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_spark.py +python cluster_analysis_local_spark.py +python get_duplicate_list_local_spark.py +python data_cleaning_local_spark.py +``` +After running the transforms, execute: +```shell +ls output +``` +to see the results of the transform. + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Code Example + +This is a [sample notebook](../fdedup_spark.ipynb) that shows how to invoke the spark fuzzy dedup transform. + +## Testing + +For testing fuzzy deduplication in a spark runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + + + +## Further Resources +The following is a list of references to research articles and GitHub repositories that inspired the module's design: -Please check [here](python/README.md) for a more detailed description of this transform. +1.
[Jure Leskovec, Anand Rajaraman, Jeff Ullman, Mining of Massive Datasets, Chapter 3: Finding Similar Items](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) +2. [G Penedo et al., The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/pdf/2406.17557) +3. [Datatrove github repo](https://github.com/huggingface/datatrove) diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/dpk_fdedup/Murmur_MH.py similarity index 100% rename from transforms/universal/fdedup/python/src/Murmur_MH.py rename to transforms/universal/fdedup/dpk_fdedup/Murmur_MH.py diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py similarity index 97% rename from transforms/universal/fdedup/python/src/cluster_analysis_local_python.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py index bb785021c..61302b74a 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/local_python.py @@ -13,7 +13,7 @@ import os import sys -from cluster_analysis_transform_python import ( +from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) from data_processing.runtime.pure_python import PythonTransformLauncher diff --git a/transforms/universal/fdedup/ray/src/cluster_estimator.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/cluster_estimator.py similarity index 100% rename from transforms/universal/fdedup/ray/src/cluster_estimator.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/cluster_estimator.py diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py similarity index 95% rename from transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py index c54ba85c2..a4ec84741 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/local.py @@ -13,7 +13,7 @@ import os import sys -from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py similarity index 97% rename from transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py index a0e8e7de2..10b850192 100644 --- a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/ray/transform.py @@ -13,7 +13,7 @@ import os from typing import Any -from cluster_analysis_transform import ( +from dpk_fdedup.cluster_analysis.transform import ( ClusterAnalysisTransformConfiguration, num_bands_key, num_segments_key, diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py similarity index 95% rename from 
transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py index c9950657c..408220b6b 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/local.py @@ -14,7 +14,7 @@ import sys import polars as pl -from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py similarity index 98% rename from transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py index feeb3241e..97ab7a48f 100644 --- a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/spark/transform.py @@ -13,7 +13,7 @@ import os from typing import Any -from cluster_analysis_transform import ( +from dpk_fdedup.cluster_analysis.transform import ( ClusterAnalysisTransformConfiguration, num_bands_key, num_segments_key, diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py similarity index 99% rename from transforms/universal/fdedup/python/src/cluster_analysis_transform.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py index fa3ce6d28..b414adaa6 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py @@ -24,7 +24,7 @@ UnrecoverableException, get_logger, ) -from Murmur_MH import Murmur_MH +from dpk_fdedup.Murmur_MH import Murmur_MH short_name = "cluster" diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py similarity index 98% rename from transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py index c35c5a711..e882ea6cc 100644 --- a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform_python.py @@ -14,7 +14,7 @@ import time from typing import Any -from cluster_analysis_transform import ( +from dpk_fdedup.cluster_analysis.transform import ( ClusterAnalysisTransformConfiguration, num_bands_key, num_segments_key, diff --git a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py similarity index 93% rename from transforms/universal/fdedup/python/src/data_cleaning_local_python.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py index aa4aabb90..d0976ec76 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/local_python.py @@ -13,11 +13,11 @@ import os import sys -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform 
import ( document_id_column_cli_param, duplicate_list_location_cli_param, ) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py similarity index 93% rename from transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py index b951e2fc8..f72fc0902 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/local.py @@ -13,11 +13,11 @@ import os import sys -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, ) -from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py similarity index 99% rename from transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py index 88171e260..4a4bd52f0 100644 --- a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/ray/transform.py @@ -14,7 +14,7 @@ from typing import Any import ray -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( DataCleaningTransform, DataCleaningTransformConfiguration, dataclean_data_access_key, diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py similarity index 94% rename from transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py index eb1e61845..12c5ab244 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/local.py @@ -18,7 +18,7 @@ document_id_column_cli_param, duplicate_list_location_cli_param, ) -from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py similarity index 99% rename from transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py index 2ff0df8bf..16d184a5e 100644 --- a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py +++
b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/spark/transform.py @@ -13,7 +13,7 @@ import os from typing import Any -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( DataCleaningTransformConfiguration, dataclean_data_access_key, dataclean_data_factory_key, diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform.py similarity index 100% rename from transforms/universal/fdedup/python/src/data_cleaning_transform.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform.py diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py similarity index 98% rename from transforms/universal/fdedup/python/src/data_cleaning_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py index edef8b9c5..e29ef1218 100644 --- a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/data_cleaning/transform_python.py @@ -13,7 +13,7 @@ import os from typing import Any -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( DataCleaningTransformConfiguration, dataclean_data_access_key, dataclean_data_factory_key, diff --git a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/ray/transform.py similarity index 97% rename from transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/ray/transform.py index 40081e658..b53891f66 100644 --- a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/ray/transform.py @@ -20,7 +20,7 @@ RayTransformLauncher, RayTransformRuntimeConfiguration, ) -from get_duplicate_list_transform import ( +from dpk_fdedup.get_duplicate_list.transform import ( GetDuplicateListTransformConfiguration, subfolder_key, ) diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform.py similarity index 100% rename from transforms/universal/fdedup/python/src/get_duplicate_list_transform.py rename to transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform.py diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_local_python.py similarity index 96% rename from transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py rename to transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_local_python.py index 34b18ab04..2ccdec931 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_local_python.py @@ -15,7 +15,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from get_duplicate_list_transform_python import ( +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py 
b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_python.py similarity index 97% rename from transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_python.py index 703ef630e..fe6f0bda6 100644 --- a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/get_duplicate_list/transform_python.py @@ -21,7 +21,7 @@ PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger -from get_duplicate_list_transform import ( +from dpk_fdedup.get_duplicate_list.transform import ( GetDuplicateListTransformConfiguration, subfolder_key, ) diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py similarity index 85% rename from transforms/universal/fdedup/ray/src/fdedup_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/ray/transform.py index be1bf5fcb..a59877b6f 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py @@ -14,20 +14,20 @@ import os import sys -from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration -from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration +from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_python import ServiceOrchestrator, parse_args -from get_duplicate_list_transform_python import ( +from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) -from get_duplicate_list_transform_ray import ( +from dpk_fdedup.get_duplicate_list.ray.transform import ( GetDuplicateListRayRuntime, GetDuplicateListRayTransformConfiguration, ) -from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration +from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration s3_creds = { diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/local_python.py similarity index 97% rename from transforms/universal/fdedup/python/src/signature_calc_local_python.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/local_python.py index be395ed4d..c68f32b71 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_local_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/local_python.py @@ -16,7 +16,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from signature_calc_transform_python import ( +from dpk_fdedup.signature_calc.transform_python import ( SignatureCalculationPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py similarity index 95% rename from transforms/universal/fdedup/ray/src/signature_calc_local_ray.py rename to 
transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py index cb87b56af..2e5b7e2ab 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/local.py @@ -15,7 +15,7 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration +from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration # create parameters diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py similarity index 94% rename from transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py index 678d953f2..9a3b9f42f 100644 --- a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/ray/transform.py @@ -15,7 +15,7 @@ RayTransformRuntimeConfiguration, ) from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher -from signature_calc_transform import SignatureCalculationTransformConfiguration +from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/local.py similarity index 97% rename from transforms/universal/fdedup/spark/src/signature_calc_local_spark.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/local.py index 2db884346..cf817eea4 100644 --- a/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/local.py @@ -16,7 +16,7 @@ import polars as pl from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher -from signature_calc_transform_spark import ( +from dpk_fdedup.signature_calc.spark.transform import ( SignatureCalculationSparkTransformConfiguration, ) diff --git a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py similarity index 94% rename from transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py index 4e39810c6..9b2de7f28 100644 --- a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/spark/transform.py @@ -15,7 +15,7 @@ SparkTransformLauncher, SparkTransformRuntimeConfiguration, ) -from signature_calc_transform import SignatureCalculationTransformConfiguration +from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py similarity index 99% rename from transforms/universal/fdedup/python/src/signature_calc_transform.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py index 4e64bcb5a..d01ee7b85 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform.py +++ 
b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform.py @@ -23,7 +23,7 @@ from data_processing.data_access import DataAccessFactory from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, UnrecoverableException -from Murmur_MH import Murmur_MH +from dpk_fdedup.Murmur_MH import Murmur_MH short_name = "minhash" diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py similarity index 94% rename from transforms/universal/fdedup/python/src/signature_calc_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py index 40e0e97e3..c5a0db954 100644 --- a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/signature_calc/transform_python.py @@ -17,7 +17,7 @@ PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger -from signature_calc_transform import SignatureCalculationTransformConfiguration +from dpk_fdedup.signature_calc.transform import SignatureCalculationTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py b/transforms/universal/fdedup/dpk_fdedup/spark/transform.py similarity index 100% rename from transforms/universal/fdedup/spark/src/fdedup_transform_spark.py rename to transforms/universal/fdedup/dpk_fdedup/spark/transform.py diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/dpk_fdedup/transform_python.py similarity index 98% rename from transforms/universal/fdedup/python/src/fdedup_transform_python.py rename to transforms/universal/fdedup/dpk_fdedup/transform_python.py index def3590e4..7f7b71b82 100644 --- a/transforms/universal/fdedup/python/src/fdedup_transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/transform_python.py @@ -19,16 +19,16 @@ import data_cleaning_transform import get_duplicate_list_transform import signature_calc_transform -from cluster_analysis_transform_python import ( +from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils, get_logger, str2bool -from get_duplicate_list_transform_python import ( +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) -from signature_calc_transform_python import ( +from dpk_fdedup.signature_calc.transform_python import ( SignatureCalculationPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/images/fuzzy.png b/transforms/universal/fdedup/images/fuzzy.png similarity index 100% rename from transforms/universal/fdedup/ray/images/fuzzy.png rename to transforms/universal/fdedup/images/fuzzy.png diff --git a/transforms/universal/fdedup/python/.dockerignore b/transforms/universal/fdedup/python/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/fdedup/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/fdedup/python/Makefile b/transforms/universal/fdedup/python/Makefile deleted file mode 100644 index
05f6bf5ca..000000000 --- a/transforms/universal/fdedup/python/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: .transforms.run-cli-python-sample - -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - -#run-s3-ray-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md deleted file mode 100644 index 4c531476f..000000000 --- a/transforms/universal/fdedup/python/README.md +++ /dev/null @@ -1,244 +0,0 @@ -# Fuzzy Dedup - -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Contributors -- Nelson Bore (kibnelson@gmail.com) -- Constantin Adam (cmadam@us.ibm.com) - -## Description -The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a -set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. - -Fuzzy dedup is a complex process made up of a pipeline that performs four main steps: - -1. **Signature Calculation**: creates a set of minhashes for each document, and uses them to create band signatures for -the document. -2. **Cluster Analysis**: groups documents into clusters based on matching band signatures. Within each cluster, it -retains only the documents that have a Jaccard similarity above a specified threshold, and it identifies which documents -to keep as unique and which ones to mark as duplicates. -3. **Duplicate List Generation**: combines the similarity clusters identified in each band to create a single, unified -list of duplicate documents. -4. **Data Cleaning**: processes the documents by either filtering out duplicates or adding annotations to distinguish -duplicates from non-duplicates. 
- -Each one of these steps is described in more detail below. - -### Signature Calculation - -This transform computes `num_permutations` minhashes and `num_bands` signatures for each document in the dataset, by -following these processing steps: -1. **Shingle Generation**: create a set of character or word shingles, using a specified window length. Character -shingles are more effective at detecting similar documents, but require more computational resources compared to word -shingles. -2. **Minhash Calculation**: using the shingles as input, compute `num_permutations` minhashes for each document. -3. **Band Signature Calculation**: divide the minhashes into `num_bands`, where each band contains -`num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. - -The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard -similarity will be marked as duplicates. A Jupyter notebook in the [utils](../utils) folder generates a graph of this -probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact -the deduplication process. - -To help distribute the workload and speed up processing of the next steps, the hash space of each band is divided into -`num_segments` segments. The band signatures, the minhashes, the document ids, and lengths are stored in an organized -output folder structure `bands/band=b/segment=s`, where `b` is the band number and `s` is the segment number. - -### Cluster Analysis - -This transform leverages segmented processing to analyze the data generated by the **Signature Calculation** step -efficiently and in parallel. Each worker processes a specific segment `s` of a band `b` by loading and analyzing all -Parquet files from the folder `bands/band=b/segment=s`. Each row in the Parquet files contains, for a document: -* `band_hash`, the document's band signature, and -* `data`, a structure with three fields: the unique `document_id`, document's `minhashes`, and `document_size`. - -The transform runs the following processing steps: -1. **Data Loading**: combine into a single dataframe all Parquet files in `bands/band=b/segment=s`. -2. **Clustering**: run a `group_by` operation on the `band_hash` column that will group documents with the same band -signature into clusters. -3. **Similarity Analysis**: for each cluster, calculate Jaccard similarity between pairs of documents using their -minhashes, and move documents below the specified Jaccard similarity threshold into new clusters. -4. **Duplicate Identification**: in clusters with more than one document remaining, retain the largest document with the -smallest document id, and mark as duplicates all other documents in the cluster. -5. **Persist Results**: save the duplicate clusters in a file. - -### Duplicate List Generation - -The **Cluster Analysis** step identifies duplicates across multiple bands, meaning a document can be marked as a -duplicate in one or more bands (e.g., if two documents are identical, one will be marked as a duplicate in all bands). -This transform consolidates all duplicate information from each band segment into a single file, providing a unified -record of duplicates detected across the dataset. - -### Data Cleaning - -This transform processes the original dataset using the list of duplicate documents generated by the **Duplicate List -Generation** step. It imports each file in the original dataset into a table and produces a new dataset. 
The directory -structure of the input dataset is preserved, but the contents of the output files depend on the selected operating mode: -1. **Annotate** - add a new `duplicate` column to the dataset, that contains a `d` for documents marked as duplicates, -and is empty for non-duplicates -2. **Filter duplicates** - removes all documents identified as duplicates from the dataset. -3. **Filter non-duplicates** - removes from the dataset all documents that were not marked as duplicates, leaving only -the duplicates. - -The output dataset reflects the selected mode, providing flexibility for downstream processing. - -## Input Columns Used by This Transform - -| Input Column Name | Data Type | Description | -|---------------------------------------------------------------------|-----------|----------------------------------| -| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | -| Column specified by the _document_id_column_ configuration argument | int64 | Column that stores document ID | - -## Output Columns Annotated by This Transform -| Output Column Name | Data Type | Description | -|------------|-----------|---------------------------------------------------------------------------------------------------------------------| -| duplicate | str | Column added if fuzzy dedup runs in 'annotate' mode. Value is 'd' for duplicate documents, empty for non-duplicates | - -## Configuration and Usage -### Fuzzy Deduplication Transform -The set of dictionary keys holding [Fuzzy Dedup](src/fdedup_transform_python.py) configuration for values are as -follows: -```text ---input_folder INPUT_FOLDER - Input folder path ---output_folder OUTPUT_FOLDER - Output folder path ---operation_mode {filter_duplicates,filter_non_duplicates,annotate} - operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents ---contents_column CONTENTS_COLUMN - name of the column that stores document text ---document_id_column DOCUMENT_ID_COLUMN - name of the column that stores document ID ---seed SEED seed of the random number generator ---num_permutations NUM_PERMUTATIONS - number of permutations to use for minhash calculation ---num_bands NUM_BANDS - number of bands to use for band hash calculation ---num_minhashes_per_band NUM_MINHASHES_PER_BAND - number of minhashes to use in each band ---word_shingle_size WORD_SHINGLE_SIZE - number of words included in one shingle ---jaccard_similarity_threshold JACCARD_SIMILARITY_THRESHOLD - jaccard similarity threshold above which two documents are similar ---num_segments NUM_SEGMENTS - the number of segments dividing the hashing space for each band (for scalability) ---duplicate_list_location DUPLICATE_LIST_LOCATION - path to the file with all the duplicate document ids ---services SERVICES Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning) ---use_s3 USE_S3 use s3 ---s3_cred S3_CRED ast string of options for s3 credentials ---shingle_option SHINGLE_OPTION - Option used for shingling - -``` - -### Signature Calculation Transform -The set of dictionary keys holding [SignatureCalcTransform](src/signature_calc_transform.py) configuration for values -are as follows: -```text ---minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN - name of the column storing the unique ID assigned to each document ---minhash_contents_column MINHASH_CONTENTS_COLUMN - name of the column storing the contents of each document ---minhash_seed 
MINHASH_SEED - the seed used to instantiate the random number generator ---minhash_num_permutations MINHASH_NUM_PERMUTATIONS - number of permutations (minhashes) calculated for each document ---minhash_word_shingle_size MINHASH_WORD_SHINGLE_SIZE - the size of the word shingles calculated for each document ---minhash_num_bands MINHASH_NUM_BANDS - the number of bands to use in the banding technique ---minhash_num_minhashes_per_band MINHASH_NUM_MINHASHES_PER_BAND - the number of minhashes to use in each band ---minhash_num_segments MINHASH_NUM_SEGMENTS - the number of segments across which we divide the hashing space for each band ---minhash_shingle_option MINHASH_SHINGLE_OPTION - Shingling option ('word' or 'char') -``` - -### Cluster Analysis Transform -The set of dictionary keys holding [ClusterAnalysisTransform](src/cluster_analysis_transform.py) configuration for values -are as follows: -```text ---cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD - Jaccard similarity threshold above which two documents are duplicates ---cluster_num_bands CLUSTER_NUM_BANDS - The number of bands used in the banding technique ---cluster_num_segments CLUSTER_NUM_SEGMENTS - The number of segments dividing the hashing space for each band -``` - -### Get Duplicates List Transform -This transform currently has no configuration parameters. - -### Data Cleaning Transform -The set of dictionary keys holding [DataCleaningTransform](src/data_cleaning_transform.py) configuration for values -are as follows: -```text - --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN - name of the column storing the unique ID assigned to each document - --fdclean_operation_mode {filter_duplicates,filter_non_duplicates,annotate} - operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents -``` - -### Running the samples -To run the samples, use the following `make` target to create a virtual environment: - -```commandline -make venv -``` -Subsequently, the main orchestration program can run with: -```commandline -source venv/bin/activate -cd src -python fdedup_transform_python.py -``` -Alternatively the transforms included in fuzzy dedup can be launched independently: -```commandline -source venv/bin/activate -cd src -python signature_calc_local_python.py -python cluster_analysis_local_python.py -python get_duplicate_list_local_python.py -python data_cleaning_local_python.py -``` -After running the transforms, execute: -```shell -ls output -``` -To see results of the transform. - -### Code example - -This is a [sample notebook](../fdedup_python.ipynb) that shows how to invoke the python fuzzy dedup transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. - -## Testing - -For testing fuzzy deduplication in a pure python runtime, use the following `make` targets. 
To launch integration tests -for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data -cleaning) use: -```commandline -make test-src -``` - -To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that -image, use: -```commandline -make test-image -``` - -## Further Resources -The following is a list of references to research articles and github repositories that inspired the module's design: - -1. [Jure Leskovec, Anand Rajaraman, Jeff Ullman, Mining of Massive Datasets, Chapter 3: Finding Similar Items](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) -2. [G Penedo et al., The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/pdf/2406.17557) -3. [Datatrove github repo](https://github.com/huggingface/datatrove) diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml deleted file mode 100644 index 81f39ebb0..000000000 --- a/transforms/universal/fdedup/python/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_fdedup_transform_python" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "Fuzzy Dedup Transform for Python" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, - { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/fdedup/ray/.dockerignore b/transforms/universal/fdedup/ray/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/fdedup/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/fdedup/ray/.gitignore b/transforms/universal/fdedup/ray/.gitignore deleted file mode 100644 index 3ea7fd4ab..000000000 --- a/transforms/universal/fdedup/ray/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile deleted file mode 100644 index 9a447e2db..000000000 --- a/transforms/universal/fdedup/ray/Dockerfile +++ /dev/null 
@@ -1,51 +0,0 @@ -ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 -FROM ${BASE_IMAGE} - -RUN pip install --upgrade --no-cache-dir pip - -# install pytest -RUN pip install --no-cache-dir pytest -ARG DPK_WHEEL_FILE_NAME - -# Copy and install data processing libraries -# These are expected to be placed in the docker context before this is run (see the make image). -COPY --chown=ray:users data-processing-dist data-processing-dist -RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] - -## Copy the python version of the tansform -COPY --chown=ray:users python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -# Install ray project source -COPY --chown=ray:users src/ src/ -COPY --chown=ray:users pyproject.toml pyproject.toml -COPY --chown=ray:users README.md README.md -COPY --chown=ray:users requirements.txt requirements.txt -RUN pip install --no-cache-dir -e . - -# copy source files needed by test-image -COPY --chown=ray:users ./src/fdedup_transform_ray.py fdedup_transform_ray.py -COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py -COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py -COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py -COPY --chown=ray:users ./src/data_cleaning_transform_ray.py data_cleaning_transform_ray.py -COPY --chown=ray:users ./src/signature_calc_local_ray.py local/fdedup_local_ray.py - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ - -# Grant non-root users the necessary permissions to the ray directory -RUN chmod 755 /home/ray - -USER root -RUN chmod a+rwx /home/ray -USER ray -# Set environment -ENV PYTHONPATH /home/ray - -# Put these at the end since they seem to upset the docker cache. -ARG BUILD_DATE -ARG GIT_COMMIT -LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile deleted file mode 100644 index ec193b6c3..000000000 --- a/transforms/universal/fdedup/ray/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} - -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -test-image:: .transforms.ray-test-image - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -# TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --fdedup_id_column int_id_column" \ - .transforms.run-src-file - -run-local-sample: .transforms.run-local-ray-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/ray/README.md b/transforms/universal/fdedup/ray/README.md deleted file mode 100644 index 298ac39ba..000000000 --- a/transforms/universal/fdedup/ray/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# Fuzzy Dedup - -Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform -configuration, testing and IDE set up. - -## Summary - -This project wraps the [Fuzzy Dedup transform](../python) with a Ray runtime. - -## Configuration and command line Options - -Fuzzy Dedup configuration and command line options are the same as for the base python transform. - -## Running -### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -In addition to those available to the transform as defined in [here](../python/README.md), -the set of -[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. - -### Running the samples -To run the samples, use the following `make` target to create a virtual environment: - -```commandline -make venv -``` -Subsequently, the main orchestration program can run with: -```commandline -source venv/bin/activate -cd src -python fdedup_transform_ray.py -``` -Alternatively the transforms included in fuzzy dedup can be launched independently: -```commandline -source venv/bin/activate -cd src -python signature_calc_local_ray.py -python cluster_analysis_local_ray.py -python get_duplicate_list_local_ray.py -python data_cleaning_local_ray.py -``` -After running the transforms, execute: -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. - -## Code Example - -This is a [sample notebook](../fdedup_ray.ipynb) that shows how to invoke the ray fuzzy dedup transform. 
- -## Testing - -For testing fuzzy deduplication in a ray runtime, use the following `make` targets. To launch integration tests -for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data -cleaning) use: -```commandline -make test-src -``` - -To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that -image, use: -```commandline -make test-image -``` \ No newline at end of file diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml deleted file mode 100644 index 19da8a690..000000000 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_fdedup_transform_ray" -version = "0.2.4.dev0" -requires-python = ">=3.10,<3.13" -description = "fdedup Ray Transform" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, - { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt deleted file mode 100644 index 782ef76e2..000000000 --- a/transforms/universal/fdedup/ray/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -data-prep-toolkit[ray]>=0.2.3 -dpk_fdedup_transform_python==0.2.4.dev0 -mmh3>=4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/requirements.txt similarity index 85% rename from transforms/universal/fdedup/python/requirements.txt rename to transforms/universal/fdedup/requirements.txt index 2c6bb5f36..b28fac859 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -1,4 +1,3 @@ -data-prep-toolkit>=0.2.3 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile deleted file mode 100644 index b04994d46..000000000 --- a/transforms/universal/fdedup/spark/Dockerfile +++ /dev/null @@ -1,51 +0,0 @@ -ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 -FROM ${BASE_IMAGE} - -# install pytest -RUN pip install --no-cache-dir pytest -ARG DPK_WHEEL_FILE_NAME - -WORKDIR ${SPARK_HOME}/work-dir - -# Copy in the data processing framework source/project and install it -# This is expected to be placed in the docker context before this is run (see the make image). 
-COPY --chown=spark:root data-processing-dist data-processing-dist -RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark] - -## Copy the python version of the tansform -COPY --chown=spark:root python-transform/ python-transform/ -RUN cd python-transform && pip install --no-cache-dir -e . - -# Install spark project source -COPY --chown=spark:root src/ src/ -COPY --chown=spark:root pyproject.toml pyproject.toml -COPY --chown=spark:root README.md README.md -RUN mkdir -p /opt/spark/work-dir/src/templates && \ - mkdir -p /opt/spark/work-dir/config -COPY --chown=spark:root deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ -COPY --chown=spark:root deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ - -# install requirements from requirements.txt -COPY requirements.txt . -RUN pip3 install -r requirements.txt - -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY ./src/fdedup_transform_spark.py . - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ - -USER spark - -# Set environment -ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} -ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH} - -# Put these at the end since they seem to upset the docker cache. -ARG BUILD_DATE -ARG GIT_COMMIT -LABEL build-date=$BUILD_DATE -LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile deleted file mode 100644 index ac2735e7d..000000000 --- a/transforms/universal/fdedup/spark/Makefile +++ /dev/null @@ -1,57 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.spark-venv - -test:: .transforms.spark-test - -clean:: .transforms.clean - -image:: .transforms.spark-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-spark - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.spark-test-image - -run-cli-sample: .transforms.run-cli-spark-sample - -run-local-sample: .transforms.run-local-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md index 1b02ddd00..f1cf31ff0 100644 --- a/transforms/universal/fdedup/spark/README.md +++ b/transforms/universal/fdedup/spark/README.md @@ -1,4 +1,4 @@ -# Fuzzy Dedup +# Fuzzy Dedup -- Spark Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform configuration, testing and IDE set up. diff --git a/transforms/universal/fdedup/spark/src/requirements.txt b/transforms/universal/fdedup/spark/requirements-spark.txt similarity index 100% rename from transforms/universal/fdedup/spark/src/requirements.txt rename to transforms/universal/fdedup/spark/requirements-spark.txt diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt deleted file mode 100644 index e12366dd6..000000000 --- a/transforms/universal/fdedup/spark/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -dpk_fdedup_transform_python==0.2.4.dev0 -data-prep-toolkit[spark]>=0.2.3 -pyyaml>=6.0.2 -boto3>=1.34.69 -kubernetes>=30.1.0 -polars==1.9.0 -disjoint-set>=0.8.0 -numpy<1.29.0 -sentencepiece>=0.2.0 -mmh3>=4.1.0 -scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/test-data/expected/cleaned/data_1/df1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet rename to transforms/universal/fdedup/test-data/expected/cleaned/data_1/df1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/cleaned/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/cleaned/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json b/transforms/universal/fdedup/test-data/expected/cleaned/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json rename to transforms/universal/fdedup/test-data/expected/cleaned/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet similarity index 100% rename 
from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet similarity index 100% rename from 
transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet rename to 
transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet diff --git 
a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json rename to transforms/universal/fdedup/test-data/expected/cluster_analysis/docs_to_remove/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet 
b/transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet rename to transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json rename to transforms/universal/fdedup/test-data/expected/data_cleaning/cleaned/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet rename to transforms/universal/fdedup/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet rename to transforms/universal/fdedup/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/test-data/expected/get_list_transform/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json rename to transforms/universal/fdedup/test-data/expected/get_list_transform/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/test-data/expected/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/metadata.json rename to transforms/universal/fdedup/test-data/expected/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet diff --git 
a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet 
b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet similarity 
index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet similarity index 100% rename from 
transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/test-data/expected/signature_calc/metadata.json similarity index 100% rename from transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json rename to transforms/universal/fdedup/test-data/expected/signature_calc/metadata.json diff --git a/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet b/transforms/universal/fdedup/test-data/input/data_1/df1.parquet similarity index 100% rename from transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet rename to transforms/universal/fdedup/test-data/input/data_1/df1.parquet diff --git a/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet b/transforms/universal/fdedup/test-data/input/data_2/df2.parquet similarity index 100% rename from 
transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet rename to transforms/universal/fdedup/test-data/input/data_2/df2.parquet diff --git a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py similarity index 93% rename from transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py rename to transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py index cecd224fe..c14329703 100644 --- a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py +++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_python.py @@ -12,8 +12,8 @@ import os -from cluster_analysis_transform import sort_output_cli_param -from cluster_analysis_transform_python import ( +from dpk_fdedup.cluster_analysis.transform import sort_output_cli_param +from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) from data_processing.runtime.pure_python import PythonTransformLauncher diff --git a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py similarity index 91% rename from transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py rename to transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py index a3771fbd8..5cfddfc65 100644 --- a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py +++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_ray.py @@ -12,13 +12,13 @@ import os -from cluster_analysis_transform import ( +from dpk_fdedup.cluster_analysis.transform import ( jaccard_similarity_threshold_cli_param, num_bands_cli_param, num_segments_cli_param, sort_output_cli_param, ) -from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from dpk_fdedup.cluster_analysis.ray.transform import ClusterAnalysisRayTransformConfiguration from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -32,7 +32,7 @@ class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data")) config = { "run_locally": True, num_bands_cli_param: 14, diff --git a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py similarity index 89% rename from transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py rename to transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py index 294c86f25..990b0cf7b 100644 --- a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py +++ b/transforms/universal/fdedup/test/test_cluster_analysis_transform_spark.py @@ -12,8 +12,8 @@ import os -from cluster_analysis_transform import sort_output_cli_param -from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from dpk_fdedup.cluster_analysis.transform import sort_output_cli_param +from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration from data_processing.test_support.launch.transform_test import ( 
AbstractTransformLauncherTest, ) @@ -27,7 +27,7 @@ class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data")) config = { "cluster_num_bands": 14, "cluster_num_segments": 2, diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_python.py similarity index 93% rename from transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py rename to transforms/universal/fdedup/test/test_data_cleaning_transform_python.py index 8c4debed9..faa5e8924 100644 --- a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py +++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_python.py @@ -12,11 +12,11 @@ import os -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, ) -from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, diff --git a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py similarity index 93% rename from transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py rename to transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py index a62105b2c..960127e51 100644 --- a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py +++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_ray.py @@ -12,12 +12,12 @@ import os -from data_cleaning_transform import ( +from dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, operation_mode_cli_param, ) -from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from dpk_fdedup.data_cleaning.ray.transform import DataCleaningRayTransformConfiguration from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -31,7 +31,7 @@ class TestRayDataCleaningTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data")) duplicate_location = os.path.abspath( os.path.join( os.path.dirname(__file__), diff --git a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py similarity index 92% rename from transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py rename to transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py index 919857e23..9639980b3 100644 --- a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py +++ b/transforms/universal/fdedup/test/test_data_cleaning_transform_spark.py @@ -12,12 +12,12 @@ import os -from data_cleaning_transform import ( +from 
dpk_fdedup.data_cleaning.transform import ( document_id_column_cli_param, duplicate_list_location_cli_param, operation_mode_cli_param, ) -from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) @@ -31,7 +31,7 @@ class TestSparkDataCleaningTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data")) duplicate_location = os.path.abspath( os.path.join( os.path.dirname(__file__), diff --git a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_python.py similarity index 92% rename from transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py rename to transforms/universal/fdedup/test/test_get_duplicate_list_transform_python.py index 4b59e3a7a..e5ab9e6a0 100644 --- a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py +++ b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_python.py @@ -16,8 +16,8 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_python import ( +from dpk_fdedup.get_duplicate_list.transform import sort_output_cli_param +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py similarity index 89% rename from transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py rename to transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py index 55869598c..017e560b0 100644 --- a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py +++ b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_ray.py @@ -16,8 +16,8 @@ AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_ray import GetDuplicateListRayTransformConfiguration +from dpk_fdedup.get_duplicate_list.transform import sort_output_cli_param +from dpk_fdedup.get_duplicate_list.ray.transform import GetDuplicateListRayTransformConfiguration class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): @@ -27,7 +27,7 @@ class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data")) config = { "run_locally": True, sort_output_cli_param: True, diff --git a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_spark.py similarity index 91% rename from 
transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py rename to transforms/universal/fdedup/test/test_get_duplicate_list_transform_spark.py index 4b59e3a7a..b64ebb116 100644 --- a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py +++ b/transforms/universal/fdedup/test/test_get_duplicate_list_transform_spark.py @@ -16,8 +16,8 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from get_duplicate_list_transform import sort_output_cli_param -from get_duplicate_list_transform_python import ( +from dpk_fdedup.get_duplicate_list.transform import sort_output_cli_param +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) @@ -29,7 +29,7 @@ class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data")) config = { sort_output_cli_param: True, } diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/test/test_signature_calc_transform_python.py similarity index 96% rename from transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py rename to transforms/universal/fdedup/test/test_signature_calc_transform_python.py index 9ad8a32d7..2b6c49e31 100644 --- a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py +++ b/transforms/universal/fdedup/test/test_signature_calc_transform_python.py @@ -17,7 +17,7 @@ AbstractTransformLauncherTest, ) from data_processing.utils import ParamsUtils -from signature_calc_transform_python import ( +from dpk_fdedup.signature_calc.transform_python import ( SignatureCalculationPythonTransformConfiguration, ) diff --git a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/test/test_signature_calc_transform_ray.py similarity index 90% rename from transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py rename to transforms/universal/fdedup/test/test_signature_calc_transform_ray.py index 34f3ee403..8c08eb938 100644 --- a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py +++ b/transforms/universal/fdedup/test/test_signature_calc_transform_ray.py @@ -17,12 +17,12 @@ ) from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from signature_calc_transform import ( +from dpk_fdedup.signature_calc.transform import ( num_bands_cli_param, num_permutations_cli_param, num_segments_cli_param, ) -from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration +from dpk_fdedup.signature_calc.ray.transform import SignatureCalculationRayTransformConfiguration class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): @@ -32,7 +32,7 @@ class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../ray/test-data")) config = { "run_locally": True, num_permutations_cli_param: 112, diff --git a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py 
b/transforms/universal/fdedup/test/test_signature_calc_transform_spark.py similarity index 95% rename from transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py rename to transforms/universal/fdedup/test/test_signature_calc_transform_spark.py index 6d93dc7a9..af8f36aa9 100644 --- a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py +++ b/transforms/universal/fdedup/test/test_signature_calc_transform_spark.py @@ -17,7 +17,7 @@ ) from data_processing.utils import ParamsUtils from data_processing_spark.runtime.spark import SparkTransformLauncher -from signature_calc_transform_spark import ( +from dpk_fdedup.signature_calc.spark.transform import ( SignatureCalculationSparkTransformConfiguration, ) @@ -29,7 +29,7 @@ class TestSparkSignatureCalcTransform(AbstractTransformLauncherTest): """ def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../spark/test-data")) config = { "minhash_num_permutations": 112, "minhash_num_bands": 14, From 470152f7018bcfaaaf7374603f40cc73e0838de9 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 16:26:26 -0500 Subject: [PATCH 2/6] update from Shahrokh Signed-off-by: Maroun Touma --- transforms/universal/fdedup/README.md | 86 +++++++++++++-------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index afcf3db08..fb36adbc2 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -1,7 +1,7 @@ # Fuzzy Dedup Please see the set of -[transform project conventions](../../../README.md) +[transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. @@ -39,7 +39,7 @@ shingles. `num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard -similarity will be marked as duplicates. A Jupyter notebook in the [utils](../utils) folder generates a graph of this +similarity will be marked as duplicates. A Jupyter notebook in the [utils](utils/calc_r_and_b.ipynb) folder generates a graph of this probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact the deduplication process. 
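For readers who want a quick feel for that probability function without opening the notebook, the sketch below (illustrative only, not part of this patch) evaluates the standard LSH banding formula: with `b = num_bands` bands of `r = num_minhashes_per_band` minhashes each, a pair of documents with Jaccard similarity `s` gets an identical band signature at least once with probability `1 - (1 - s^r)^b`.

```python
# Illustrative sketch of the banding probability discussed above (not part of the patch).
def detection_probability(s: float, num_bands: int = 14, num_minhashes_per_band: int = 8) -> float:
    """Probability that a pair with Jaccard similarity `s` shares a signature in at least one band."""
    return 1.0 - (1.0 - s**num_minhashes_per_band) ** num_bands


if __name__ == "__main__":
    # With the defaults above (14 bands of 8 minhashes), the curve rises steeply
    # around the 0.75 Jaccard similarity threshold used by this transform.
    for s in (0.5, 0.7, 0.75, 0.8, 0.9):
        print(f"similarity={s:.2f} -> P(flagged as near-duplicate)={detection_probability(s):.3f}")
```

Raising `num_minhashes_per_band` shifts the curve right (fewer false positives), while raising `num_bands` shifts it left (fewer false negatives), which is the trade-off the notebook referenced above lets users explore.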
@@ -99,7 +99,7 @@ The output dataset reflects the selected mode, providing flexibility for downstr ## Configuration and Usage ### Fuzzy Deduplication Transform -The set of dictionary keys holding [Fuzzy Dedup](src/fdedup_transform_python.py) configuration for values are as +The set of dictionary keys holding [Fuzzy Dedup](dpk_fdedup/transform_python.py) configuration for values are as follows: ```text --input_folder INPUT_FOLDER @@ -136,7 +136,7 @@ follows: ``` ### Signature Calculation Transform -The set of dictionary keys holding [SignatureCalcTransform](src/signature_calc_transform.py) configuration for values +The set of dictionary keys holding [SignatureCalcTransform](dpk_fdedup/signature_calc/transform.py) configuration for values are as follows: ```text --minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN @@ -160,7 +160,7 @@ are as follows: ``` ### Cluster Analysis Transform -The set of dictionary keys holding [ClusterAnalysisTransform](src/cluster_analysis_transform.py) configuration for values +The set of dictionary keys holding [ClusterAnalysisTransform](dpk_fdedup/cluster_analysis/transform.py) configuration for values are as follows: ```text --cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD @@ -175,7 +175,7 @@ are as follows: This transform currently has no configuration parameters. ### Data Cleaning Transform -The set of dictionary keys holding [DataCleaningTransform](src/data_cleaning_transform.py) configuration for values +The set of dictionary keys holding [DataCleaningTransform](dpk_fdedup/data_cleaning/transform.py) configuration for values are as follows: ```text --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN @@ -193,17 +193,17 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd src -python fdedup_transform_python.py +cd dpk_fdedup +python transform_python.py ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd src -python signature_calc_local_python.py -python cluster_analysis_local_python.py -python get_duplicate_list_local_python.py -python data_cleaning_local_python.py +cd dpk_fdedup +python signature_calc/local_python.py +python cluster_analysis/local_python.py +python get_duplicate_list/transform_local_python.py +python data_cleaning/local_python.py ``` After running the transforms, execute: ```shell @@ -213,12 +213,12 @@ To see results of the transform. ### Code example -This is a [sample notebook](../fdedup_python.ipynb) that shows how to invoke the python fuzzy dedup transform. +This is a [sample notebook](fdedup_python.ipynb) that shows how to invoke the python fuzzy dedup transform. ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. ## Testing @@ -239,12 +239,12 @@ make test-image # Fuzzy Dedup - Ray implementation -Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. 
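Before the Ray-specific notes below, here is a small, hypothetical sketch of driving the pure-Python orchestrator described above from code instead of the shell. It uses only the two names this patch exposes from `dpk_fdedup.transform_python` (`parse_args` and `ServiceOrchestrator`); the folder paths are placeholders, the flag list is an illustrative subset of the configuration keys listed earlier, and the `global_params=` keyword and `orchestrate()` call are assumptions that should be checked against `dpk_fdedup/transform_python.py`.

```python
# Hypothetical driver; mirrors the CLI invocation rather than defining a new API.
import sys

from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args

# Placeholder folders and an illustrative subset of the configuration keys above.
sys.argv = [
    "fdedup_transform_python",
    "--input_folder", "test-data/input",
    "--output_folder", "output",
    "--contents_column", "contents",
    "--document_id_column", "int_id_column",
    "--operation_mode", "filter_duplicates",
]

args = parse_args()  # assumed to read sys.argv like the command-line entry point
# Assumed constructor keyword and method name; verify against dpk_fdedup/transform_python.py.
ServiceOrchestrator(global_params=args).orchestrate()
```

This appears to be the same pattern the sample notebook follows before inspecting the `output/cleaned` folder.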
## Summary -This project wraps the [Fuzzy Dedup transform](../python) with a Ray runtime. +This project wraps the Fuzzy Dedup transform with a Ray runtime. ## Configuration and command line Options @@ -252,10 +252,10 @@ Fuzzy Dedup configuration and command line options are the same as for the base ## Running ### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -In addition to those available to the transform as defined in [here](../python/README.md), +When running the transform with the Ray launcher (i.e., TransformLauncher), +in addition to those available to the transform as defined in here, the set of -[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. +[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) are available. ### Running the samples To run the samples, use the following `make` target to create a virtual environment: @@ -266,17 +266,17 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd src -python fdedup_transform_ray.py +cd dpk_fdedup +python ray/transform.py ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd src -python signature_calc_local_ray.py -python cluster_analysis_local_ray.py -python get_duplicate_list_local_ray.py -python data_cleaning_local_ray.py +cd dpk_fdedup +python signature_calc/ray/local.py +python cluster_analysis/ray/local.py +python get_duplicate_list/ray/tarnsform.py +python data_cleaning/ray/local.py ``` After running the transforms, execute: ```shell @@ -287,12 +287,12 @@ To see results of the transform. ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. ## Code Example -This is a [sample notebook](../fdedup_ray.ipynb) that shows how to invoke the ray fuzzy dedup transform. +This is a [sample notebook](fdedup_ray.ipynb) that shows how to invoke the ray fuzzy dedup transform. ## Testing @@ -313,12 +313,12 @@ make test-image # Fuzzy Dedup -- Spark -Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. ## Summary -This project wraps the [Fuzzy Dedup transform](../python) with a Spark runtime. +This project wraps the Fuzzy Dedup transform with a Spark runtime. ## Configuration and command line Options @@ -326,10 +326,10 @@ Fuzzy Dedup configuration and command line options are the same as for the base ## Running ### Launched Command Line Options -When running the transform with the Spark launcher (i.e. TransformLauncher), -In addition to those available to the transform as defined in [here](../python/README.md), +When running the transform with the Spark launcher (i.e., TransformLauncher), +in addition to those available to the transform as defined in here, the set of -[spark launcher](../../../../data-processing-lib/doc/spark-launcher-options.md) are available. 
+[spark launcher options](../../../data-processing-lib/doc/spark-launcher-options.md) are available. ### Running the samples To run the samples, use the following `make` target to create a virtual environment: @@ -340,17 +340,17 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd src -python fdedup_transform_spark.py +cd dpk_fdedup +python spark/transform.py ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd src -python signature_calc_local_spark.py -python cluster_analysis_local_spark.py -python get_duplicate_list_local_spark.py -python data_cleaning_local_spark.py +cd dpk_fdedup +python signature_calc/spark/local.py +python cluster_analysis/spark/local.py +python get_duplicate_list/spark/transform.py +python data_cleanin/spark/local.py ``` After running the transforms, execute: ```shell @@ -361,12 +361,12 @@ To see results of the transform. ### Transforming data using the transform image To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +[running images quickstart](../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. ## Code Example -This is a [sample notebook](../fdedup_spark.ipynb) that shows how to invoke the spark fuzzy dedup transform. +This is a [sample notebook](fdedup_spark.ipynb) that shows how to invoke the spark fuzzy dedup transform. ## Testing From b95e99e6d9e63de444fc50da666d64b36ce56d7a Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 18:03:00 -0500 Subject: [PATCH 3/6] more fixes Signed-off-by: Maroun Touma --- transforms/universal/fdedup/Makefile | 8 +- transforms/universal/fdedup/README.md | 36 +- .../fdedup/dpk_fdedup/spark/transform.py | 10 +- .../fdedup/dpk_fdedup/transform_python.py | 24 +- .../universal/fdedup/fdedup_python.ipynb | 464 ++++++++++++++---- transforms/universal/fdedup/fdedup_ray.ipynb | 4 +- .../universal/fdedup/fdedup_spark.ipynb | 4 +- transforms/universal/fdedup/kfp_ray/Makefile | 47 +- .../universal/fdedup/kfp_ray/fdedup_wf.py | 8 +- 9 files changed, 436 insertions(+), 169 deletions(-) diff --git a/transforms/universal/fdedup/Makefile b/transforms/universal/fdedup/Makefile index da70ab879..477d282e1 100644 --- a/transforms/universal/fdedup/Makefile +++ b/transforms/universal/fdedup/Makefile @@ -14,10 +14,4 @@ TRANSFORM_NAME=$(shell basename `pwd`) ################################################################################ - - -un-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/input', 'output_folder' : '../output'}\" \ - --fdedup_id_column int_id_column" \ - .transforms.run-src-file + \ No newline at end of file diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index fb36adbc2..93d032e07 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -193,17 +193,15 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd dpk_fdedup -python transform_python.py +python -m dpk_fdedup.transform_python ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd dpk_fdedup -python signature_calc/local_python.py 
-python cluster_analysis/local_python.py -python get_duplicate_list/transform_local_python.py -python data_cleaning/local_python.py +python -m dpk_fdedup.signature_calc.local_python +python -m dpk_fdedup.cluster_analysis.local_python +python -m dpk_fdedup.get_duplicate_list.transform_local_python +python -m dpk_fdedup.data_cleaning.local_python ``` After running the transforms, execute: ```shell @@ -266,17 +264,15 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd dpk_fdedup -python ray/transform.py +python -m dpk_fdedup.ray.transform ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd dpk_fdedup -python signature_calc/ray/local.py -python cluster_analysis/ray/local.py -python get_duplicate_list/ray/tarnsform.py -python data_cleaning/ray/local.py +python -m dpk_fdedup.signature_calc.ray.local +python -m dpk_fdedup.cluster_analysis.ray.local +python -m dpk_fdedup.get_duplicate_list.ray.tarnsform +python -m dpk_fdedup.data_cleaning.ray.local ``` After running the transforms, execute: ```shell @@ -340,17 +336,15 @@ make venv Subsequently, the main orchestration program can run with: ```commandline source venv/bin/activate -cd dpk_fdedup -python spark/transform.py +python -m dpk_fdedup.spark.transform ``` Alternatively the transforms included in fuzzy dedup can be launched independently: ```commandline source venv/bin/activate -cd dpk_fdedup -python signature_calc/spark/local.py -python cluster_analysis/spark/local.py -python get_duplicate_list/spark/transform.py -python data_cleanin/spark/local.py +python -m dpk_fdedup.signature_calc.spark.local +python -m dpk_fdedup.cluster_analysis.spark.local +python -m dpk_fdedup.get_duplicate_list.transform +python -m dpk_fdedup.data_cleaning.spark.local ``` After running the transforms, execute: ```shell diff --git a/transforms/universal/fdedup/dpk_fdedup/spark/transform.py b/transforms/universal/fdedup/dpk_fdedup/spark/transform.py index 82767f849..77eff4d74 100644 --- a/transforms/universal/fdedup/dpk_fdedup/spark/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/spark/transform.py @@ -14,15 +14,15 @@ import os import sys -from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration -from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from dpk_fdedup.cluster_analysis.spark.transform import ClusterAnalysisSparkTransformConfiguration +from dpk_fdedup.data_cleaning.spark.transform import DataCleaningSparkTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing_spark.runtime.spark import SparkTransformLauncher -from fdedup_transform_python import ServiceOrchestrator, parse_args -from get_duplicate_list_transform_python import ( +from dpk_fdedup.transform_python import ServiceOrchestrator, parse_args +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) -from signature_calc_transform_spark import ( +from dpk_fdedup.signature_calc.spark.transform import ( SignatureCalculationSparkTransformConfiguration, ) diff --git a/transforms/universal/fdedup/dpk_fdedup/transform_python.py b/transforms/universal/fdedup/dpk_fdedup/transform_python.py index 7f7b71b82..dbbcf39e6 100644 --- a/transforms/universal/fdedup/dpk_fdedup/transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/transform_python.py @@ -15,20 +15,20 @@ import os import sys 
-import cluster_analysis_transform -import data_cleaning_transform -import get_duplicate_list_transform -import signature_calc_transform -from cluster_analysis.transform_python import ( +import dpk_fdedup.cluster_analysis.transform +import dpk_fdedup.data_cleaning.transform +import dpk_fdedup.get_duplicate_list.transform +import dpk_fdedup.signature_calc.transform +from dpk_fdedup.cluster_analysis.transform_python import ( ClusterAnalysisPythonTransformConfiguration, ) -from data_cleaning.transform_python import DataCleaningPythonTransformConfiguration +from dpk_fdedup.data_cleaning.transform_python import DataCleaningPythonTransformConfiguration from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils, get_logger, str2bool -from get_duplicate_list.transform_python import ( +from dpk_fdedup.get_duplicate_list.transform_python import ( GetDuplicateListPythonTransformConfiguration, ) -from signature_calc.transform_python import ( +from dpk_fdedup.signature_calc.transform_python import ( SignatureCalculationPythonTransformConfiguration, ) @@ -47,10 +47,10 @@ } ARGS_MAP = { - "minhash": signature_calc_transform.captured_arg_keys, - "cluster": cluster_analysis_transform.captured_arg_keys, - "fdlist": get_duplicate_list_transform.captured_arg_keys, - "fdclean": data_cleaning_transform.captured_arg_keys, + "minhash": dpk_fdedup.signature_calc.transform.captured_arg_keys, + "cluster": dpk_fdedup.cluster_analysis.transform.captured_arg_keys, + "fdlist": dpk_fdedup.get_duplicate_list.transform.captured_arg_keys, + "fdclean": dpk_fdedup.data_cleaning.transform.captured_arg_keys, } diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index 684583ffd..3ca0ec9e5 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -47,7 +47,7 @@ "import sys\n", "\n", "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args, ServiceOrchestrator" + "from dpk_fdedup.transform_python import parse_args, ServiceOrchestrator" ] }, { @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -102,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, "outputs": [ @@ -110,91 +110,377 @@ "name": "stderr", "output_type": "stream", "text": [ - "13:30:29 INFO - Starting SignatureCalculation step\n", - "13:30:29 INFO - Got parameters for SignatureCalculation\n", - "13:30:29 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "13:30:29 INFO - data factory scdata_ is using local configuration without input/output path\n", - "13:30:29 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "13:30:29 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:29 INFO - pipeline id 
pipeline_id\n", - "13:30:29 INFO - code location None\n", - "13:30:29 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", - "13:30:29 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:29 INFO - orchestrator minhash started at 2024-11-26 13:30:29\n", - "13:30:29 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", - "13:30:33 INFO - Completed 1 files (50.0%) in 0.074 min\n", - "13:30:33 INFO - Completed 2 files (100.0%) in 0.074 min\n", - "13:30:33 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:30:33 INFO - Starting flush()\n", - "13:30:34 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", - "13:30:34 INFO - done flushing in 0.063 sec\n", - "13:30:34 INFO - Completed execution in 0.075 min, execution result 0\n", - "13:30:34 INFO - SignatureCalculation completed successfully\n", - "13:30:34 INFO - Starting ClusterAnalysis step\n", - "13:30:34 INFO - Got parameters for ClusterAnalysis\n", - "13:30:34 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "13:30:34 INFO - pipeline id pipeline_id\n", - "13:30:34 INFO - code location None\n", - "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/docs_to_remove\n", - "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:34 INFO - orchestrator cluster started at 2024-11-26 13:30:34\n", - "13:30:34 INFO - Number of folders is 14\n", - "13:30:34 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "13:30:34 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "13:30:34 INFO - Completed 3 files (21.43%) in 0.001 min\n", - "13:30:34 INFO - Completed 4 files (28.57%) in 0.001 min\n", - "13:30:34 INFO - Completed 5 files (35.71%) in 0.001 min\n", - "13:30:34 INFO - Completed 6 files (42.86%) in 0.001 min\n", - "13:30:34 INFO - Completed 7 files (50.0%) in 0.001 min\n", - "13:30:34 INFO - Completed 8 files (57.14%) in 0.002 min\n", - "13:30:34 INFO - Completed 9 files (64.29%) in 0.002 min\n", - "13:30:34 INFO - Completed 10 files (71.43%) in 0.002 min\n", - "13:30:34 INFO - Completed 11 files (78.57%) in 0.002 min\n", - "13:30:34 INFO - Completed 12 files (85.71%) in 0.002 min\n", - "13:30:34 INFO - Completed 13 files (92.86%) in 0.002 min\n", - "13:30:34 INFO - Completed 14 files (100.0%) in 0.003 min\n", - "13:30:34 INFO - Done processing 14 files, waiting for flush() completion.\n", - "13:30:34 INFO - done flushing in 0.0 sec\n", - "13:30:34 INFO - Completed execution in 0.003 min, execution result 0\n", - "13:30:34 INFO - ClusterAnalysis completed successfully\n", - "13:30:34 INFO - Starting GetDuplicateList step\n", - "13:30:34 INFO - Got parameters for GetDuplicateList\n", - 
"13:30:34 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "13:30:34 INFO - pipeline id pipeline_id\n", - "13:30:34 INFO - code location None\n", - "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", - "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:34 INFO - orchestrator fdlist started at 2024-11-26 13:30:34\n", - "13:30:34 INFO - Number of folders is 1\n", - "13:30:34 INFO - Get Duplicate List for folder docs_to_remove\n", - "13:30:34 INFO - 8 documents marked as duplicates\n", - "13:30:34 INFO - Completed 1 files (100.0%) in 0.0 min\n", - "13:30:34 INFO - Done processing 1 files, waiting for flush() completion.\n", - "13:30:34 INFO - done flushing in 0.0 sec\n", - "13:30:34 INFO - Completed execution in 0.001 min, execution result 0\n", - "13:30:34 INFO - GetDuplicateList completed successfully\n", - "13:30:34 INFO - Starting DataCleaning step\n", - "13:30:34 INFO - Got parameters for DataCleaning\n", - "13:30:34 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "13:30:34 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "13:30:34 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "13:30:34 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:34 INFO - pipeline id pipeline_id\n", - "13:30:34 INFO - code location None\n", - "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/cleaned\n", - "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:34 INFO - orchestrator fdclean started at 2024-11-26 13:30:34\n", - "13:30:34 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", - "13:30:34 INFO - Completed 1 files (50.0%) in 0.0 min\n", - "13:30:34 INFO - Completed 2 files (100.0%) in 0.0 min\n", - "13:30:34 INFO - Done processing 2 files, waiting for flush() completion.\n", - "13:30:34 INFO - done flushing in 0.0 sec\n", - "13:30:34 INFO - Completed execution in 0.0 min, execution result 0\n", - "13:30:34 INFO - DataCleaning completed successfully\n" + "17:55:52 INFO - Starting SignatureCalculation step\n", + "17:55:52 INFO - Got parameters for SignatureCalculation\n", + "17:55:52 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 
'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "17:55:52 INFO - data factory scdata_ is using local configuration without input/output path\n", + "17:55:52 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "17:55:52 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "17:55:52 INFO - pipeline id pipeline_id\n", + "17:55:52 INFO - code location None\n", + "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output\n", + "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "17:55:52 INFO - orchestrator minhash started at 2024-12-18 17:55:52\n", + "17:55:52 ERROR - No input files to process - exiting\n", + "17:55:52 INFO - Completed execution in 0.0 min, execution result 0\n", + "17:55:52 INFO - SignatureCalculation completed successfully\n", + "17:55:52 INFO - Starting ClusterAnalysis step\n", + "17:55:52 INFO - Got parameters for ClusterAnalysis\n", + "17:55:52 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "17:55:52 INFO - pipeline id pipeline_id\n", + "17:55:52 INFO - code location None\n", + "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/docs_to_remove\n", + "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "17:55:52 INFO - orchestrator cluster started at 2024-12-18 17:55:52\n", + "17:55:52 INFO - Number of folders is 14\n", + "17:55:52 WARNING - Exception processing file band=0/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=1/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=2/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 3 files (21.43%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=3/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 4 files (28.57%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=4/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 5 files (35.71%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=5/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 6 files (42.86%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=6/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 7 files (50.0%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=7/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 8 files (57.14%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=8/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 9 files (64.29%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=9/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 10 files (71.43%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=10/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 11 files (78.57%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=11/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 12 files (85.71%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=12/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 13 files (92.86%) in 0.0 min\n", + "17:55:52 WARNING - Exception processing file band=13/segment=0: Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", + " out_files, stats = self.transform.transform(folder_name=f_name)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", + " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", + " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", + " .collect(no_optimization=True)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", + " return wrap_df(ldf.collect(callback))\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + "polars.exceptions.ColumnNotFoundError: band_hash\n", + "\n", + "Resolved plan until failure:\n", + "\n", + "\t---> FAILED HERE RESOLVING 'group_by' <---\n", + "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", + "\n", + "17:55:52 INFO - Completed 14 files (100.0%) in 0.0 min\n", + "17:55:52 INFO - Done processing 14 files, waiting for flush() completion.\n", + "17:55:52 INFO - done flushing in 0.0 sec\n", + "Traceback (most recent call last):\n", + " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py\", line 131, in orchestrate\n", + " stats[\"processing_time\"] = round(stats[\"processing_time\"], 3)\n", + " ~~~~~^^^^^^^^^^^^^^^^^^^\n", + "KeyError: 'processing_time'\n", + "17:55:52 ERROR - Exception during execution 'processing_time': None\n", + "17:55:52 INFO - Completed execution in 0.0 min, execution result 1\n", + "17:55:52 ERROR - ClusterAnalysis failed with status 1, aborting ...\n" ] } ], diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index bb69579a9..8bfa98a3a 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -55,8 +55,8 @@ "import sys\n", "\n", "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args\n", - "from fdedup_transform_ray import RayServiceOrchestrator" + "from dpk_fdedup.transform_python import parse_args\n", + "from dpk_fdedup.ray.transform import RayServiceOrchestrator" ] }, { diff --git 
a/transforms/universal/fdedup/fdedup_spark.ipynb b/transforms/universal/fdedup/fdedup_spark.ipynb index 9f4bf1772..616543640 100644 --- a/transforms/universal/fdedup/fdedup_spark.ipynb +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -47,8 +47,8 @@ "import sys\n", "\n", "from data_processing.utils import ParamsUtils\n", - "from fdedup_transform_python import parse_args\n", - "from fdedup_transform_spark import SparkServiceOrchestrator" + "from dpk_fdedup.transform_python import parse_args\n", + "from dpk_fdedup.spark.transform import SparkServiceOrchestrator" ] }, { diff --git a/transforms/universal/fdedup/kfp_ray/Makefile b/transforms/universal/fdedup/kfp_ray/Makefile index 55f7851f6..5c1ae0778 100644 --- a/transforms/universal/fdedup/kfp_ray/Makefile +++ b/transforms/universal/fdedup/kfp_ray/Makefile @@ -2,10 +2,20 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows + # Include the common configuration for this transform -include ../transform.config +#include ../transform.config + +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -SRC_DIR=${CURDIR}/../ray/ + +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -17,38 +27,21 @@ clean: @# Help: Clean up the virtual environment. rm -rf ${REPOROOT}/transforms/venv -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -publish:: - -image:: - -test-image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: - .PHONY: workflow-build workflow-build: workflow-venv $(MAKE) $(YAML_WF) .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=fdedup_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload -workflow-upload: workflow-build +workflow-upload: @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done + done \ No newline at end of file diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index ffc6f79bc..6b1265cf8 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -28,10 +28,10 @@ image_pull_secret = os.getenv("FDEDUP_IMAGE_PULL_SECRET", "my_secret") # the name of the job script -SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "signature_calc_transform_ray.py" -CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "cluster_analysis_transform_ray.py" -GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "get_duplicate_list_transform_ray.py" -DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" +SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "-m dpk_fdedup.signature_calc.ray.transform" +CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "-m dpk_fdedup.cluster_analysis.ray.transform" +GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "-m 
dpk_fdedup.get_duplicate_list.ray.transform" +DATA_CLEANING_EXEC_SCRIPT_NAME: str = "-m dpk_fdedup.data_cleaning.ray.transform" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" From 52fb1a19c2c643e20eb9a2612fe5e5c192bafdd9 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 18:54:55 -0500 Subject: [PATCH 4/6] fix and test notebook Signed-off-by: Maroun Touma --- .../universal/fdedup/fdedup_python.ipynb | 679 +----------------- transforms/universal/fdedup/fdedup_ray.ipynb | 409 +---------- .../universal/fdedup/fdedup_spark.ipynb | 12 +- 3 files changed, 47 insertions(+), 1053 deletions(-) diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index 3ca0ec9e5..a64c48a54 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -71,14 +71,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], "source": [ "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"output\")\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -102,388 +102,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "17:55:52 INFO - Starting SignatureCalculation step\n", - "17:55:52 INFO - Got parameters for SignatureCalculation\n", - "17:55:52 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "17:55:52 INFO - data factory scdata_ is using local configuration without input/output path\n", - "17:55:52 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - pipeline id pipeline_id\n", - "17:55:52 INFO - code location None\n", - "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output\n", - "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - orchestrator minhash started at 
2024-12-18 17:55:52\n", - "17:55:52 ERROR - No input files to process - exiting\n", - "17:55:52 INFO - Completed execution in 0.0 min, execution result 0\n", - "17:55:52 INFO - SignatureCalculation completed successfully\n", - "17:55:52 INFO - Starting ClusterAnalysis step\n", - "17:55:52 INFO - Got parameters for ClusterAnalysis\n", - "17:55:52 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "17:55:52 INFO - pipeline id pipeline_id\n", - "17:55:52 INFO - code location None\n", - "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/docs_to_remove\n", - "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - orchestrator cluster started at 2024-12-18 17:55:52\n", - "17:55:52 INFO - Number of folders is 14\n", - "17:55:52 WARNING - Exception processing file band=0/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=1/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=2/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=3/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=4/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=5/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=6/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 7 files (50.0%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=7/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 8 files (57.14%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=8/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 9 files (64.29%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=9/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 10 files (71.43%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=10/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 11 files (78.57%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=11/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 12 files (85.71%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=12/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 13 files (92.86%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=13/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File 
\"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 14 files (100.0%) in 0.0 min\n", - "17:55:52 INFO - Done processing 14 files, waiting for flush() completion.\n", - "17:55:52 INFO - done flushing in 0.0 sec\n", - "Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py\", line 131, in orchestrate\n", - " stats[\"processing_time\"] = round(stats[\"processing_time\"], 3)\n", - " ~~~~~^^^^^^^^^^^^^^^^^^^\n", - "KeyError: 'processing_time'\n", - "17:55:52 ERROR - Exception during execution 'processing_time': None\n", - "17:55:52 INFO - Completed execution in 0.0 min, execution result 1\n", - "17:55:52 ERROR - ClusterAnalysis failed with status 1, aborting ...\n" - ] - } - ], + "outputs": [], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -504,26 +126,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['python/output/cleaned/metadata.json',\n", - " 'python/output/cleaned/data_1',\n", - " 'python/output/cleaned/data_2']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import glob\n", - "glob.glob(\"python/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -536,171 +145,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (12, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. 
│\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 3 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. 
Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 5 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ │\n", - "│ 6 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. 
│\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", - "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", - "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", - "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", - "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", - "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", - "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", - "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", - "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", - "│ ┆ another capy capably chewing on cantaloupe │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", - "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 17 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. 
│\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", - "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", + "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", + "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", "input_df = input_df_1.vstack(input_df_2)\n", "\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", @@ -717,101 +169,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (4, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. 
We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. 
│\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", - "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", + "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", + "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", "output_df = output_df_1.vstack(output_df_2)\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" @@ -824,6 +189,14 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "787c644e-2640-4c05-bdc2-8a261305a89f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index 8bfa98a3a..7c6740f88 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,18 +37,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-11-26 13:30:56,482\tINFO util.py:154 -- Missing packages: ['ipywidgets']. 
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" - ] - } - ], + "outputs": [], "source": [ "import ast\n", "import os\n", @@ -81,14 +73,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], "source": [ "# create parameters\n", "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -114,126 +106,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:30:57 INFO - Starting SignatureCalculation step\n", - "13:30:57 INFO - Got parameters for SignatureCalculation\n", - "13:30:57 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "13:30:57 INFO - data factory scdata_ is using local configuration without input/output path\n", - "13:30:57 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "13:30:57 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:57 INFO - pipeline id pipeline_id\n", - "13:30:57 INFO - code location None\n", - "13:30:57 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:30:57 INFO - actor creation delay 0\n", - "13:30:57 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:30:57 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", - "13:30:57 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:57 INFO - Running locally\n", - "2024-11-26 13:31:08,860\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - orchestrator started at 2024-11-26 13:31:12\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.162438202649355, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed 0 files (0.0%) in 0.0 min. 
Waiting for completion\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed processing 1 files in 0.002 min\n", - "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Starting flush()\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - done flushing in 0.045 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", - "13:31:24 INFO - Completed execution in 0.446 min, execution result 0\n", - "13:31:26 INFO - SignatureCalculation completed successfully\n", - "13:31:26 INFO - Starting ClusterAnalysis step\n", - "13:31:26 INFO - Got parameters for ClusterAnalysis\n", - "13:31:26 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "13:31:26 INFO - pipeline id pipeline_id\n", - "13:31:26 INFO - code location None\n", - "13:31:26 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:26 INFO - actor creation delay 0\n", - "13:31:26 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:26 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/docs_to_remove\n", - "13:31:26 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:26 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:26 INFO - Running locally\n", - "2024-11-26 13:31:28,318\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - orchestrator started at 2024-11-26 13:31:31\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of folders is 14\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.77626838721335, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 2 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 3 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 4 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 5 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 6 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 7 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 8 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 9 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 10 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files (78.571%) in 0.001 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed processing 14 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", - "13:31:43 INFO - Completed execution in 0.292 min, execution result 0\n", - "13:31:45 INFO - ClusterAnalysis completed successfully\n", - "13:31:45 INFO - Starting GetDuplicateList step\n", - "13:31:45 INFO - Got parameters for GetDuplicateList\n", - "13:31:45 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "13:31:45 INFO - pipeline id pipeline_id\n", - "13:31:45 INFO - code location None\n", - "13:31:45 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:45 INFO - actor creation delay 0\n", - "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:45 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", - "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:45 INFO - Running locally\n", - "2024-11-26 13:31:47,311\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - orchestrator started at 2024-11-26 13:31:50\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of folders is 1\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.749520111829042, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed processing 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - Get Duplicate List for folder docs_to_remove\n", - "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - 8 documents marked as duplicates\n", - "13:32:02 INFO - Completed execution in 0.295 min, execution result 0\n", - "13:32:04 INFO - GetDuplicateList completed successfully\n", - "13:32:04 INFO - Starting DataCleaning step\n", - "13:32:04 INFO - Got parameters for DataCleaning\n", - "13:32:04 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "13:32:04 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "13:32:04 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "13:32:04 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:04 INFO - pipeline id pipeline_id\n", - "13:32:04 INFO - code location None\n", - "13:32:04 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:04 INFO - actor creation delay 0\n", - "13:32:04 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:04 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/cleaned\n", - "13:32:04 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:04 INFO - Running locally\n", - "2024-11-26 13:32:07,526\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - orchestrator started at 2024-11-26 13:32:10\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.738976669497788, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed processing 1 files in 0.002 min\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - done flushing in 0.003 sec\n", - "13:32:23 INFO - Completed execution in 0.313 min, execution result 0\n", - "13:32:24 INFO - DataCleaning completed successfully\n" - ] - } - ], + "outputs": [], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -254,24 +130,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ray/output/cleaned/metadata.json', 'ray/output/cleaned/df1.parquet']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import glob\n", - "glob.glob(\"ray/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -284,167 +149,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (12, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. 
Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 3 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. 
│\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 5 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ │\n", - "│ 6 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", - "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", - "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", - "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. 
A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", - "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", - "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", - "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", - "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", - "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", - "│ ┆ another capy capably chewing on cantaloupe │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", - "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 17 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", @@ -462,100 +170,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (4, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 
12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. 
│\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. 
│\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"df1.parquet\"))\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" ] diff --git a/transforms/universal/fdedup/fdedup_spark.ipynb b/transforms/universal/fdedup/fdedup_spark.ipynb index 616543640..d605d726b 100644 --- a/transforms/universal/fdedup/fdedup_spark.ipynb +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -79,7 +79,7 @@ "source": [ "# create parameters\n", "input_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"output\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -133,7 +133,7 @@ "outputs": [], "source": [ "import glob\n", - "glob.glob(\"spark/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -174,7 +174,7 @@ "outputs": [], "source": [ "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"df1.parquet\"))\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" ] @@ -190,9 +190,9 @@ ], "metadata": { "kernelspec": { - "display_name": "fdedup_spark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "fdedup_spark" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -204,7 +204,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, From ad548bfca50c9bd98c05b6282ee950d52caaa116 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 19:08:05 -0500 Subject: [PATCH 5/6] fix spark dockerfile Signed-off-by: Maroun Touma --- transforms/universal/fdedup/Dockerfile.spark | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/fdedup/Dockerfile.spark b/transforms/universal/fdedup/Dockerfile.spark index d228b6c2d..26bcf1da0 100644 --- a/transforms/universal/fdedup/Dockerfile.spark +++ b/transforms/universal/fdedup/Dockerfile.spark @@ -22,8 +22,8 @@ RUN pip install -r requirements.txt RUN mkdir -p /opt/spark/work-dir/src/templates && \ mkdir -p /opt/spark/work-dir/config -COPY --chown=spark:root spark-deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ -COPY --chown=spark:root spark-deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ +COPY --chown=spark:root spark/deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY --chown=spark:root spark/deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ USER spark From bc88085edf325628513e09034fb993ff417cc6c2 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 19 Dec 2024 17:11:34 -0500 Subject: [PATCH 6/6] Updated notebooks for python and ray Signed-off-by: Maroun Touma --- .../fdedup/dpk_fdedup/ray/transform.py | 16 +++++ .../fdedup/dpk_fdedup/transform_python.py | 18 +++++ 
.../universal/fdedup/fdedup_python.ipynb | 61 ++++------------ transforms/universal/fdedup/fdedup_ray.ipynb | 70 +++++-------------- 4 files changed, 63 insertions(+), 102 deletions(-) diff --git a/transforms/universal/fdedup/dpk_fdedup/ray/transform.py b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py index a59877b6f..76046ba4b 100644 --- a/transforms/universal/fdedup/dpk_fdedup/ray/transform.py +++ b/transforms/universal/fdedup/dpk_fdedup/ray/transform.py @@ -67,6 +67,22 @@ def execute_service(self, service_short_name: str, params: list) -> int: return status +# Class used by the notebooks to run the full fuzzy dedup pipeline on the Ray runtime +class Fdedup: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + args = parse_args() + # Initialize the orchestrator + orchestrator = RayServiceOrchestrator(global_params=args) + # Launch ray fuzzy dedup execution + return orchestrator.orchestrate() + + if __name__ == "__main__": # Parse command line arguments args = parse_args() diff --git a/transforms/universal/fdedup/dpk_fdedup/transform_python.py b/transforms/universal/fdedup/dpk_fdedup/transform_python.py index dbbcf39e6..196affd93 100644 --- a/transforms/universal/fdedup/dpk_fdedup/transform_python.py +++ b/transforms/universal/fdedup/dpk_fdedup/transform_python.py @@ -261,6 +261,24 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() + + +# Class used by the notebooks to run the full fuzzy dedup pipeline with the pure python runtime +class Fdedup: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + args = parse_args() + # Initialize the orchestrator + orchestrator = ServiceOrchestrator(global_params=args) + # Launch python fuzzy dedup execution + return orchestrator.orchestrate() + + if __name__ == "__main__": # Parse command line arguments diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index a64c48a54..b02f463eb 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -23,8 +23,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" + "#!pip install data-prep-toolkit-transforms" ] }, { @@ -38,16 +37,11 @@ { "cell_type": "code", "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "id": "bae63d15-4ce5-4f2a-a917-0f3161e9dd73", "metadata": {}, "outputs": [], "source": [ - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from dpk_fdedup.transform_python import parse_args, ServiceOrchestrator" + "from dpk_fdedup.transform_python import Fdedup" ] }, { @@ -72,48 +66,18 @@ { "cell_type": "code", "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", - "params = {\n", - " # transform configuration parameters\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": 
output_folder,\n", - " \"contents_column\": \"contents\",\n", - " \"document_id_column\": \"int_id_column\",\n", - " \"num_permutations\": 112,\n", - " \"num_bands\": 14,\n", - " \"num_minhashes_per_band\": 8,\n", - " \"operation_mode\": \"filter_duplicates\",\n", - "}" - ] - }, - { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", - "metadata": {}, - "source": [ - "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", + "id": "a54a78e9-d78b-4aeb-ac2b-806070a2dec0", "metadata": {}, "outputs": [], "source": [ - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "args = parse_args()\n", - "# Initialize the orchestrator\n", - "orchestrator = ServiceOrchestrator(global_params=args)\n", - "# Launch python fuzzy dedup execution\n", - "orchestrator.orchestrate()" + "Fdedup(input_folder='test-data/input',\n", + " output_folder='output',\n", + " contents_column= \"contents\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " operation_mode=\"filter_duplicates\").transform()\n" ] }, { @@ -151,6 +115,7 @@ "outputs": [], "source": [ "import polars as pl\n", + "import os\n", "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", "input_df = input_df_1.vstack(input_df_2)\n", diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index 7c6740f88..39bc1ba78 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -23,8 +23,7 @@ "## This is here as a reference only\n", "# Users and application developers must use the right tag for the latest from pypi\n", "#!pip install data-prep-toolkit\n", - "#!pip install data-prep-toolkit-transforms\n", - "#!pip install data-prep-connector" + "#!pip install data-prep-toolkit-transforms" ] }, { @@ -38,17 +37,11 @@ { "cell_type": "code", "execution_count": null, - "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "id": "bae63d15-4ce5-4f2a-a917-0f3161e9dd73", "metadata": {}, "outputs": [], "source": [ - "import ast\n", - "import os\n", - "import sys\n", - "\n", - "from data_processing.utils import ParamsUtils\n", - "from dpk_fdedup.transform_python import parse_args\n", - "from dpk_fdedup.ray.transform import RayServiceOrchestrator" + "from dpk_fdedup.ray.transform import Fdedup" ] }, { @@ -67,57 +60,25 @@ "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", "| num_minhashes_per_band | 8 | number of minhashes to use in each band |\n", - "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |\n", - "| run_locally:bool | true | if true, launch a ray cluster locally, otherwise connect to an already existing cluster | \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90a853e-412f-45d7-af3d-959e755aeebb", - "metadata": {}, - "outputs": [], - "source": [ - "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", 
\"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", - "params = {\n", - " # transform configuration parameters\n", - " \"input_folder\": input_folder,\n", - " \"output_folder\": output_folder,\n", - " \"contents_column\": \"contents\",\n", - " \"document_id_column\": \"int_id_column\",\n", - " \"num_permutations\": 112,\n", - " \"num_bands\": 14,\n", - " \"num_minhashes_per_band\": 8,\n", - " \"operation_mode\": \"filter_duplicates\",\n", - " # ray configuration parameters\n", - " \"run_locally\": True,\n", - "}\n" - ] - }, - { - "cell_type": "markdown", - "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", - "metadata": {}, - "source": [ - "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" ] }, { "cell_type": "code", "execution_count": null, - "id": "0775e400-7469-49a6-8998-bd4772931459", + "id": "a54a78e9-d78b-4aeb-ac2b-806070a2dec0", "metadata": {}, "outputs": [], "source": [ - "\n", - "sys.argv = ParamsUtils.dict_to_req(d=params)\n", - "args = parse_args()\n", - "# Initialize the orchestrator\n", - "orchestrator = RayServiceOrchestrator(global_params=args)\n", - "# Launch ray fuzzy dedup execution\n", - "orchestrator.orchestrate()" + "Fdedup(input_folder='ray/test-data/input',\n", + " output_folder='output',\n", + " contents_column= \"contents\",\n", + " document_id_column= \"int_id_column\",\n", + " num_permutations= 112,\n", + " num_bands= 14,\n", + " num_minhashes_per_band= 8,\n", + " operation_mode= \"filter_duplicates\",\n", + " run_locally= True).transform()\n" ] }, { @@ -155,6 +116,7 @@ "outputs": [], "source": [ "import polars as pl\n", + "import os\n", "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(input_df)" @@ -192,7 +154,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c11d3a4b-8ef9-417d-a8a2-f688db067a52", + "id": "787c644e-2640-4c05-bdc2-8a261305a89f", "metadata": {}, "outputs": [], "source": []