From 52fb1a19c2c643e20eb9a2612fe5e5c192bafdd9 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 18 Dec 2024 18:54:55 -0500 Subject: [PATCH] fix and test notebok Signed-off-by: Maroun Touma --- .../universal/fdedup/fdedup_python.ipynb | 679 +----------------- transforms/universal/fdedup/fdedup_ray.ipynb | 409 +---------- .../universal/fdedup/fdedup_spark.ipynb | 12 +- 3 files changed, 47 insertions(+), 1053 deletions(-) diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index 3ca0ec9e5..a64c48a54 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -71,14 +71,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], "source": [ "# create parameters\n", - "input_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"output\")\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -102,388 +102,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "17:55:52 INFO - Starting SignatureCalculation step\n", - "17:55:52 INFO - Got parameters for SignatureCalculation\n", - "17:55:52 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "17:55:52 INFO - data factory scdata_ is using local configuration without input/output path\n", - "17:55:52 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - pipeline id pipeline_id\n", - "17:55:52 INFO - code location None\n", - "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output\n", - "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - orchestrator minhash started at 2024-12-18 17:55:52\n", - "17:55:52 ERROR - No input files to process - exiting\n", - "17:55:52 INFO - Completed execution in 0.0 min, execution result 0\n", - "17:55:52 INFO - SignatureCalculation completed successfully\n", - "17:55:52 INFO - Starting ClusterAnalysis step\n", - "17:55:52 INFO - Got parameters for ClusterAnalysis\n", - "17:55:52 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "17:55:52 INFO - pipeline id pipeline_id\n", - "17:55:52 INFO - code location None\n", - "17:55:52 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/python/output/docs_to_remove\n", - "17:55:52 INFO - data factory data_ max_files -1, n_sample -1\n", - "17:55:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "17:55:52 INFO - orchestrator cluster started at 2024-12-18 17:55:52\n", - "17:55:52 INFO - Number of folders is 14\n", - "17:55:52 WARNING - Exception processing file band=0/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 1 files (7.14%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=1/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 2 files (14.29%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=2/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 3 files (21.43%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=3/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 4 files (28.57%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=4/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 5 files (35.71%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=5/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 6 files (42.86%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=6/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 7 files (50.0%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=7/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 8 files (57.14%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=8/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 9 files (64.29%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=9/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 10 files (71.43%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=10/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 11 files (78.57%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=11/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 12 files (85.71%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=12/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 13 files (92.86%) in 0.0 min\n", - "17:55:52 WARNING - Exception processing file band=13/segment=0: Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/transform_file_processor.py\", line 85, in process_file\n", - " out_files, stats = self.transform.transform(folder_name=f_name)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 141, in transform\n", - " cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/dpk_fdedup/cluster_analysis/transform.py\", line 173, in _get_clusters\n", - " groupby_dataframe = band_segment_dataframe.group_by(\"band_hash\").agg(\"document_data\")\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/dataframe/group_by.py\", line 232, in agg\n", - " .collect(no_optimization=True)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/Users/touma/data-prep-kit-pkg/transforms/universal/fdedup/venv/lib/python3.11/site-packages/polars/lazyframe/frame.py\", line 2050, in collect\n", - " return wrap_df(ldf.collect(callback))\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - "polars.exceptions.ColumnNotFoundError: band_hash\n", - "\n", - "Resolved plan until failure:\n", - "\n", - "\t---> FAILED HERE RESOLVING 'group_by' <---\n", - "DF []; PROJECT */0 COLUMNS; SELECTION: None\n", - "\n", - "17:55:52 INFO - Completed 14 files (100.0%) in 0.0 min\n", - "17:55:52 INFO - Done processing 14 files, waiting for flush() completion.\n", - "17:55:52 INFO - done flushing in 0.0 sec\n", - "Traceback (most recent call last):\n", - " File \"/Users/touma/data-prep-kit-pkg/data-processing-lib/python/src/data_processing/runtime/pure_python/transform_orchestrator.py\", line 131, in orchestrate\n", - " stats[\"processing_time\"] = round(stats[\"processing_time\"], 3)\n", - " ~~~~~^^^^^^^^^^^^^^^^^^^\n", - "KeyError: 'processing_time'\n", - "17:55:52 ERROR - Exception during execution 'processing_time': None\n", - "17:55:52 INFO - Completed execution in 0.0 min, execution result 1\n", - "17:55:52 ERROR - ClusterAnalysis failed with status 1, aborting ...\n" - ] - } - ], + "outputs": [], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -504,26 +126,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['python/output/cleaned/metadata.json',\n", - " 'python/output/cleaned/data_1',\n", - " 'python/output/cleaned/data_2']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import glob\n", - "glob.glob(\"python/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -536,171 +145,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (12, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 3 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 5 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ │\n", - "│ 6 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", - "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", - "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", - "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", - "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", - "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", - "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", - "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", - "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", - "│ ┆ another capy capably chewing on cantaloupe │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", - "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 17 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", - "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", + "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", + "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", "input_df = input_df_1.vstack(input_df_2)\n", "\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", @@ -717,101 +169,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (4, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", - "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", + "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", + "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", "output_df = output_df_1.vstack(output_df_2)\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" @@ -824,6 +189,14 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "787c644e-2640-4c05-bdc2-8a261305a89f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index 8bfa98a3a..7c6740f88 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,18 +37,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-11-26 13:30:56,482\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" - ] - } - ], + "outputs": [], "source": [ "import ast\n", "import os\n", @@ -81,14 +73,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], "source": [ "# create parameters\n", "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -114,126 +106,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "13:30:57 INFO - Starting SignatureCalculation step\n", - "13:30:57 INFO - Got parameters for SignatureCalculation\n", - "13:30:57 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", - "13:30:57 INFO - data factory scdata_ is using local configuration without input/output path\n", - "13:30:57 INFO - data factory scdata_ max_files -1, n_sample -1\n", - "13:30:57 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:57 INFO - pipeline id pipeline_id\n", - "13:30:57 INFO - code location None\n", - "13:30:57 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:30:57 INFO - actor creation delay 0\n", - "13:30:57 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:30:57 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", - "13:30:57 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:30:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:30:57 INFO - Running locally\n", - "2024-11-26 13:31:08,860\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - orchestrator started at 2024-11-26 13:31:12\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.162438202649355, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed processing 1 files in 0.002 min\n", - "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Starting flush()\n", - "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - done flushing in 0.045 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", - "13:31:24 INFO - Completed execution in 0.446 min, execution result 0\n", - "13:31:26 INFO - SignatureCalculation completed successfully\n", - "13:31:26 INFO - Starting ClusterAnalysis step\n", - "13:31:26 INFO - Got parameters for ClusterAnalysis\n", - "13:31:26 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", - "13:31:26 INFO - pipeline id pipeline_id\n", - "13:31:26 INFO - code location None\n", - "13:31:26 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:26 INFO - actor creation delay 0\n", - "13:31:26 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:26 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/docs_to_remove\n", - "13:31:26 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:26 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:26 INFO - Running locally\n", - "2024-11-26 13:31:28,318\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - orchestrator started at 2024-11-26 13:31:31\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of folders is 14\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.77626838721335, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 2 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 3 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 4 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 5 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 6 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 7 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 8 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 9 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 10 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files (78.571%) in 0.001 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed processing 14 files in 0.001 min\n", - "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", - "13:31:43 INFO - Completed execution in 0.292 min, execution result 0\n", - "13:31:45 INFO - ClusterAnalysis completed successfully\n", - "13:31:45 INFO - Starting GetDuplicateList step\n", - "13:31:45 INFO - Got parameters for GetDuplicateList\n", - "13:31:45 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", - "13:31:45 INFO - pipeline id pipeline_id\n", - "13:31:45 INFO - code location None\n", - "13:31:45 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:31:45 INFO - actor creation delay 0\n", - "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:31:45 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", - "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:31:45 INFO - Running locally\n", - "2024-11-26 13:31:47,311\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - orchestrator started at 2024-11-26 13:31:50\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of folders is 1\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.749520111829042, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed processing 1 files in 0.0 min\n", - "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - done flushing in 0.001 sec\n", - "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - Get Duplicate List for folder docs_to_remove\n", - "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - 8 documents marked as duplicates\n", - "13:32:02 INFO - Completed execution in 0.295 min, execution result 0\n", - "13:32:04 INFO - GetDuplicateList completed successfully\n", - "13:32:04 INFO - Starting DataCleaning step\n", - "13:32:04 INFO - Got parameters for DataCleaning\n", - "13:32:04 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", - "13:32:04 INFO - data factory dcdata_ is using local configuration without input/output path\n", - "13:32:04 INFO - data factory dcdata_ max_files -1, n_sample -1\n", - "13:32:04 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:04 INFO - pipeline id pipeline_id\n", - "13:32:04 INFO - code location None\n", - "13:32:04 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", - "13:32:04 INFO - actor creation delay 0\n", - "13:32:04 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", - "13:32:04 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/cleaned\n", - "13:32:04 INFO - data factory data_ max_files -1, n_sample -1\n", - "13:32:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "13:32:04 INFO - Running locally\n", - "2024-11-26 13:32:07,526\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - orchestrator started at 2024-11-26 13:32:10\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.738976669497788, 'object_store': 2.0}\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed processing 1 files in 0.002 min\n", - "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - done flushing in 0.003 sec\n", - "13:32:23 INFO - Completed execution in 0.313 min, execution result 0\n", - "13:32:24 INFO - DataCleaning completed successfully\n" - ] - } - ], + "outputs": [], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -254,24 +130,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ray/output/cleaned/metadata.json', 'ray/output/cleaned/df1.parquet']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import glob\n", - "glob.glob(\"ray/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -284,167 +149,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (12, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 3 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 5 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ │\n", - "│ 6 ┆ │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", - "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", - "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", - "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", - "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", - "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", - "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", - "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", - "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", - "│ ┆ another capy capably chewing on cantaloupe │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", - "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "│ 17 ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", - "│ ┆ what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", @@ -462,100 +170,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape: (4, 2)\n", - "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", - "│ int_id_column ┆ contents │\n", - "│ --- ┆ --- │\n", - "│ i64 ┆ str │\n", - "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", - "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ PR Newswire October 12, 2019 │\n", - "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", - "│ ┆ New Location Continues Strategic National Expansion Plans │\n", - "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", - "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", - "│ ┆ store is the Company's third location in Michigan. │\n", - "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", - "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", - "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", - "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", - "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", - "│ ┆ free shipping services. │\n", - "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", - "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", - "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", - "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", - "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", - "│ ┆ location in Madison in Fall 2021. │\n", - "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", - "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", - "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", - "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", - "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", - "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", - "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", - "│ ┆ years to come.\" │\n", - "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", - "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", - "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", - "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", - "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", - "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", - "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", - "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", - "│ ┆ from the store's grand piano. │\n", - "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", - "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", - "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", - "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", - "│ ┆ p.m. ET. │\n", - "│ ┆ About Von Maur │\n", - "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", - "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", - "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", - "│ ┆ Courtney Smith │\n", - "│ ┆ courtney@reputationpartners.com │\n", - "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", - "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ 4 ┆ │\n", - "│ ┆ The Genius Life │\n", - "│ ┆ Max Lugavere │\n", - "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", - "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", - "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", - "│ ┆ minds of our time about what it means to live like a Genius. │\n", - "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", - "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", - "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", - "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", - "│ ┆ science. │\n", - "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", - "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", - "│ ┆ │\n", - "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", - "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", - "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", - "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", - "│ ┆ considering another capy capably chewing on cantaloupe. │\n", - "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", - "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", - "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", - "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" - ] - } - ], + "outputs": [], "source": [ "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"df1.parquet\"))\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" ] diff --git a/transforms/universal/fdedup/fdedup_spark.ipynb b/transforms/universal/fdedup/fdedup_spark.ipynb index 616543640..d605d726b 100644 --- a/transforms/universal/fdedup/fdedup_spark.ipynb +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -79,7 +79,7 @@ "source": [ "# create parameters\n", "input_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\")\n", - "output_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"output\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"output\")\n", "params = {\n", " # transform configuration parameters\n", " \"input_folder\": input_folder,\n", @@ -133,7 +133,7 @@ "outputs": [], "source": [ "import glob\n", - "glob.glob(\"spark/output/cleaned/*\")" + "glob.glob(\"output/cleaned/*\")" ] }, { @@ -174,7 +174,7 @@ "outputs": [], "source": [ "import polars as pl\n", - "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"output\", \"cleaned\", \"df1.parquet\"))\n", "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", " print(output_df)" ] @@ -190,9 +190,9 @@ ], "metadata": { "kernelspec": { - "display_name": "fdedup_spark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "fdedup_spark" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -204,7 +204,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4,