diff --git a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py index 0f5de10c0..20ef49dc3 100644 --- a/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py +++ b/transforms/language/pdf2parquet/python/src/pdf2parquet_transform.py @@ -24,6 +24,7 @@ import filetype import pandas as pd import pyarrow as pa +import numpy as np from data_processing.transform import AbstractBinaryTransform, TransformConfiguration from data_processing.utils import TransformUtils, get_logger, str2bool from data_processing.utils.cli_utils import CLIArgumentProvider @@ -237,7 +238,7 @@ def _convert_pdf2parquet( num_pages = len(doc.pages) num_tables = len(doc.tables) num_doc_elements = len(doc.texts) - document_hash = doc.origin.binary_hash + document_hash = np.uint64(doc.origin.binary_hash) self._update_metrics(num_pages=num_pages, elapse_time=elapse_time) diff --git a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet index 907fb3803..f68ff66e1 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json index b9a535098..330ee3a5c 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-29 14:17:59", - "end_time": "2024-10-29 14:18:05", + "start_time": "2024-11-11 21:04:30", + "end_time": "2024-11-11 21:04:38", "status": "success" }, "code": { @@ -15,6 +15,7 @@ "path": "path" }, "job_input_params": { + "batch_size": -1, "artifacts_path": null, "contents_type": "text/markdown", "do_table_structure": true, @@ -28,23 +29,25 @@ "random_samples": -1, "files_to_use": [ ".pdf", + ".docx", + ".pptx", ".zip" ], "num_processors": 0 }, "execution_stats": { - "cpus": 16.8, + "cpus": 21.1, "gpus": 0, - "memory": 31.22, + "memory": 32.09, "object_store": 0, - "execution time, min": 0.108 + "execution time, min": 0.139 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 33044, - "processing_time": 6.478, + "result_size": 32939, + "processing_time": 5.596, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet index 39613b1d1..17a7cf950 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json index f8f9ad71a..32023e56a 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_batch/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-31 13:14:39", - "end_time": "2024-10-31 13:16:41", + "start_time": "2024-11-11 21:06:08", + "end_time": "2024-11-11 21:06:14", "status": "success" }, "code": { @@ -36,22 +36,22 @@ "num_processors": 0 }, "execution_stats": { - "cpus": 39.0, + "cpus": 21.5, "gpus": 0, - "memory": 29.87, + "memory": 32.19, "object_store": 0, - "execution time, min": 2.029 + "execution time, min": 0.1 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 1, - "processing_time": 3.888, + "processing_time": 3.353, "nrows": 3, "nsuccess": 3, "nfail": 0, "nskip": 0, - "result_size": 27200 + "result_size": 27147 }, "source": { "name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input", diff --git a/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet index 3e9ba12c7..c29b5db0e 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_batch/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet index 7f34e1ba8..42b0a245d 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json index 04bec2b88..ed05c6b34 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_json/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-29 14:20:01", - "end_time": "2024-10-29 14:20:07", + "start_time": "2024-11-11 21:05:31", + "end_time": "2024-11-11 21:05:36", "status": "success" }, "code": { @@ -15,6 +15,7 @@ "path": "path" }, "job_input_params": { + "batch_size": -1, "artifacts_path": null, "contents_type": "application/json", "do_table_structure": true, @@ -28,23 +29,25 @@ "random_samples": -1, "files_to_use": [ ".pdf", + ".docx", + ".pptx", ".zip" ], "num_processors": 0 }, "execution_stats": { - "cpus": 18.0, + "cpus": 21.4, "gpus": 0, - "memory": 30.77, + "memory": 32.33, "object_store": 0, - "execution time, min": 0.105 + "execution time, min": 0.096 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 22953, - "processing_time": 6.282, + "result_size": 22850, + "processing_time": 3.229, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet index 32905aa74..0f4bda73e 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_json/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet index 9fec2cd2d..32bfa6d00 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json index bf5c9e12a..e8a3894bf 100644 --- a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json +++ b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-29 14:19:30", - "end_time": "2024-10-29 14:19:33", + "start_time": "2024-11-11 21:05:04", + "end_time": "2024-11-11 21:05:06", "status": "success" }, "code": { @@ -15,6 +15,7 @@ "path": "path" }, "job_input_params": { + "batch_size": -1, "artifacts_path": null, "contents_type": "text/markdown", "do_table_structure": false, @@ -28,23 +29,25 @@ "random_samples": -1, "files_to_use": [ ".pdf", + ".docx", + ".pptx", ".zip" ], "num_processors": 0 }, "execution_stats": { - "cpus": 17.3, + "cpus": 21.6, "gpus": 0, - "memory": 28.85, + "memory": 29.57, "object_store": 0, - "execution time, min": 0.043 + "execution time, min": 0.041 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 29659, - "processing_time": 2.554, + "result_size": 29555, + "processing_time": 1.997, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet index 69bc4e421..db8b58790 100644 Binary files a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table_no_ocr/redp5110-ch1.parquet differ diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet index 907fb3803..f68ff66e1 100644 Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json index b9a535098..330ee3a5c 100644 --- a/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json +++ b/transforms/language/pdf2parquet/ray/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "pdf2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-10-29 14:17:59", - "end_time": "2024-10-29 14:18:05", + "start_time": "2024-11-11 21:04:30", + "end_time": "2024-11-11 21:04:38", "status": "success" }, "code": { @@ -15,6 +15,7 @@ "path": "path" }, "job_input_params": { + "batch_size": -1, "artifacts_path": null, "contents_type": "text/markdown", "do_table_structure": true, @@ -28,23 +29,25 @@ "random_samples": -1, "files_to_use": [ ".pdf", + ".docx", + ".pptx", ".zip" ], "num_processors": 0 }, "execution_stats": { - "cpus": 16.8, + "cpus": 21.1, "gpus": 0, - "memory": 31.22, + "memory": 32.09, "object_store": 0, - "execution time, min": 0.108 + "execution time, min": 0.139 }, "job_output_stats": { "source_files": 2, "source_size": 605137, "result_files": 2, - "result_size": 33044, - "processing_time": 6.478, + "result_size": 32939, + "processing_time": 5.596, "nrows": 3, "nsuccess": 3, "nfail": 0, diff --git a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet index 39613b1d1..17a7cf950 100644 Binary files a/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet and b/transforms/language/pdf2parquet/ray/test-data/expected/redp5110-ch1.parquet differ