Skip to content

Commit

Permalink
Merge pull request #798 from IBM/pdf2parquet-use-str-for-hash
Browse files Browse the repository at this point in the history
use str as document_hash
  • Loading branch information
touma-I authored Nov 13, 2024
2 parents c72e98d + 5d918db commit f982ebe
Show file tree
Hide file tree
Showing 15 changed files with 35 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def _convert_pdf2parquet(
num_pages = len(doc.pages)
num_tables = len(doc.tables)
num_doc_elements = len(doc.texts)
document_hash = np.uint64(doc.origin.binary_hash)
document_hash = str(doc.origin.binary_hash) # we turn the uint64 hash into str, because it is easier to handle for pyarrow

self._update_metrics(num_pages=num_pages, elapse_time=elapse_time)

Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-11 21:04:30",
"end_time": "2024-11-11 21:04:38",
"start_time": "2024-11-13 08:35:51",
"end_time": "2024-11-13 08:36:23",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,18 +36,18 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 21.1,
"cpus": 147.5,
"gpus": 0,
"memory": 32.09,
"memory": 33.72,
"object_store": 0,
"execution time, min": 0.139
"execution time, min": 0.522
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 32939,
"processing_time": 5.596,
"result_size": 33078,
"processing_time": 4.221,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-11 21:06:08",
"end_time": "2024-11-11 21:06:14",
"start_time": "2024-11-13 08:37:05",
"end_time": "2024-11-13 08:37:11",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,22 +36,22 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 21.5,
"cpus": 143.9,
"gpus": 0,
"memory": 32.19,
"memory": 34.21,
"object_store": 0,
"execution time, min": 0.1
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 1,
"processing_time": 3.353,
"processing_time": 3.364,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
"nskip": 0,
"result_size": 27147
"result_size": 27226
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-11 21:05:31",
"end_time": "2024-11-11 21:05:36",
"start_time": "2024-11-13 08:37:56",
"end_time": "2024-11-13 08:38:02",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,18 +36,18 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 21.4,
"cpus": 142.2,
"gpus": 0,
"memory": 32.33,
"memory": 33.63,
"object_store": 0,
"execution time, min": 0.096
"execution time, min": 0.1
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 22850,
"processing_time": 3.229,
"result_size": 22993,
"processing_time": 3.422,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-11 21:05:04",
"end_time": "2024-11-11 21:05:06",
"start_time": "2024-11-13 08:37:31",
"end_time": "2024-11-13 08:37:34",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,18 +36,18 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 21.6,
"cpus": 143.4,
"gpus": 0,
"memory": 29.57,
"memory": 31.51,
"object_store": 0,
"execution time, min": 0.041
"execution time, min": 0.042
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 29555,
"processing_time": 1.997,
"result_size": 29694,
"processing_time": 2.077,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-11 21:04:30",
"end_time": "2024-11-11 21:04:38",
"start_time": "2024-11-13 08:35:51",
"end_time": "2024-11-13 08:36:23",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,18 +36,18 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 21.1,
"cpus": 147.5,
"gpus": 0,
"memory": 32.09,
"memory": 33.72,
"object_store": 0,
"execution time, min": 0.139
"execution time, min": 0.522
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 32939,
"processing_time": 5.596,
"result_size": 33078,
"processing_time": 4.221,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
Expand Down
Binary file not shown.

0 comments on commit f982ebe

Please sign in to comment.