Skip to content

Commit

Permalink
fix uint64 hash to pyarrow
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Nov 11, 2024
1 parent 723e675 commit 4999604
Showing 1 changed file with 2 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import filetype
import pandas as pd
import pyarrow as pa
+import numpy as np
from data_processing.transform import AbstractBinaryTransform, TransformConfiguration
from data_processing.utils import TransformUtils, get_logger, str2bool
from data_processing.utils.cli_utils import CLIArgumentProvider
Expand Down Expand Up @@ -237,7 +238,7 @@ def _convert_pdf2parquet(
num_pages = len(doc.pages)
num_tables = len(doc.tables)
num_doc_elements = len(doc.texts)
-document_hash = doc.origin.binary_hash
+document_hash = np.uint64(doc.origin.binary_hash)

self._update_metrics(num_pages=num_pages, elapse_time=elapse_time)

Expand Down

0 comments on commit 4999604

Please sign in to comment.