Skip to content

Commit

Permalink
added Ededup to the notebook
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <[email protected]>
  • Loading branch information
touma-I committed Dec 17, 2024
1 parent d8835f2 commit b6c7b44
Showing 1 changed file with 68 additions and 147 deletions.
215 changes: 68 additions & 147 deletions transforms/transforms-1.0-lang.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 38,
"id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
"metadata": {},
"outputs": [],
Expand All @@ -21,9 +21,17 @@
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "c276c60e",
"metadata": {},
"source": [
"Configure and run web2parquet"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 39,
"id": "b6c89ac7-6824-4d99-8120-7d5b150bd683",
"metadata": {},
"outputs": [],
Expand All @@ -35,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 40,
"id": "c2a12abc-9460-4e45-8961-873b48a9ab19",
"metadata": {},
"outputs": [],
Expand All @@ -50,7 +58,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 25,
"id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
"metadata": {},
"outputs": [],
Expand All @@ -60,43 +68,20 @@
"#glob.glob(\"downloads/*\") "
]
},
{
"cell_type": "markdown",
"id": "bd71fe8a",
"metadata": {},
"source": [
"Configure and run Pdf2Parquet"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "7276fe84-6512-4605-ab65-747351e13a7c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"10:55:10 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': <pdf2parquet_contents_types.MARKDOWN: 'text/markdown'>, 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': <pdf2parquet_ocr_engine.EASYOCR: 'easyocr'>, 'bitmap_area_threshold': 0.05, 'pdf_backend': <pdf2parquet_pdf_backend.DLPARSE_V2: 'dlparse_v2'>, 'double_precision': 8}\n",
"10:55:10 INFO - pipeline id pipeline_id\n",
"10:55:10 INFO - code location None\n",
"10:55:10 INFO - data factory data_ is using local data access: input_folder - downloads output_folder - pdf2parquet-files\n",
"10:55:10 INFO - data factory data_ max_files -1, n_sample -1\n",
"10:55:10 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n",
"10:55:10 INFO - orchestrator pdf2parquet started at 2024-12-14 10:55:10\n",
"10:55:10 INFO - Number of files is 1, source profile {'max_file_size': 5.308699607849121, 'min_file_size': 5.308699607849121, 'total_file_size': 5.308699607849121}\n",
"10:55:10 INFO - Initializing models\n",
"Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 20015.24it/s]\n",
"10:56:06 INFO - Completed 1 files (100.0%) in 0.847 min\n",
"10:56:06 INFO - Done processing 1 files, waiting for flush() completion.\n",
"10:56:06 INFO - done flushing in 0.0 sec\n",
"10:56:07 INFO - Completed execution in 0.941 min, execution result 0\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from dpk_pdf2parquet.transform_python import Pdf2Parquet\n",
"Pdf2Parquet(input_folder= \"downloads\", \n",
Expand All @@ -107,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 29,
"id": "fef6667e-71ed-4054-9382-55c6bb3fda70",
"metadata": {},
"outputs": [],
Expand All @@ -117,30 +102,20 @@
"#table.to_pandas()"
]
},
{
"cell_type": "markdown",
"id": "54cba5c4",
"metadata": {},
"source": [
"Configure and Run DocChunk"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "fe8bf1bc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"10:56:09 INFO - pipeline id pipeline_id\n",
"10:56:09 INFO - code location None\n",
"10:56:09 INFO - data factory data_ is using local data access: input_folder - pdf2parquet-files output_folder - doc-chunk-files\n",
"10:56:09 INFO - data factory data_ max_files -1, n_sample -1\n",
"10:56:09 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"10:56:09 INFO - orchestrator doc_chunk started at 2024-12-14 10:56:09\n",
"10:56:09 INFO - Number of files is 1, source profile {'max_file_size': 0.023062705993652344, 'min_file_size': 0.023062705993652344, 'total_file_size': 0.023062705993652344}\n",
"10:56:09 INFO - Completed 1 files (100.0%) in 0.001 min\n",
"10:56:09 INFO - Done processing 1 files, waiting for flush() completion.\n",
"10:56:09 INFO - done flushing in 0.0 sec\n",
"10:56:09 INFO - Completed execution in 0.001 min, execution result 0\n"
]
}
],
"outputs": [],
"source": [
"%%capture\n",
"from dpk_doc_chunk.transform_python import DocChunk\n",
Expand All @@ -151,7 +126,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 31,
"id": "9d4f7bfc",
"metadata": {},
"outputs": [],
Expand All @@ -161,42 +136,20 @@
"#table.to_pandas()"
]
},
{
"cell_type": "markdown",
"id": "349cf6ff",
"metadata": {},
"source": [
"Configure and Run Exact dedup"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "38480cd5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"10:56:59 INFO - exact dedup params are {'doc_column': 'contents', 'doc_id_column': 'document_id', 'use_snapshot': False, 'snapshot_directory': None}\n",
"10:56:59 INFO - pipeline id pipeline_id\n",
"10:56:59 INFO - code location None\n",
"10:56:59 INFO - data factory data_ is using local data access: input_folder - doc-chunk-files output_folder - dedup-files\n",
"10:56:59 INFO - data factory data_ max_files -1, n_sample -1\n",
"10:56:59 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"10:56:59 INFO - orchestrator ededup started at 2024-12-14 10:56:59\n",
"10:56:59 INFO - Number of files is 1, source profile {'max_file_size': 0.03043651580810547, 'min_file_size': 0.03043651580810547, 'total_file_size': 0.03043651580810547}\n",
"10:56:59 INFO - Starting from the beginning\n",
"10:56:59 INFO - Completed 1 files (100.0%) in 0.0 min\n",
"10:56:59 INFO - Done processing 1 files, waiting for flush() completion.\n",
"10:56:59 INFO - done flushing in 0.0 sec\n",
"10:56:59 INFO - Completed execution in 0.0 min, execution result 0\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from dpk_ededup.transform_python import Ededup\n",
"Ededup(input_folder=\"doc-chunk-files\",\n",
Expand All @@ -207,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 33,
"id": "27e36a8e",
"metadata": {},
"outputs": [],
Expand All @@ -217,42 +170,20 @@
"#table.to_pandas()"
]
},
{
"cell_type": "markdown",
"id": "318bc520",
"metadata": {},
"source": [
"Configure and run Lang Id"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"id": "ad27a462",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"10:57:06 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'contents', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n",
"10:57:06 INFO - pipeline id pipeline_id\n",
"10:57:06 INFO - code location None\n",
"10:57:06 INFO - data factory data_ is using local data access: input_folder - dedup-files output_folder - langId-files\n",
"10:57:06 INFO - data factory data_ max_files -1, n_sample -1\n",
"10:57:06 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"10:57:06 INFO - orchestrator lang_id started at 2024-12-14 10:57:06\n",
"10:57:06 INFO - Number of files is 1, source profile {'max_file_size': 0.031200408935546875, 'min_file_size': 0.031200408935546875, 'total_file_size': 0.031200408935546875}\n",
"Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
"10:57:08 INFO - Completed 1 files (100.0%) in 0.001 min\n",
"10:57:08 INFO - Done processing 1 files, waiting for flush() completion.\n",
"10:57:08 INFO - done flushing in 0.0 sec\n",
"10:57:08 INFO - Completed execution in 0.036 min, execution result 0\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from dpk_lang_id.transform_python import LangId\n",
"LangId(input_folder= \"dedup-files\",\n",
Expand All @@ -265,7 +196,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 35,
"id": "c35cab2e",
"metadata": {},
"outputs": [],
Expand All @@ -275,30 +206,20 @@
"#table.to_pandas()"
]
},
{
"cell_type": "markdown",
"id": "a968dbb4",
"metadata": {},
"source": [
"Configure and run Doc Quality"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"id": "4e84ce78",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"10:57:13 INFO - pipeline id pipeline_id\n",
"10:57:13 INFO - code location None\n",
"10:57:13 INFO - data factory data_ is using local data access: input_folder - dedup-files output_folder - doc-quality-files\n",
"10:57:13 INFO - data factory data_ max_files -1, n_sample -1\n",
"10:57:13 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"10:57:13 INFO - orchestrator docq started at 2024-12-14 10:57:13\n",
"10:57:13 INFO - Number of files is 1, source profile {'max_file_size': 0.031200408935546875, 'min_file_size': 0.031200408935546875, 'total_file_size': 0.031200408935546875}\n",
"10:57:13 INFO - Completed 1 files (100.0%) in 0.003 min\n",
"10:57:13 INFO - Done processing 1 files, waiting for flush() completion.\n",
"10:57:13 INFO - done flushing in 0.0 sec\n",
"10:57:13 INFO - Completed execution in 0.003 min, execution result 0\n"
]
}
],
"outputs": [],
"source": [
"%%capture\n",
"from dpk_doc_quality.transform_python import DocQuality\n",
Expand All @@ -310,7 +231,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 37,
"id": "d98b854f",
"metadata": {},
"outputs": [],
Expand All @@ -323,7 +244,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand Down

0 comments on commit b6c7b44

Please sign in to comment.