Skip to content

Commit

Permalink
Merge branch 'dev' into remote-image-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
touma-I committed Dec 20, 2024
2 parents 31fb868 + 62e3f97 commit 947e1dc
Show file tree
Hide file tree
Showing 198 changed files with 1,157 additions and 2,931 deletions.
35 changes: 17 additions & 18 deletions transforms/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "data_prep_toolkit_transforms"
version = "1.0.0a0"
version = "1.0.0a1"
requires-python = ">=3.10,<3.13"
keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
description = "Data Preparation Toolkit Transforms using Ray"
Expand Down Expand Up @@ -32,52 +32,49 @@ all = { file = [
##### pii_redactor seem to be failing UT
## "language/pii_redactor/python/requirements.txt",

"universal/fdedup/python/requirements.txt",
"universal/profiler/python/requirements.txt",
"universal/filter/python/requirements.txt",
"universal/resize/python/requirements.txt",
"universal/tokenization/python/requirements.txt",

"language/lang_id/requirements.txt",
"language/doc_quality/requirements.txt",
"language/pdf2parquet/requirements.txt",
"language/doc_chunk/requirements.txt",
"language/text_encoder/requirements.txt",

##### Cannot have html2parquet until we solve
## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1
## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
## "language/html2parquet/requirements.txt",
"language/lang_id/requirements.txt",
"language/text_encoder/requirements.txt",
"language/pdf2parquet/requirements.txt",

"universal/doc_id/requirements.txt",
"universal/ededup/requirements.txt",
"universal/fdedup/requirements.txt",
"universal/hap/requirements.txt",
"universal/tokenization/requirements.txt",
"universal/web2parquet/requirements.txt"
]}

language = { file = [
##### pii_redactor seem to be failing UT
## "language/pii_redactor/python/requirements.txt",
"language/lang_id/python/requirements.txt",
"language/text_encoder/requirements.txt",

"universal/hap/python/requirements.txt",
"universal/tokenization/python/requirements.txt",
"universal/ededup/requirements.txt",
"universal/fdedup/python/requirements.txt",

"language/lang_id/requirements.txt",
"language/doc_quality/requirements.txt",
"language/pdf2parquet/requirements.txt",
"language/doc_chunk/requirements.txt",
"language/text_encoder/requirements.txt",

##### Cannot have html2parquet until we solve
## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1
## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
## "language/html2parquet/requirements.txt",
"language/lang_id/requirements.txt",
"language/text_encoder/requirements.txt",
"language/pdf2parquet/requirements.txt",

"universal/doc_id/requirements.txt",
"universal/ededup/requirements.txt",
"universal/fdedup/requirements.txt",
"universal/hap/requirements.txt",
"universal/tokenization/requirements.txt",
"universal/web2parquet/requirements.txt"
]}

Expand All @@ -93,11 +90,9 @@ code_profiler = { file = ["code/code_profiler/python/requirements.txt"]}

pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]}

fdedup = { file = ["universal/fdedup/python/requirements.txt"]}
profiler = { file = ["universal/profiler/python/requirements.txt"]}
filter = { file = ["universal/filter/python/requirements.txt"]}
resize = { file = ["universal/resize/python/requirements.txt"]}
tokenization = { file = ["universal/tokenization/python/requirements.txt"]}

######## Named transforms
doc_chunk = { file = ["language/doc_chunk/requirements.txt"]}
Expand All @@ -110,6 +105,8 @@ text_encoder = { file = ["language/text_encoder/requirements.txt"]}
doc_id = { file = ["universal/doc_id/requirements.txt"]}
hap = { file = ["universal/hap/requirements.txt"]}
ededup = { file = ["universal/ededup/requirements.txt"]}
fdedup = { file = ["universal/fdedup/requirements.txt"]}
tokenization = { file = ["universal/tokenization/requirements.txt"]}

web2parquet = { file = ["universal/web2parquet/requirements.txt"]}

Expand All @@ -130,6 +127,8 @@ dpk_text_encoder = "language/text_encoder/dpk_text_encoder"
dpk_doc_id = "universal/doc_id/dpk_doc_id"
dpk_hap = "universal/hap/dpk_hap"
dpk_ededup = "universal/ededup/dpk_ededup"
dpk_fdedup = "universal/fdedup/dpk_fdedup"
dpk_tokenization = "universal/tokenization/dpk_tokenization"

#[tool.setuptools.package-data]
#"*" = ["*.txt"]
Expand Down
139 changes: 110 additions & 29 deletions transforms/transforms-1.0-lang.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": null,
"id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"!pip install 'data-prep-toolkit-transforms[language]==1.0.0a0'\n",
"!pip install 'data-prep-toolkit-transforms[language]==1.0.0a1'\n",
"import pyarrow.parquet as pq\n",
"import pandas as pd"
]
Expand All @@ -31,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": null,
"id": "b6c89ac7-6824-4d99-8120-7d5b150bd683",
"metadata": {},
"outputs": [],
Expand All @@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": null,
"id": "c2a12abc-9460-4e45-8961-873b48a9ab19",
"metadata": {},
"outputs": [],
Expand All @@ -53,19 +53,19 @@
"Web2Parquet(urls= ['https://arxiv.org/pdf/2408.09869'],\n",
" depth=2, \n",
" downloads=10,\n",
" folder='downloads').transform()\n"
" folder='files-web2parquet').transform()\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
"metadata": {},
"outputs": [],
"source": [
"##### **** The specified downloads folder will include the downloaded file(s).\n",
"#import glob\n",
"#glob.glob(\"downloads/*\") "
"#glob.glob(\"files-web2parquet/*\") "
]
},
{
Expand All @@ -84,21 +84,21 @@
"outputs": [],
"source": [
"from dpk_pdf2parquet.transform_python import Pdf2Parquet\n",
"Pdf2Parquet(input_folder= \"downloads\", \n",
" output_folder= \"pdf2parquet-files\", \n",
"Pdf2Parquet(input_folder= \"files-web2parquet\", \n",
" output_folder= \"files-pdf2parquet\", \n",
" data_files_to_use=['.pdf'],\n",
" pdf2parquet_contents_type='text/markdown').transform()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": null,
"id": "fef6667e-71ed-4054-9382-55c6bb3fda70",
"metadata": {},
"outputs": [],
"source": [
"##### **** To explore the output from pdf2parquet, run the code below\n",
"#table = pq.read_table('pdf2parquet-files/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table = pq.read_table('files-pdf2parquet/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table.to_pandas()"
]
},
Expand All @@ -119,20 +119,20 @@
"source": [
"%%capture\n",
"from dpk_doc_chunk.transform_python import DocChunk\n",
"DocChunk(input_folder='pdf2parquet-files',\n",
" output_folder='doc-chunk-files',\n",
"DocChunk(input_folder='files-pdf2parquet',\n",
" output_folder='files-doc-chunk',\n",
" doc_chunk_chunking_type= \"li_markdown\").transform()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": null,
"id": "9d4f7bfc",
"metadata": {},
"outputs": [],
"source": [
"##### **** To explore the output from doc-chunk, run the code below\n",
"#table = pq.read_table('doc-chunk-files/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table = pq.read_table('files-doc-chunk/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table.to_pandas()"
]
},
Expand All @@ -152,21 +152,21 @@
"outputs": [],
"source": [
"from dpk_ededup.transform_python import Ededup\n",
"Ededup(input_folder=\"doc-chunk-files\",\n",
" output_folder=\"dedup-files\",\n",
"Ededup(input_folder=\"files-doc-chunk\",\n",
" output_folder=\"files-ededup\",\n",
" ededup_doc_column=\"contents\",\n",
" ededup_doc_id_column=\"document_id\").transform()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": null,
"id": "27e36a8e",
"metadata": {},
"outputs": [],
"source": [
"##### **** To explore the output from eDedup, run the code below\n",
"#table = pq.read_table('dedup-files/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table = pq.read_table('files-ededup/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table.to_pandas()"
]
},
Expand All @@ -175,7 +175,7 @@
"id": "318bc520",
"metadata": {},
"source": [
"Configure and run Land Id"
"Configure and run Lang Id"
]
},
{
Expand All @@ -186,8 +186,8 @@
"outputs": [],
"source": [
"from dpk_lang_id.transform_python import LangId\n",
"LangId(input_folder= \"dedup-files\",\n",
" output_folder= \"langId-files\",\n",
"LangId(input_folder= \"files-ededup\",\n",
" output_folder= \"files-langId\",\n",
" lang_id_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n",
" lang_id_model_kind= \"fasttext\",\n",
" lang_id_model_url= \"facebook/fasttext-language-identification\",\n",
Expand All @@ -196,13 +196,13 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": null,
"id": "c35cab2e",
"metadata": {},
"outputs": [],
"source": [
"##### **** To explore the output from langId, run the code below\n",
"#table = pq.read_table('langId-files/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table = pq.read_table('files-langId/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table.to_pandas()"
]
},
Expand All @@ -223,28 +223,109 @@
"source": [
"%%capture\n",
"from dpk_doc_quality.transform_python import DocQuality\n",
"DocQuality(input_folder='dedup-files',\n",
" output_folder= 'doc-quality-files',\n",
"DocQuality(input_folder='files-doc-chunk',\n",
" output_folder= 'files-doc-quality',\n",
" docq_text_lang = \"en\",\n",
" docq_doc_content_column =\"contents\").transform()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": null,
"id": "d98b854f",
"metadata": {},
"outputs": [],
"source": [
"##### **** To explore the output from Doc Quality, run the code below\n",
"#table = pq.read_table('doc-quality-files/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table = pq.read_table('files-doc-quality/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table.to_pandas()"
]
},
{
"cell_type": "markdown",
"id": "a6230859",
"metadata": {},
"source": [
"Run tokenization"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b454872",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"from dpk_tokenization.transform_python import Tokenization\n",
"Tokenization(input_folder= \"files-doc-quality\",\n",
" output_folder= \"files-tokenization\",\n",
" tkn_tokenizer= \"hf-internal-testing/llama-tokenizer\",\n",
" tkn_chunk_size= 20_000).transform()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd9bf9f3-3c59-4efb-9d7e-0201d6d8d783",
"metadata": {},
"outputs": [],
"source": [
"##### **** To explore the output from Tokenization, run the code below\n",
"#table = pq.read_table('files-tokenization/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table.to_pandas()"
]
},
{
"cell_type": "markdown",
"id": "b33d6a5e-453a-46c8-ab70-727cd72af973",
"metadata": {},
"source": [
"Configure and Run Fuzzy dedup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41813816-c412-4dd7-b762-1d9b37a34123",
"metadata": {},
"outputs": [],
"source": [
"from dpk_fdedup.transform_python import Fdedup\n",
"Fdedup(input_folder='files-doc-chunk',\n",
" output_folder='files-fdedup',\n",
" contents_column= \"contents\",\n",
" document_id_column= \"document_id\",\n",
" num_permutations= 112,\n",
" num_bands= 14,\n",
" num_minhashes_per_band= 8,\n",
" operation_mode=\"filter_duplicates\").transform()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5bdba762",
"metadata": {},
"outputs": [],
"source": [
"##### **** To explore the output from Fdedup, run the code below\n",
"#table = pq.read_table('files-fdedup/arxiv_org_2408.09869v5.pdf_application.parquet')\n",
"#table.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45551a89-bf18-494a-96a7-d3a67ae25189",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand Down
Loading

0 comments on commit 947e1dc

Please sign in to comment.