Merge pull request #950 from touma-I/relax-fasttext-requirements
Relax fasttext requirements >=0.9.2
touma-I authored Jan 17, 2025
2 parents 725fdf6 + e86ddb9 commit c8096b1
Showing 3 changed files with 181 additions and 24 deletions.
1 change: 1 addition & 0 deletions transforms/README-list.md
@@ -43,6 +43,7 @@ Note: This list includes the transforms that were part of the release starting w

### 1.0.0.a5
Added Pii Redactor
Relax fasttext requirement >= 0.9.2
### 1.0.0.a4
Added missing ray implementation for lang_id, doc_quality, tokenization and filter
Added ray notebooks for lang id, Doc Quality, tokenization, and Filter
202 changes: 179 additions & 23 deletions transforms/language/lang_id/lang_id.ipynb
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
"metadata": {},
"outputs": [],
@@ -24,7 +24,8 @@
"## This is here as a reference only\n",
"# Users and application developers must use the right tag for the latest from pypi\n",
"%pip install data-prep-toolkit\n",
"%pip install 'data-prep-toolkit-transforms[lang_id]'"
"%pip install 'data-prep-toolkit-transforms[lang_id]'\n",
"%pip install pandas"
]
},
{
@@ -55,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
"metadata": {},
"outputs": [],
@@ -73,29 +74,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"00:06:41 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n",
"00:06:41 INFO - pipeline id pipeline_id\n",
"00:06:41 INFO - code location None\n",
"00:06:41 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
"00:06:41 INFO - data factory data_ max_files -1, n_sample -1\n",
"00:06:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"00:06:41 INFO - orchestrator lang_id started at 2024-12-11 00:06:41\n",
"00:06:41 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n",
"Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
"00:06:47 INFO - Completed 1 files (33.33%) in 0.074 min\n",
"00:06:47 INFO - Completed 2 files (66.67%) in 0.076 min\n",
"00:06:48 INFO - Completed 3 files (100.0%) in 0.081 min\n",
"00:06:48 INFO - Done processing 3 files, waiting for flush() completion.\n",
"00:06:48 INFO - done flushing in 0.0 sec\n",
"00:06:48 INFO - Completed execution in 0.111 min, execution result 0\n"
"10:01:42 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n",
"10:01:42 INFO - pipeline id pipeline_id\n",
"10:01:42 INFO - code location None\n",
"10:01:42 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
"10:01:42 INFO - data factory data_ max_files -1, n_sample -1\n",
"10:01:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"10:01:42 INFO - orchestrator lang_id started at 2025-01-17 10:01:42\n",
"10:01:42 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n",
"10:01:43 INFO - Completed 1 files (33.33%) in 0.009 min\n",
"10:01:44 INFO - Completed 2 files (66.67%) in 0.011 min\n",
"10:01:44 INFO - Completed 3 files (100.0%) in 0.013 min\n",
"10:01:44 INFO - Done processing 3 files, waiting for flush() completion.\n",
"10:01:44 INFO - done flushing in 0.0 sec\n",
"10:01:44 INFO - Completed execution in 0.024 min, execution result 0\n"
]
},
{
@@ -104,7 +104,7 @@
"0"
]
},
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -128,7 +128,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "7276fe84-6512-4605-ab65-747351e13a7c",
"metadata": {},
"outputs": [
@@ -141,7 +141,7 @@
" 'output/test_01.parquet']"
]
},
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -153,9 +153,165 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>count()</th>\n",
" <th>lang</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>- Notice of name-email change.doc</td>\n",
" <td>6</td>\n",
" <td>en</td>\n",
" <td>0.858</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>- Nov13ENAOnly.doc</td>\n",
" <td>2</td>\n",
" <td>de</td>\n",
" <td>0.264</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>- OHIO_C~1.XLS</td>\n",
" <td>2</td>\n",
" <td>de</td>\n",
" <td>0.603</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>- Oneok(5-30)final.doc</td>\n",
" <td>1</td>\n",
" <td>vi</td>\n",
" <td>0.152</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>- OpeningBrief.doc</td>\n",
" <td>6</td>\n",
" <td>ko-Hang</td>\n",
" <td>0.365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>195</th>\n",
" <td>- invite.doc</td>\n",
" <td>2</td>\n",
" <td>ro</td>\n",
" <td>0.717</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>- issues wrt portland and calgary signing shor...</td>\n",
" <td>2</td>\n",
" <td>en</td>\n",
" <td>0.997</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>- jan3102.XLS</td>\n",
" <td>2</td>\n",
" <td>de</td>\n",
" <td>0.399</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>- job market.gif</td>\n",
" <td>2</td>\n",
" <td>en</td>\n",
" <td>0.791</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>- kick~1.mpe</td>\n",
" <td>4</td>\n",
" <td>eo</td>\n",
" <td>0.253</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" text count() lang \\\n",
"0 - Notice of name-email change.doc 6 en \n",
"1 - Nov13ENAOnly.doc 2 de \n",
"2 - OHIO_C~1.XLS 2 de \n",
"3 - Oneok(5-30)final.doc 1 vi \n",
"4 - OpeningBrief.doc 6 ko-Hang \n",
".. ... ... ... \n",
"195 - invite.doc 2 ro \n",
"196 - issues wrt portland and calgary signing shor... 2 en \n",
"197 - jan3102.XLS 2 de \n",
"198 - job market.gif 2 en \n",
"199 - kick~1.mpe 4 eo \n",
"\n",
" score \n",
"0 0.858 \n",
"1 0.264 \n",
"2 0.603 \n",
"3 0.152 \n",
"4 0.365 \n",
".. ... \n",
"195 0.717 \n",
"196 0.997 \n",
"197 0.399 \n",
"198 0.791 \n",
"199 0.253 \n",
"\n",
"[200 rows x 4 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"pd.read_parquet('output/test_01.parquet', engine='pyarrow')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7aef6ac9-96cf-40ad-a472-b5d9036436e5",
"metadata": {},
"outputs": [],
"source": []
}
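For reference, the run log in the notebook above points at the fasttext model facebook/fasttext-language-identification pulled from the Hugging Face hub. A minimal sketch of exercising that model directly is shown below; the model filename and the sample sentence are assumptions for illustration only, and this is not the lang_id transform's own implementation.

import fasttext
from huggingface_hub import hf_hub_download

# Download the language-identification model from the Hugging Face hub.
# "model.bin" is assumed to be the artifact name in that repository.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
model = fasttext.load_model(model_path)

# Predict the top language label and its confidence for a short sample text.
labels, scores = model.predict("Bonjour tout le monde", k=1)
print(labels[0], round(float(scores[0]), 3))  # e.g. __label__fra_Latn with a score near 1.0
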
2 changes: 1 addition & 1 deletion transforms/language/lang_id/requirements.txt
@@ -1,4 +1,4 @@
fasttext==0.9.2 ; platform_system != "Windows"
fasttext>=0.9.2 ; platform_system != "Windows"
langcodes>=3.3.0
huggingface-hub >= 0.21.4, <1.0.0
numpy==1.26.4

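With the pin relaxed from fasttext==0.9.2 to fasttext>=0.9.2, one quick way to confirm that the resolved build still meets the floor is sketched below; it assumes the packaging library is available in the environment and is purely illustrative, not part of this repository.

from importlib.metadata import version
from packaging.version import Version

# The lang_id requirements now accept any fasttext release at or above 0.9.2
# on non-Windows platforms, so verify the installed version against that floor.
installed = Version(version("fasttext"))
assert installed >= Version("0.9.2"), f"fasttext {installed} is below the 0.9.2 floor"
print(f"fasttext {installed} satisfies the relaxed >=0.9.2 constraint")
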