Merge pull request #950 from touma-I/relax-fasttext-requirements
Relax fasttext requirements >=0.9.2
touma-I authored Jan 17, 2025
2 parents 725fdf6 + e86ddb9 commit c8096b1
Showing 3 changed files with 181 additions and 24 deletions.
1 change: 1 addition & 0 deletions transforms/README-list.md
@@ -43,6 +43,7 @@ Note: This list includes the transforms that were part of the release starting w

### 1.0.0.a5
Added Pii Redactor
Relax fasttext requirement >= 0.9.2
### 1.0.0.a4
Added missing ray implementation for lang_id, doc_quality, tokenization and filter
Added ray notebooks for lang id, Doc Quality, tokenization, and Filter
202 changes: 179 additions & 23 deletions transforms/language/lang_id/lang_id.ipynb
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
"metadata": {},
"outputs": [],
@@ -24,7 +24,8 @@
"## This is here as a reference only\n",
"# Users and application developers must use the right tag for the latest from pypi\n",
"%pip install data-prep-toolkit\n",
"%pip install 'data-prep-toolkit-transforms[lang_id]'"
"%pip install 'data-prep-toolkit-transforms[lang_id]'\n",
"%pip install pandas"
]
},
{
@@ -55,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
"metadata": {},
"outputs": [],
@@ -73,29 +74,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"00:06:41 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n",
"00:06:41 INFO - pipeline id pipeline_id\n",
"00:06:41 INFO - code location None\n",
"00:06:41 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
"00:06:41 INFO - data factory data_ max_files -1, n_sample -1\n",
"00:06:41 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"00:06:41 INFO - orchestrator lang_id started at 2024-12-11 00:06:41\n",
"00:06:41 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n",
"Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
"00:06:47 INFO - Completed 1 files (33.33%) in 0.074 min\n",
"00:06:47 INFO - Completed 2 files (66.67%) in 0.076 min\n",
"00:06:48 INFO - Completed 3 files (100.0%) in 0.081 min\n",
"00:06:48 INFO - Done processing 3 files, waiting for flush() completion.\n",
"00:06:48 INFO - done flushing in 0.0 sec\n",
"00:06:48 INFO - Completed execution in 0.111 min, execution result 0\n"
"10:01:42 INFO - lang_id parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_kind': 'fasttext', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_lang_column_name': 'lang', 'output_score_column_name': 'score'}\n",
"10:01:42 INFO - pipeline id pipeline_id\n",
"10:01:42 INFO - code location None\n",
"10:01:42 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
"10:01:42 INFO - data factory data_ max_files -1, n_sample -1\n",
"10:01:42 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
"10:01:42 INFO - orchestrator lang_id started at 2025-01-17 10:01:42\n",
"10:01:42 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n",
"10:01:43 INFO - Completed 1 files (33.33%) in 0.009 min\n",
"10:01:44 INFO - Completed 2 files (66.67%) in 0.011 min\n",
"10:01:44 INFO - Completed 3 files (100.0%) in 0.013 min\n",
"10:01:44 INFO - Done processing 3 files, waiting for flush() completion.\n",
"10:01:44 INFO - done flushing in 0.0 sec\n",
"10:01:44 INFO - Completed execution in 0.024 min, execution result 0\n"
]
},
{
@@ -104,7 +104,7 @@
"0"
]
},
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -128,7 +128,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "7276fe84-6512-4605-ab65-747351e13a7c",
"metadata": {},
"outputs": [
@@ -141,7 +141,7 @@
" 'output/test_01.parquet']"
]
},
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -153,9 +153,165 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>count()</th>\n",
" <th>lang</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>- Notice of name-email change.doc</td>\n",
" <td>6</td>\n",
" <td>en</td>\n",
" <td>0.858</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>- Nov13ENAOnly.doc</td>\n",
" <td>2</td>\n",
" <td>de</td>\n",
" <td>0.264</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>- OHIO_C~1.XLS</td>\n",
" <td>2</td>\n",
" <td>de</td>\n",
" <td>0.603</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>- Oneok(5-30)final.doc</td>\n",
" <td>1</td>\n",
" <td>vi</td>\n",
" <td>0.152</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>- OpeningBrief.doc</td>\n",
" <td>6</td>\n",
" <td>ko-Hang</td>\n",
" <td>0.365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>195</th>\n",
" <td>- invite.doc</td>\n",
" <td>2</td>\n",
" <td>ro</td>\n",
" <td>0.717</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>- issues wrt portland and calgary signing shor...</td>\n",
" <td>2</td>\n",
" <td>en</td>\n",
" <td>0.997</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>- jan3102.XLS</td>\n",
" <td>2</td>\n",
" <td>de</td>\n",
" <td>0.399</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>- job market.gif</td>\n",
" <td>2</td>\n",
" <td>en</td>\n",
" <td>0.791</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>- kick~1.mpe</td>\n",
" <td>4</td>\n",
" <td>eo</td>\n",
" <td>0.253</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" text count() lang \\\n",
"0 - Notice of name-email change.doc 6 en \n",
"1 - Nov13ENAOnly.doc 2 de \n",
"2 - OHIO_C~1.XLS 2 de \n",
"3 - Oneok(5-30)final.doc 1 vi \n",
"4 - OpeningBrief.doc 6 ko-Hang \n",
".. ... ... ... \n",
"195 - invite.doc 2 ro \n",
"196 - issues wrt portland and calgary signing shor... 2 en \n",
"197 - jan3102.XLS 2 de \n",
"198 - job market.gif 2 en \n",
"199 - kick~1.mpe 4 eo \n",
"\n",
" score \n",
"0 0.858 \n",
"1 0.264 \n",
"2 0.603 \n",
"3 0.152 \n",
"4 0.365 \n",
".. ... \n",
"195 0.717 \n",
"196 0.997 \n",
"197 0.399 \n",
"198 0.791 \n",
"199 0.253 \n",
"\n",
"[200 rows x 4 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"pd.read_parquet('output/test_01.parquet', engine='pyarrow')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7aef6ac9-96cf-40ad-a472-b5d9036436e5",
"metadata": {},
"outputs": [],
"source": []
}
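For reference, the run log in the notebook above points at the fasttext model facebook/fasttext-language-identification pulled from the Hugging Face hub. A minimal sketch of exercising that model directly is shown below; the model filename and the sample sentence are assumptions for illustration only, and this is not the lang_id transform's own implementation.

import fasttext
from huggingface_hub import hf_hub_download

# Download the language-identification model from the Hugging Face hub.
# "model.bin" is assumed to be the artifact name in that repository.
model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
model = fasttext.load_model(model_path)

# Predict the top language label and its confidence for a short sample text.
labels, scores = model.predict("Bonjour tout le monde", k=1)
print(labels[0], round(float(scores[0]), 3))  # e.g. __label__fra_Latn with a score near 1.0
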
2 changes: 1 addition & 1 deletion transforms/language/lang_id/requirements.txt
@@ -1,4 +1,4 @@
fasttext==0.9.2 ; platform_system != "Windows"
fasttext>=0.9.2 ; platform_system != "Windows"
langcodes>=3.3.0
huggingface-hub >= 0.21.4, <1.0.0
numpy==1.26.4

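With the pin relaxed from fasttext==0.9.2 to fasttext>=0.9.2, one quick way to confirm that the resolved build still meets the floor is sketched below; it assumes the packaging library is available in the environment and is purely illustrative, not part of this repository.

from importlib.metadata import version
from packaging.version import Version

# The lang_id requirements now accept any fasttext release at or above 0.9.2
# on non-Windows platforms, so verify the installed version against that floor.
installed = Version(version("fasttext"))
assert installed >= Version("0.9.2"), f"fasttext {installed} is below the 0.9.2 floor"
print(f"fasttext {installed} satisfies the relaxed >=0.9.2 constraint")
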