Merge branch 'main' into decouple_convert_feature

scribe-org · Oct 24, 2024 · 9b56206 · 9b56206
2 parents f6352dc + eb42856
commit 9b56206
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 17 deletions.
diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py
@@ -25,6 +25,7 @@
 from typing import List, Union
 import os  # For removing the JSON file
 
+from scribe_data.unicode.generate_emoji_keywords import generate_emoji
 from scribe_data.utils import (
     DEFAULT_CSV_EXPORT_DIR,
     DEFAULT_JSON_EXPORT_DIR,
@@ -105,17 +106,7 @@ def get_data(
     # MARK: Emojis
 
     elif data_type in {"emoji-keywords", "emoji_keywords"}:
-        for lang in languages:
-            emoji_keyword_extraction_script = (
-                Path(__file__).parent.parent
-                / "language_data_extraction"
-                / lang
-                / "emoji_keywords"
-                / "generate_emoji_keywords.py"
-            )
-            subprocess_result = subprocess.run(
-                ["python", emoji_keyword_extraction_script]
-            )
+        generate_emoji(language=language, output_dir=output_dir)
 
     # MARK: Query Data
 

diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py
@@ -237,6 +237,9 @@ def main() -> None:
 
     args = parser.parse_args()
 
+    if args.data_type and isinstance(args.data_type, str):
+        args.data_type = args.data_type.replace("-", "_")
+
     try:
         if args.language or args.data_type:
             validate_language_and_data_type(

diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py
@@ -0,0 +1,59 @@
+"""
+Centralized keyword-emoji generation file that generates emoji keywords for a specified language.
+
+.. raw:: html
+    <!--
+    * Copyright (C) 2024 Scribe
+    *
+    * This program is free software: you can redistribute it and/or modify
+    * it under the terms of the GNU General Public License as published by
+    * the Free Software Foundation, either version 3 of the License, or
+    * (at your option) any later version.
+    *
+    * This program is distributed in the hope that it will be useful,
+    * but WITHOUT ANY WARRANTY; without even the implied warranty of
+    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    * GNU General Public License for more details.
+    *
+    * You should have received a copy of the GNU General Public License
+    * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+    -->
+"""
+
+import os
+from pathlib import Path
+
+from scribe_data.unicode.process_unicode import gen_emoji_lexicon
+from scribe_data.utils import export_formatted_data, get_language_iso
+
+DATA_TYPE = "emoji-keywords"
+EMOJI_KEYWORDS_DICT = 3
+
+
+def generate_emoji(language, output_dir: str = None):
+    """Export emoji keywords for ``language`` if CLDR annotations cover its ISO code.
+
+    Prints whether the language is supported and returns ``None`` either way.
+    ``output_dir`` is effectively required: ``None`` raises ``ValueError``.
+    """
+    iso = get_language_iso(language=language)
+    annotations_dir = Path(__file__).parent / "cldr-annotations-full" / "annotations"
+    if iso not in os.listdir(annotations_dir):
+        print(f"Emoji Generation for language {language} is not supported")
+        return
+
+    print(f"Emoji Generation for language {language} is supported")
+    if output_dir is None:
+        raise ValueError("output_dir is required to export emoji keywords.")
+
+    # Drop a leading "./" before creating the per-language export directory.
+    updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
+    (Path(updated_path) / language.capitalize()).mkdir(parents=True, exist_ok=True)
+
+    if emoji_keywords_dict := gen_emoji_lexicon(
+        language=language, emojis_per_keyword=EMOJI_KEYWORDS_DICT
+    ):
+        export_formatted_data(
+            file_path=output_dir, formatted_data=emoji_keywords_dict,
+            query_data_in_use=True, language=language, data_type=DATA_TYPE,
+        )
diff --git a/src/scribe_data/unicode/process_unicode.py b/src/scribe_data/unicode/process_unicode.py
@@ -76,7 +76,9 @@ def gen_emoji_lexicon(
     # Pre-set up the emoji popularity data.
     popularity_dict = {}
 
-    with (Path(__file__).parent / "2021_ranked.tsv").open() as popularity_file:
+    with (Path(__file__).parent / "2021_ranked.tsv").open(
+        encoding="utf-8"
+    ) as popularity_file:
         tsv_reader = csv.DictReader(popularity_file, delimiter="\t")
         for tsv_row in tsv_reader:
             popularity_dict[tsv_row["Emoji"]] = int(tsv_row["Rank"])
@@ -93,6 +95,7 @@ def gen_emoji_lexicon(
         / f"{iso}"
         / "annotations.json"
     )
+
     annotations_derived_file_path = (
         Path(__file__).parent
         / "cldr-annotations-derived-full"
@@ -107,7 +110,7 @@ def gen_emoji_lexicon(
     }
 
     for cldr_file_key, cldr_file_path in cldr_file_paths.items():
-        with open(cldr_file_path, "r") as file:
+        with open(cldr_file_path, "r", encoding="utf-8") as file:
             cldr_data = json.load(file)
 
         cldr_dict = cldr_data[cldr_file_key]["annotations"]

diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py
@@ -29,10 +29,15 @@
 class TestGetData(unittest.TestCase):
     # MARK: Subprocess Patching
 
-    @patch("subprocess.run")
-    def test_get_emoji_keywords(self, mock_subprocess_run):
-        get_data(language="English", data_type="emoji-keywords")
-        self.assertTrue(mock_subprocess_run.called)
+    @patch("scribe_data.cli.get.generate_emoji")
+    def test_get_emoji_keywords(self, generate_emoji):
+        # Emoji keyword requests must reach generate_emoji with the same kwargs.
+        forwarded_kwargs = {
+            "language": "English",
+            "output_dir": "./test_output",
+        }
+        get_data(data_type="emoji_keywords", **forwarded_kwargs)
+        generate_emoji.assert_called_once_with(**forwarded_kwargs)
 
     # MARK: Invalid Arguments