Skip to content

Commit

Permalink
Merge branch 'main' into decouple_convert_feature
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis authored Oct 24, 2024
2 parents f6352dc + eb42856 commit 9b56206
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 17 deletions.
13 changes: 2 additions & 11 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from typing import List, Union
import os # For removing the JSON file

from scribe_data.unicode.generate_emoji_keywords import generate_emoji
from scribe_data.utils import (
DEFAULT_CSV_EXPORT_DIR,
DEFAULT_JSON_EXPORT_DIR,
Expand Down Expand Up @@ -105,17 +106,7 @@ def get_data(
# MARK: Emojis

elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
emoji_keyword_extraction_script = (
Path(__file__).parent.parent
/ "language_data_extraction"
/ lang
/ "emoji_keywords"
/ "generate_emoji_keywords.py"
)
subprocess_result = subprocess.run(
["python", emoji_keyword_extraction_script]
)
generate_emoji(language=language, output_dir=output_dir)

# MARK: Query Data

Expand Down
3 changes: 3 additions & 0 deletions src/scribe_data/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,9 @@ def main() -> None:

args = parser.parse_args()

if args.data_type and isinstance(args.data_type, str):
args.data_type = args.data_type.replace("-", "_")

try:
if args.language or args.data_type:
validate_language_and_data_type(
Expand Down
59 changes: 59 additions & 0 deletions src/scribe_data/unicode/generate_emoji_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Centralized keyword-emoji generation file to generate emojis for a specified language.
.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import os
from pathlib import Path

from scribe_data.unicode.process_unicode import gen_emoji_lexicon
from scribe_data.utils import export_formatted_data, get_language_iso

# Data type label handed to export_formatted_data when the emoji keywords are exported.
DATA_TYPE = "emoji-keywords"
# Value passed as ``emojis_per_keyword`` to gen_emoji_lexicon.
# NOTE(review): presumably the maximum number of emojis kept per keyword - confirm
# against gen_emoji_lexicon.
EMOJI_KEYWORDS_DICT = 3


def generate_emoji(language: str, output_dir: str = None) -> None:
    """
    Generate the emoji keyword lexicon for a language and export it.

    The language is only processed when a CLDR annotations entry matching its
    ISO code exists next to this module; otherwise a message is printed and
    nothing is written.

    Parameters
    ----------
        language : str
            The name of the language to generate emoji keywords for.

        output_dir : str, optional
            The directory under which a sub-directory named after the
            capitalized language is created and which is passed on to
            ``export_formatted_data``. When omitted, the project's
            ``DEFAULT_JSON_EXPORT_DIR`` is used.

    Returns
    -------
        None
    """
    iso = get_language_iso(language=language)
    path_to_cldr_annotations = (
        Path(__file__).parent / "cldr-annotations-full" / "annotations"
    )

    # Guard clause: bail out early for languages without CLDR annotations.
    if iso not in os.listdir(path_to_cldr_annotations):
        print(f"Emoji Generation for language {language} is not supported")
        return

    print(f"Emoji Generation for language {language} is supported")

    if output_dir is None:
        # The signature advertises None as the default, so it must not crash on
        # the string handling below. Imported locally to keep this fix
        # self-contained within the function.
        from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR

        output_dir = str(DEFAULT_JSON_EXPORT_DIR)

    # Drop a leading "./" so the export directory is built from a clean relative path.
    updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir
    export_dir = Path(updated_path) / language.capitalize()
    export_dir.mkdir(parents=True, exist_ok=True)

    # Only export when the lexicon generation produced data.
    if emoji_keywords_dict := gen_emoji_lexicon(
        language=language,
        emojis_per_keyword=EMOJI_KEYWORDS_DICT,
    ):
        export_formatted_data(
            file_path=output_dir,
            formatted_data=emoji_keywords_dict,
            query_data_in_use=True,
            language=language,
            data_type=DATA_TYPE,
        )
7 changes: 5 additions & 2 deletions src/scribe_data/unicode/process_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def gen_emoji_lexicon(
# Pre-set up the emoji popularity data.
popularity_dict = {}

with (Path(__file__).parent / "2021_ranked.tsv").open() as popularity_file:
with (Path(__file__).parent / "2021_ranked.tsv").open(
encoding="utf-8"
) as popularity_file:
tsv_reader = csv.DictReader(popularity_file, delimiter="\t")
for tsv_row in tsv_reader:
popularity_dict[tsv_row["Emoji"]] = int(tsv_row["Rank"])
Expand All @@ -93,6 +95,7 @@ def gen_emoji_lexicon(
/ f"{iso}"
/ "annotations.json"
)

annotations_derived_file_path = (
Path(__file__).parent
/ "cldr-annotations-derived-full"
Expand All @@ -107,7 +110,7 @@ def gen_emoji_lexicon(
}

for cldr_file_key, cldr_file_path in cldr_file_paths.items():
with open(cldr_file_path, "r") as file:
with open(cldr_file_path, "r", encoding="utf-8") as file:
cldr_data = json.load(file)

cldr_dict = cldr_data[cldr_file_key]["annotations"]
Expand Down
13 changes: 9 additions & 4 deletions tests/cli/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,15 @@
class TestGetData(unittest.TestCase):
# MARK: Subprocess Patching

@patch("subprocess.run")
def test_get_emoji_keywords(self, mock_subprocess_run):
get_data(language="English", data_type="emoji-keywords")
self.assertTrue(mock_subprocess_run.called)
@patch("scribe_data.cli.get.generate_emoji")
def test_get_emoji_keywords(self, generate_emoji):
get_data(
language="English", data_type="emoji_keywords", output_dir="./test_output"
)
generate_emoji.assert_called_once_with(
language="English",
output_dir="./test_output",
)

# MARK: Invalid Arguments

Expand Down

0 comments on commit 9b56206

Please sign in to comment.