From 0f6d235eec952890fc862f7d8b8c5289b04508eb Mon Sep 17 00:00:00 2001 From: john-thuo1 Date: Sun, 13 Oct 2024 17:58:20 +0300 Subject: [PATCH 01/13] feat : Functionality to convert json/csv&tsv files --- src/scribe_data/cli/convert.py | 398 ++++++++++++++++++++------------- src/scribe_data/cli/main.py | 82 +++++-- tests/cli/test_convert.py | 61 +++-- 3 files changed, 358 insertions(+), 183 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index aa24b08da..d49762536 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -24,32 +24,47 @@ import json import shutil from pathlib import Path +from typing import List, Union from scribe_data.cli.cli_utils import language_map from scribe_data.load.data_to_sqlite import data_to_sqlite from scribe_data.utils import ( DEFAULT_SQLITE_EXPORT_DIR, + DEFAULT_JSON_EXPORT_DIR, + DEFAULT_CSV_EXPORT_DIR, + DEFAULT_TSV_EXPORT_DIR, get_language_iso, ) # MARK: JSON -def export_json( - language: str, data_type: str, output_dir: Path, overwrite: bool +def convert_to_json( + language: str, + data_type: Union[str, List[str]], + output_type: str, + input_file: str, + output_dir: str = None, + overwrite: bool = False, ) -> None: """ - Export a JSON file from the CLI process. + Convert a CSV/TSV file to JSON. Parameters ---------- language : str The language of the file to convert. - data_type : str - The data type to of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. + + output_type : str + The output format, should be "json". + + input_file : str + The input CSV/TSV file path. - output_dir : str + output_dir : Path The output directory path for results. overwrite : bool @@ -57,121 +72,182 @@ def export_json( Returns ------- - A JSON file saved in the given location. 
+ None """ normalized_language = language_map.get(language.lower()) if not normalized_language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") - data_type = data_type[0] if isinstance(data_type, list) else data_type - data_file = ( - output_dir / normalized_language["language"].capitalize() / f"{data_type}.json" - ) + data_types = [data_type] if isinstance(data_type, str) else data_type - print(data_file) + if output_dir is None: + output_dir = DEFAULT_JSON_EXPORT_DIR - if not data_file.exists(): - print( - f"No data found for language '{normalized_language['language']}' and data type '{data_type}'." - ) - return + json_output_dir = Path(output_dir) / normalized_language["language"].capitalize() + json_output_dir.mkdir(parents=True, exist_ok=True) - try: - with data_file.open("r", encoding="utf-8") as file: - data = json.load(file) + for dtype in data_types: + input_file_path = Path(input_file) - except (IOError, json.JSONDecodeError) as e: - print(f"Error reading '{data_file}': {e}") - return + if not input_file_path.exists(): + print(f"No data found for input file '{input_file_path}'.") + continue - json_output_dir = output_dir / normalized_language["language"].capitalize() - json_output_dir.mkdir(parents=True, exist_ok=True) + delimiter = "," if input_file_path.suffix.lower() == ".csv" else "\t" - output_file = json_output_dir / f"{data_type}.json" - if output_file.exists() and not overwrite: - user_input = input(f"File '{output_file}' already exists. Overwrite? 
(y/n): ") - if user_input.lower() != "y": - print(f"Skipping {normalized_language['language']} - {data_type}") - return + try: + with input_file_path.open("r", encoding="utf-8") as file: + reader = csv.DictReader(file, delimiter=delimiter) + rows = list(reader) + + if not rows: + print(f"No data found in '{input_file_path}'.") + continue + + # Use the first row to inspect column headers + first_row = rows[0] + keys = list(first_row.keys()) + data = {} + + if len(keys) == 1: + # Handle Case: { key: None } + data[first_row[keys[0]]] = None + + elif len(keys) == 2: + # Handle Case: { key: value } + for row in rows: + key = row[keys[0]] + value = row[keys[1]] + data[key] = value + + elif len(keys) > 2: + if all(col in first_row for col in ["emoji", "is_base", "rank"]): + # Handle Case: { key: [ { emoji: ..., is_base: ..., rank: ... }, { emoji: ..., is_base: ..., rank: ... } ] } + for row in rows: + key = row.get(reader.fieldnames[0]) + emoji = row.get("emoji", "").strip() + is_base = ( + row.get("is_base", "false").strip().lower() == "true" + ) + rank = row.get("rank", None) + rank = int(rank) if rank and rank.isdigit() else None + + entry = {"emoji": emoji, "is_base": is_base, "rank": rank} + + if key not in data: + data[key] = [] + data[key].append(entry) + + else: + # Handle Case: { key: { value1: ..., value2: ... } } + for row in rows: + data[row[keys[0]]] = {k: row[k] for k in keys[1:]} + + except (IOError, csv.Error) as e: + print(f"Error reading '{input_file_path}': {e}") + continue - try: - with output_file.open("w") as file: - json.dump(data, file, indent=0) + # Define output file path + output_file = json_output_dir / f"{dtype}.{output_type}" - except IOError as e: - raise IOError(f"Error writing to '{output_file}': {e}") from e + if output_file.exists() and not overwrite: + user_input = input( + f"File '{output_file}' already exists. Overwrite? 
(y/n): " + ) + if user_input.lower() != "y": + print(f"Skipping {normalized_language['language']} - {dtype}") + continue - print( - f"Data for {normalized_language['language'].capitalize()} {data_type} written to {output_file}" - ) + try: + with output_file.open("w", encoding="utf-8") as file: + json.dump(data, file, ensure_ascii=False, indent=2) + + except IOError as e: + print(f"Error writing to '{output_file}': {e}") + continue + + print( + f"Data for {normalized_language['language'].capitalize()} {dtype} written to {output_file}" + ) +# # MARK: CSV or TSV def convert_to_csv_or_tsv( language: str, - data_type: list, - output_dir: Path, - overwrite: bool, + data_type: Union[str, List[str]], output_type: str, + input_file: str, + output_dir: str = None, + overwrite: bool = False, ) -> None: """ - Converts a Scribe-Data output file to a CSV or TSV file. + Convert a JSON File to CSV/TSV file. Parameters ---------- - output_type : str - The file type to convert to (CSV or TSV). + language : str + The language of the file to convert. - language : str - The language of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. - data_type : str - The data type to of the file to convert. + output_type : str + The output format, should be "csv" or "tsv". - output_dir : str - The output directory path for results. + input_file : str + The input JSON file path. - overwrite : bool - Whether to overwrite existing files. + output_dir : str + The output directory path for results. + + overwrite : bool + Whether to overwrite existing files. Returns ------- - A CSV or TSV file saved in the given location. + None """ + + # Normalize the language normalized_language = language_map.get(language.lower()) if not normalized_language: print(f"Language '{language}' is not recognized.") return - for dtype in data_type: - # Replace non-JSON default paths with JSON path for where exported data is. 
- file_path = ( - Path( - str(output_dir) - .replace("scribe_data_csv_export", "scribe_data_json_export") - .replace("scribe_data_tsv_export", "scribe_data_json_export") - ) - / normalized_language["language"].capitalize() - / f"{dtype}.json" - ) - if not file_path.exists(): - raise FileNotFoundError( - f"No data found for {dtype} conversion at '{file_path}'." - ) + # Split the data_type string by commas + data_types = [dtype.strip() for dtype in data_type.split(",")] + + for dtype in data_types: + input_file = Path(input_file) + if not input_file.exists(): + print(f"No data found for {dtype} conversion at '{input_file}'.") + continue try: - with file_path.open("r", encoding="utf-8") as f: + with input_file.open("r", encoding="utf-8") as f: data = json.load(f) except (IOError, json.JSONDecodeError) as e: - print(f"Error reading '{file_path}': {e}") + print(f"Error reading '{input_file}': {e}") continue + # Determine the delimiter based on output type delimiter = "," if output_type == "csv" else "\t" - final_output_dir = output_dir / normalized_language["language"].capitalize() + + if output_dir is None: + output_dir = ( + DEFAULT_CSV_EXPORT_DIR + if output_type == "csv" + else DEFAULT_TSV_EXPORT_DIR + ) + + final_output_dir = ( + Path(output_dir) / normalized_language["language"].capitalize() + ) final_output_dir.mkdir(parents=True, exist_ok=True) output_file = final_output_dir / f"{dtype}.{output_type}" @@ -186,19 +262,67 @@ def convert_to_csv_or_tsv( try: with output_file.open("w", newline="", encoding="utf-8") as file: writer = csv.writer(file, delimiter=delimiter) - if isinstance(data, dict): - writer.writerow(data.keys()) - writer.writerow(data.values()) - - elif isinstance(data, list): - for item in data: - if isinstance(item, dict): - writer.writerow(item.values()) - else: - writer.writerow([item]) - else: - print(f"Unsupported data format for {output_type} export.") + # Handle different JSON structures based on the format + if isinstance(data, dict): + 
first_key = list(data.keys())[0] + + if isinstance(data[first_key], dict): + # Handle case: { key: { value1: ..., value2: ... } } + columns = set() + for value in data.values(): + columns.update(value.keys()) + writer.writerow([dtype[:-1]] + list(columns)) + + for key, value in data.items(): + row = [key] + [value.get(col, "") for col in columns] + writer.writerow(row) + + elif isinstance(data[first_key], list): + if all(isinstance(item, dict) for item in data[first_key]): + # Handle case: { key: [ { value1: ..., value2: ... } ] } + if "emoji" in data[first_key][0]: # Emoji specific case + columns = ["word", "emoji", "is_base", "rank"] + writer.writerow(columns) + + for key, value in data.items(): + for item in value: + row = [ + key, + item.get("emoji", ""), + item.get("is_base", ""), + item.get("rank", ""), + ] + writer.writerow(row) + else: + columns = [dtype[:-1]] + list(data[first_key][0].keys()) + writer.writerow(columns) + + for key, value in data.items(): + for item in value: + row = [key] + [ + item.get(col, "") for col in columns[1:] + ] + writer.writerow(row) + + elif all(isinstance(item, str) for item in data[first_key]): + # Handle case: { key: [value1, value2, ...] } + writer.writerow( + [dtype[:-1]] + + [ + f"autosuggestion_{i+1}" + for i in range(len(data[first_key])) + ] + ) + for key, value in data.items(): + row = [key] + value + writer.writerow(row) + + else: + # Handle case: { key: value } + writer.writerow([dtype[:-1], "value"]) + for key, value in data.items(): + writer.writerow([key, value]) except IOError as e: print(f"Error writing to '{output_file}': {e}") @@ -213,8 +337,10 @@ def convert_to_csv_or_tsv( def convert_to_sqlite( language: str, data_type: str, - output_dir: Path, - overwrite: bool, + output_type: str, + input_file: str = None, + output_dir: str = None, + overwrite: bool = False, ) -> None: """ Converts a Scribe-Data output file to an SQLite file. @@ -225,9 +351,15 @@ def convert_to_sqlite( The language of the file to convert. 
data_type : str - The data type to of the file to convert. + The data type of the file to convert. + + output_type : str + The output format, should be "sqlite". - output_dir : str + input_file : Path + The input file path for the data to be converted. + + output_dir : Path The output directory path for results. overwrite : bool @@ -240,80 +372,38 @@ def convert_to_sqlite( if not language: raise ValueError("Language must be specified for SQLite conversion.") + if input_file: + input_file = Path(input_file) + if not input_file.exists(): + raise ValueError(f"Input file does not exist: {input_file}") + languages = [language] specific_tables = [data_type] if data_type else None - if output_dir: + if output_dir is None: + output_dir = Path(DEFAULT_SQLITE_EXPORT_DIR) + else: output_dir = Path(output_dir) - if not output_dir.exists(): - output_dir.mkdir(parents=True, exist_ok=True) - print(f"Converting data for language: {language}, data type: {data_type} to SQLite") - data_to_sqlite(languages, specific_tables) + if not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) - if output_dir: - source_file = f"{get_language_iso(language).upper()}LanguageData.sqlite" - source_path = Path(DEFAULT_SQLITE_EXPORT_DIR) / source_file - target_path = output_dir / source_file - if source_path.exists(): - if target_path.exists() and not overwrite: - print(f"File {target_path} already exists. Use --overwrite to replace.") + print( + f"Converting data for language: {language}, data type: {data_type} to {output_type}" + ) + data_to_sqlite(languages, specific_tables) - else: - shutil.copy(source_path, target_path) - print(f"SQLite database copied to: {target_path}") + source_file = f"{get_language_iso(language).upper()}LanguageData.sqlite" + source_path = input_file.parent / source_file + target_path = output_dir / source_file + if source_path.exists(): + if target_path.exists() and not overwrite: + print(f"File {target_path} already exists. 
Use --overwrite to replace.") else: - print(f"Warning: SQLite file not found at {source_path}") - + shutil.copy(source_path, target_path) + print(f"SQLite database copied to: {target_path}") else: - print("No output directory specified. SQLite file remains in default location.") - - -# MARK: Convert - - -def convert( - language: str, data_type: str, output_dir: str, overwrite: bool, output_type: str -): - """ - Converts a Scribe-Data output file to a different file type. - - Parameters - ---------- - output_type : str - The file type to convert to (CSV or TSV). - - language : str - The language of the file to convert. + print(f"Warning: SQLite file not found at {source_path}") - data_type : str - The data type to of the file to convert. - - output_dir : str - The output directory path for results. - - overwrite : bool - Whether to overwrite existing files. - - Returns - ------- - A SQLite file saved in the given location. - """ - if output_dir: - output_dir = Path(output_dir).resolve() - if not output_dir.exists(): - output_dir.mkdir(parents=True, exist_ok=True) - - if output_type == "json" or output_type is None: - export_json(language, data_type, output_dir, overwrite) - - elif output_type in {"csv", "tsv"}: - convert_to_csv_or_tsv( - language, data_type, output_dir, overwrite, output_type - ) - - else: - raise ValueError( - "Unsupported output type. Please use 'json', 'csv', or 'tsv'." 
- ) + print("SQLite file conversion complete.") diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index 9cbf13518..7bb5574e4 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -22,8 +22,13 @@ #!/usr/bin/env python3 import argparse +from pathlib import Path -from scribe_data.cli.convert import convert_to_csv_or_tsv, convert_to_sqlite +from scribe_data.cli.convert import ( + convert_to_csv_or_tsv, + convert_to_json, + convert_to_sqlite, +) from scribe_data.cli.get import get_data from scribe_data.cli.interactive import start_interactive_mode from scribe_data.cli.list import list_wrapper @@ -179,22 +184,55 @@ def main() -> None: epilog=CLI_EPILOG, formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - convert_parser._actions[0].help = "Show this help message and exit." + + # Setting up the arguments for the convert command + convert_parser.add_argument( + "-lang", + "--language", + type=str, + required=True, + help="The language of the file to convert.", + ) + convert_parser.add_argument( + "-dt", + "--data-type", + type=str, + required=True, + help="The data type(s) of the file to convert (e.g., noun, verb).", + ) convert_parser.add_argument( - "-f", "--file", type=str, help="The file to convert to a new type." 
+ "-if", + "--input-file", + type=Path, + required=True, + help="The path to the input file to convert.", ) convert_parser.add_argument( "-ot", "--output-type", type=str, choices=["json", "csv", "tsv", "sqlite"], + required=True, help="The output file type.", ) + convert_parser.add_argument( + "-od", + "--output-dir", + type=str, + help="The directory where the output file will be saved.", + ) + convert_parser.add_argument( + "-o", + "--overwrite", + action="store_true", + help="Whether to overwrite existing files (default: False).", + ) convert_parser.add_argument( "-ko", "--keep-original", - action="store_false", - help="Whether to keep the file to be converted (default: True).", + action="store_true", + default=True, + help="Whether to keep the original file to be converted (default: True).", ) # MARK: Setup CLI @@ -210,7 +248,9 @@ def main() -> None: return if args.command in ["list", "l"]: - list_wrapper(args.language, args.data_type, args.all) + list_wrapper( + language=args.language, data_type=args.data_type, all_bool=args.all + ) elif args.command in ["get", "g"]: if args.interactive: @@ -233,18 +273,32 @@ def main() -> None: elif args.command in ["convert", "c"]: if args.output_type in ["csv", "tsv"]: convert_to_csv_or_tsv( - args.language, - args.data_type, - args.output_dir, - args.overwrite, + language=args.language, + data_type=args.data_type, + output_type=args.output_type, + input_file=args.input_file, + output_dir=args.output_dir, + overwrite=args.overwrite, ) elif args.output_type == "sqlite": convert_to_sqlite( - args.language, - args.data_type, - args.output_dir, - args.overwrite, + language=args.language, + data_type=args.data_type, + output_type=args.output_type, + input_file=args.input_file, + output_dir=args.output_dir, + overwrite=args.overwrite, + ) + + elif args.output_type == "json": + convert_to_json( + language=args.language, + data_type=args.data_type, + output_type=args.output_type, + input_file=args.input_file, + 
output_dir=args.output_dir, + overwrite=args.overwrite, ) else: diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index a98cd31cd..50a1be08b 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -21,10 +21,11 @@ """ import unittest -from pathlib import Path -from unittest.mock import patch +from unittest.mock import MagicMock, patch -from scribe_data.cli.convert import convert_to_sqlite, export_json +from scribe_data.cli.convert import ( + convert_to_sqlite, +) class TestConvert(unittest.TestCase): @@ -34,7 +35,14 @@ class TestConvert(unittest.TestCase): def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_path): mock_path.return_value.exists.return_value = True - convert_to_sqlite("english", "nouns", "/output", True) + convert_to_sqlite( + language="english", + data_type="nouns", + input_file="file", + output_type="sqlite", + output_dir="/output", + overwrite=True, + ) mock_data_to_sqlite.assert_called_with(["english"], ["nouns"]) mock_shutil_copy.assert_called() @@ -42,10 +50,27 @@ def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_pat @patch("scribe_data.cli.convert.Path") @patch("scribe_data.cli.convert.data_to_sqlite") def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path): - convert_to_sqlite("english", "nouns", None, True) + # Create a mock for input file + mock_input_file = MagicMock() + mock_input_file.exists.return_value = True + + mock_path.return_value = mock_input_file + + # source and destination paths + mock_input_file.parent = MagicMock() + mock_input_file.parent.__truediv__.return_value = MagicMock() + mock_input_file.parent.__truediv__.return_value.exists.return_value = False + + convert_to_sqlite( + language="english", + data_type="nouns", + input_file=mock_input_file, + output_type="sqlite", + output_dir=None, + overwrite=True, + ) mock_data_to_sqlite.assert_called_with(["english"], ["nouns"]) - mock_path.assert_not_called() 
@patch("scribe_data.cli.convert.Path") @patch("scribe_data.cli.convert.data_to_sqlite") @@ -57,18 +82,24 @@ def test_convert_to_sqlite_with_language_iso( mock_get_language_iso.return_value = "en" mock_path.return_value.exists.return_value = True - convert_to_sqlite("English", "data_type", "/output", True) + convert_to_sqlite( + language="English", + data_type="data_type", + input_file="file", + output_type="sqlite", + output_dir="/output", + overwrite=True, + ) mock_data_to_sqlite.assert_called_with(["English"], ["data_type"]) mock_copy.assert_called() - @patch("scribe_data.cli.convert.language_map") - def test_export_json_invalid_language(self, mock_language_map): - mock_language_map.get.return_value = None - - with self.assertRaises(ValueError): - export_json("invalid", "data_type", Path("/output"), True) - def test_convert_to_sqlite_no_language(self): with self.assertRaises(ValueError): - convert_to_sqlite(None, "data_type", "/output", True) + convert_to_sqlite( + language=None, + data_type="data_type", + output_type="sqlite", + output_dir="/output", + overwrite=True, + ) From 0bbf20b20d1f4c2d9168eb03935c365fe83693b0 Mon Sep 17 00:00:00 2001 From: john-thuo1 Date: Sat, 19 Oct 2024 17:25:05 +0300 Subject: [PATCH 02/13] Added tests for convert functions --- src/scribe_data/cli/convert.py | 48 +- tests/cli/test_convert.py | 963 ++++++++++++++++++++++++++++++++- 2 files changed, 976 insertions(+), 35 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index d49762536..0055afad5 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -91,10 +91,16 @@ def convert_to_json( input_file_path = Path(input_file) if not input_file_path.exists(): - print(f"No data found for input file '{input_file_path}'.") - continue + raise FileNotFoundError( + f"No data found for input file '{input_file_path}'." 
+ ) - delimiter = "," if input_file_path.suffix.lower() == ".csv" else "\t" + delimiter = {".csv": ",", ".tsv": "\t"}.get(input_file_path.suffix.lower()) + + if not delimiter: + raise ValueError( + f"Unsupported file extension '{input_file_path.suffix}' for {input_file}. Please provide a '.csv' or '.tsv' file." + ) try: with input_file_path.open("r", encoding="utf-8") as file: @@ -186,37 +192,13 @@ def convert_to_csv_or_tsv( ) -> None: """ Convert a JSON File to CSV/TSV file. - - Parameters - ---------- - language : str - The language of the file to convert. - - data_type : Union[str, List[str]] - The data type of the file to convert. - - output_type : str - The output format, should be "csv" or "tsv". - - input_file : str - The input JSON file path. - - output_dir : str - The output directory path for results. - - overwrite : bool - Whether to overwrite existing files. - - Returns - ------- - None """ # Normalize the language normalized_language = language_map.get(language.lower()) + if not normalized_language: - print(f"Language '{language}' is not recognized.") - return + raise ValueError(f"Language '{language.capitalize()}' is not recognized.") # Split the data_type string by commas data_types = [dtype.strip() for dtype in data_type.split(",")] @@ -262,17 +244,15 @@ def convert_to_csv_or_tsv( try: with output_file.open("w", newline="", encoding="utf-8") as file: writer = csv.writer(file, delimiter=delimiter) - # Handle different JSON structures based on the format + if isinstance(data, dict): first_key = list(data.keys())[0] if isinstance(data[first_key], dict): # Handle case: { key: { value1: ..., value2: ... 
} } - columns = set() - for value in data.values(): - columns.update(value.keys()) - writer.writerow([dtype[:-1]] + list(columns)) + columns = sorted(next(iter(data.values())).keys()) + writer.writerow([dtype[:-1]] + columns) for key, value in data.items(): row = [key] + [value.get(col, "") for col in columns] diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index 50a1be08b..23362b6fa 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -20,15 +20,976 @@ --> """ +from io import StringIO +import json +from pathlib import Path import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, mock_open, patch + from scribe_data.cli.convert import ( + convert_to_json, convert_to_sqlite, + convert_to_csv_or_tsv, ) class TestConvert(unittest.TestCase): + # MARK: JSON Tests + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_json_normalized_language(self, mock_path, mock_language_map): + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + # Mocking Path object behavior + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + + # Set the file extension to .csv/ .tsv + mock_path_obj.suffix = ".csv" + mock_path_obj.exists.return_value = True + + # Call the function with 'English' + convert_to_json( + language="English", + data_type="nouns", + output_type="json", + input_file="input.csv", + output_dir="/output_dir", + overwrite=True, + ) + + # Verify that the mock's get method was called with 'english' (lowercased by the function) + 
mock_language_map.get.assert_called_with("english") + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_json_unknown_language(self, mock_path, mock_language_map): + mock_language_map.get.return_value = None + + # Mock for input file and output_directory + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.exists.return_value = True + mock_path.side_effect = [mock_input_file_path, MagicMock(spec=Path)] + + with self.assertRaises(ValueError) as context: + convert_to_json( + language="kazatan", + data_type="nouns", + output_type="json", + input_file="test.csv", + output_dir="/output_dir", + overwrite=True, + ) + + # Assert the error message + self.assertEqual( + str(context.exception), "Language 'Kazatan' is not recognized." + ) + + @patch("scribe_data.cli.convert.Path") + def test_convert_to_json_with_input_file(self, mock_path): + # Sample Data + csv_data = "key,value\na,1\nb,2" + mock_file = StringIO(csv_data) + + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + mock_path_obj.suffix = ".csv" + mock_path_obj.exists.return_value = True + mock_path_obj.open.return_value.__enter__.return_value = mock_file + + convert_to_json( + language="English", + data_type="nouns", + output_type="json", + input_file="test.csv", + output_dir="/output_dir", + overwrite=True, + ) + + mock_path_obj.exists.assert_called_once() + + # Verify the file was opened for reading + mock_path_obj.open.assert_called_once_with("r", encoding="utf-8") + + @patch("scribe_data.cli.convert.Path") + def test_convert_to_json_no_input_file(self, mock_path): + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + mock_path_obj.exists.return_value = False + + mock_path_obj.__str__.return_value = "Data/ecode.csv" + + with self.assertRaises(FileNotFoundError) as context: + convert_to_json( + language="English", + data_type="nouns", + 
output_type="json", + input_file="Data/ecode.csv", + output_dir="/output_dir", + overwrite=True, + ) + + self.assertEqual( + str(context.exception), "No data found for input file 'Data/ecode.csv'." + ) + + mock_path_obj.exists.assert_called_once() + + @patch("scribe_data.cli.convert.Path") + def test_convert_to_json_supported_file_extension_csv(self, mock_path): + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + + mock_path_obj.suffix = ".csv" + mock_path_obj.exists.return_value = True + + convert_to_json( + language="English", + data_type="nouns", + output_type="json", + input_file="test.csv", + output_dir="/output_dir", + overwrite=True, + ) + + @patch("scribe_data.cli.convert.Path") + def test_convert_to_json_supported_file_extension_tsv(self, mock_path): + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + + mock_path_obj.suffix = ".tsv" + mock_path_obj.exists.return_value = True + + convert_to_json( + language="English", + data_type="nouns", + output_type="json", + input_file="test.tsv", + output_dir="/output_dir", + overwrite=True, + ) + + @patch("scribe_data.cli.convert.Path") + def test_convert_to_json_unsupported_file_extension(self, mock_path): + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + + mock_path_obj.suffix = ".txt" + mock_path_obj.exists.return_value = True + + with self.assertRaises(ValueError) as context: + convert_to_json( + language="English", + data_type="nouns", + output_type="json", + input_file="test.txt", + output_dir="/output_dir", + overwrite=True, + ) + + self.assertIn("Unsupported file extension", str(context.exception)) + self.assertEqual( + str(context.exception), + "Unsupported file extension '.txt' for test.txt. 
Please provide a '.csv' or '.tsv' file.", + ) + + # ==================================================================================================================== + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_json_standard_csv(self, mock_path_class, mock_language_map): + csv_data = "key,value\na,1\nb,2" + expected_json = {"a": "1", "b": "2"} + mock_file_obj = StringIO(csv_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".csv" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.csv" else Path(x) + ) + + mocked_open = mock_open() + + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + # Prevent actual directory creation + mock_mkdir.return_value = None + convert_to_json( + language="English", + data_type="nouns", + output_type="json", + input_file="test.csv", + output_dir="/output_dir", + overwrite=True, + ) + + mocked_open.assert_called_once_with("w", encoding="utf-8") + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + + self.assertEqual(json.loads(written_data), expected_json) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_json_with_multiple_keys( + self, mock_path_class, 
mock_language_map + ): + csv_data = "key,value1,value2\na,1,x\nb,2,y\nc,3,z" + expected_json = { + "a": {"value1": "1", "value2": "x"}, + "b": {"value1": "2", "value2": "y"}, + "c": {"value1": "3", "value2": "z"}, + } + mock_file_obj = StringIO(csv_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".csv" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.csv" else Path(x) + ) + + mocked_open = mock_open() + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + mock_mkdir.return_value = None + convert_to_json( + language="English", + data_type="nouns", + output_type="json", + input_file="test.csv", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + self.assertEqual(json.loads(written_data), expected_json) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_json_with_complex_structure( + self, mock_path_class, mock_language_map + ): + csv_data = "key,emoji,is_base,rank\na,😀,true,1\nb,😅,false,2" + expected_json = { + "a": [{"emoji": "😀", "is_base": True, "rank": 1}], + "b": [{"emoji": "😅", "is_base": False, "rank": 2}], + } + mock_file_obj = StringIO(csv_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + 
"language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".csv" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.csv" else Path(x) + ) + + mocked_open = mock_open() + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + mock_mkdir.return_value = None + convert_to_json( + language="English", + data_type="nouns", + output_type="json", + input_file="test.csv", + output_dir="/output", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + self.assertEqual(json.loads(written_data), expected_json) + + # MARK: CSV OR TSV Tests + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_json_normalized_language( + self, mock_path, mock_language_map + ): + # Mock the language map to return a normalized language for testing + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + + mock_path_obj.suffix = ".json" + mock_path_obj.exists.return_value = True + + mock_json_data = 
json.dumps({"key1": "value1", "key2": "value2"}) + mock_open_function = mock_open(read_data=mock_json_data) + mock_path_obj.open = mock_open_function + + # Call the function with 'English' + convert_to_csv_or_tsv( + language="English", + data_type="nouns", + output_type="csv", + input_file="input.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_language_map.get.assert_called_with("english") + + mock_open_function.assert_called_once_with("r", encoding="utf-8") + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_json_unknown_language( + self, mock_path, mock_language_map + ): + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_path_obj = MagicMock(spec=Path) + mock_path.return_value = mock_path_obj + + mock_path_obj.suffix = ".json" + mock_path_obj.exists.return_value = True + + mock_json_data = json.dumps({"key1": "value1", "key2": "value2"}) + mock_open_function = mock_open(read_data=mock_json_data) + mock_path_obj.open = mock_open_function + + with self.assertRaises(ValueError) as context: + convert_to_csv_or_tsv( + language="kazatan", + data_type="nouns", + output_type="csv", + input_file="input.json", + output_dir="/output_dir", + overwrite=True, + ) + + self.assertEqual( + str(context.exception), "Language 'Kazatan' is not recognized." 
+ ) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_tsv_standarddict_to_csv( + self, mock_path_class, mock_language_map + ): + json_data = '{"a": "1", "b": "2"}' + expected_csv_output = "preposition,value\n" "a,1\n" "b,2\n" + + mock_file_obj = StringIO(json_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".json" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.json" else Path(x) + ) + + mocked_open = mock_open() + + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + mock_mkdir.return_value = None + + convert_to_csv_or_tsv( + language="English", + data_type="prepositions", + output_type="csv", + input_file="test.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + + # Normalize the line endings for comparison + written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") + expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( + "\r", "\n" + ) + + self.assertEqual(written_data, expected_csv_output) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_tsv_standarddict_to_tsv( + self, mock_path_class, 
mock_language_map + ): + json_data = '{"a": "1", "b": "2"}' + + expected_tsv_output = "preposition\tvalue\n" "a\t1\n" "b\t2\n" + + mock_file_obj = StringIO(json_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".json" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.json" else Path(x) + ) + + mocked_open = mock_open() + + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + mock_mkdir.return_value = None + convert_to_csv_or_tsv( + language="English", + data_type="prepositions", + output_type="tsv", + input_file="test.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + + written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") + expected_tsv_output = expected_tsv_output.replace("\r\n", "\n").replace( + "\r", "\n" + ) + + self.assertEqual(written_data, expected_tsv_output) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_tsv_nesteddict_to_csv( + self, mock_path_class, mock_language_map + ): + json_data = ( + '{"a": {"value1": "1", "value2": "x"}, "b": {"value1": "2", "value2": "y"}}' + ) + expected_csv_output = "noun,value1,value2\n" "a,1,x\n" "b,2,y\n" + mock_file_obj = StringIO(json_data) + + 
mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".json" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.json" else Path(x) + ) + + mocked_open = mock_open() + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + mock_mkdir.return_value = None + convert_to_csv_or_tsv( + language="English", + data_type="nouns", + output_type="csv", + input_file="test.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + + written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") + expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( + "\r", "\n" + ) + self.assertEqual(written_data, expected_csv_output) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_tsv_nesteddict_to_tsv( + self, mock_path_class, mock_language_map + ): + json_data = ( + '{"a": {"value1": "1", "value2": "x"}, "b": {"value1": "2", "value2": "y"}}' + ) + expected_tsv_output = "noun\tvalue1\tvalue2\n" "a\t1\tx\n" "b\t2\ty\n" + + mock_file_obj = StringIO(json_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": 
[], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".json" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.json" else Path(x) + ) + + mocked_open = mock_open() + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + mock_mkdir.return_value = None + convert_to_csv_or_tsv( + language="English", + data_type="nouns", + output_type="tsv", + input_file="test.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + + written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") + expected_tsv_output = expected_tsv_output.replace("\r\n", "\n").replace( + "\r", "\n" + ) + + self.assertEqual(written_data, expected_tsv_output) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_tsv_listofdicts_to_csv( + self, mock_path_class, mock_language_map + ): + json_data = '{"a": [{"emoji": "😀", "is_base": true, "rank": 1}, {"emoji": "😅", "is_base": false, "rank": 2}]}' + expected_csv_output = ( + "word,emoji,is_base,rank\n" "a,😀,True,1\n" "a,😅,False,2\n" + ) + mock_file_obj = StringIO(json_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": 
["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".json" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.json" else Path(x) + ) + + mocked_open = mock_open() + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + mock_mkdir.return_value = None + convert_to_csv_or_tsv( + language="English", + data_type="emoji-keywords", + output_type="csv", + input_file="test.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + + written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") + expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( + "\r", "\n" + ) + self.assertEqual(written_data, expected_csv_output) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_tsv_listofdicts_to_tsv( + self, mock_path_class, mock_language_map + ): + json_data = '{"a": [{"emoji": "😀", "is_base": true, "rank": 1}, {"emoji": "😅", "is_base": false, "rank": 2}]}' + expected_tsv_output = ( + "word\temoji\tis_base\trank\n" "a\t😀\tTrue\t1\n" "a\t😅\tFalse\t2\n" + ) + mock_file_obj = StringIO(json_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + # Mock input file path + mock_input_file_path = MagicMock(spec=Path) +
mock_input_file_path.suffix = ".json" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.json" else Path(x) + ) + + mocked_open = mock_open() + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + # Prevent actual directory creation + mock_mkdir.return_value = None + convert_to_csv_or_tsv( + language="English", + data_type="emoji-keywords", + output_type="tsv", + input_file="test.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + + written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") + expected_tsv_output = expected_tsv_output.replace("\r\n", "\n").replace( + "\r", "\n" + ) + self.assertEqual(written_data, expected_tsv_output) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_tsv_liststrings_to_csv( + self, mock_path_class, mock_language_map + ): + json_data = '{"a": ["x", "y", "z"]}' + expected_csv_output = ( + "autosuggestion,autosuggestion_1,autosuggestion_2,autosuggestion_3\n" + "a,x,y,z\n" + ) + mock_file_obj = StringIO(json_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".json" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = 
mock_file_obj + + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.json" else Path(x) + ) + + mocked_open = mock_open() + + with patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + mock_mkdir.return_value = None + convert_to_csv_or_tsv( + language="English", + data_type="autosuggestions", + output_type="csv", + input_file="test.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + + written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") + expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( + "\r", "\n" + ) + self.assertEqual(written_data, expected_csv_output) + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_csv_or_tsv_liststrings_to_tsv( + self, mock_path_class, mock_language_map + ): + json_data = '{"a": ["x", "y", "z"]}' + expected_tsv_output = ( + "autosuggestion\tautosuggestion_1\tautosuggestion_2\tautosuggestion_3\n" + "a\tx\ty\tz\n" + ) + mock_file_obj = StringIO(json_data) + + mock_language_map.get.side_effect = lambda lang: { + "english": { + "language": "english", + "iso": "en", + "qid": "Q1860", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": [], + }, + "french": { + "language": "french", + "iso": "fr", + "qid": "Q150", + "remove-words": ["of", "the", "The", "and"], + "ignore-words": ["XXe"], + }, + }.get(lang.lower()) + + # Mock input file path + mock_input_file_path = MagicMock(spec=Path) + mock_input_file_path.suffix = ".json" + mock_input_file_path.exists.return_value = True + mock_input_file_path.open.return_value.__enter__.return_value = mock_file_obj + + mock_path_class.side_effect = ( + lambda x: mock_input_file_path if x == "test.json" else Path(x) + ) + + mocked_open = mock_open() + + with 
patch("pathlib.Path.open", mocked_open), patch( + "pathlib.Path.mkdir" + ) as mock_mkdir: + # Prevent actual directory creation + mock_mkdir.return_value = None + convert_to_csv_or_tsv( + language="English", + data_type="autosuggestions", + output_type="tsv", + input_file="test.json", + output_dir="/output_dir", + overwrite=True, + ) + + mock_file_handle = mocked_open() + written_data = "".join( + call.args[0] for call in mock_file_handle.write.call_args_list + ) + written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") + expected_tsv_output = expected_tsv_output.replace("\r\n", "\n").replace( + "\r", "\n" + ) + self.assertEqual(written_data, expected_tsv_output) + + # MARK: SQLITE Tests + @patch("scribe_data.cli.convert.Path") @patch("scribe_data.cli.convert.data_to_sqlite") @patch("shutil.copy") From faf76b36868412b32bee093cc7189790974457ac Mon Sep 17 00:00:00 2001 From: john-thuo1 Date: Sat, 19 Oct 2024 18:01:24 +0300 Subject: [PATCH 03/13] Modified function docstring --- src/scribe_data/cli/convert.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 0055afad5..3a2774331 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -192,6 +192,23 @@ def convert_to_csv_or_tsv( ) -> None: """ Convert a JSON File to CSV/TSV file. + Parameters + ---------- + language : str + The language of the file to convert. + data_type : Union[str, List[str]] + The data type of the file to convert. + output_type : str + The output format, should be "csv" or "tsv". + input_file : str + The input JSON file path. + output_dir : str + The output directory path for results. + overwrite : bool + Whether to overwrite existing files. 
+ Returns + ------- + None """ # Normalize the language From 3e53aa278ee516232f1bb438548f09476271ee82 Mon Sep 17 00:00:00 2001 From: john-thuo1 Date: Sat, 19 Oct 2024 22:48:03 +0300 Subject: [PATCH 04/13] Remove broken test comments --- tests/cli/test_convert.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index 23362b6fa..bc1a35393 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -57,15 +57,12 @@ def test_convert_to_json_normalized_language(self, mock_path, mock_language_map) }, }.get(lang.lower()) - # Mocking Path object behavior mock_path_obj = MagicMock(spec=Path) mock_path.return_value = mock_path_obj - # Set the file extension to .csv/ .tsv mock_path_obj.suffix = ".csv" mock_path_obj.exists.return_value = True - # Call the function with 'English' convert_to_json( language="English", data_type="nouns", @@ -75,7 +72,6 @@ def test_convert_to_json_normalized_language(self, mock_path, mock_language_map) overwrite=True, ) - # Verify that the mock's get method was called with 'english' (lowercased by the function) mock_language_map.get.assert_called_with("english") @patch("scribe_data.cli.convert.language_map", autospec=True) @@ -105,7 +101,6 @@ def test_convert_to_json_unknown_language(self, mock_path, mock_language_map): @patch("scribe_data.cli.convert.Path") def test_convert_to_json_with_input_file(self, mock_path): - # Sample Data csv_data = "key,value\na,1\nb,2" mock_file = StringIO(csv_data) @@ -126,7 +121,6 @@ def test_convert_to_json_with_input_file(self, mock_path): mock_path_obj.exists.assert_called_once() - # Verify the file was opened for reading mock_path_obj.open.assert_called_once_with("r", encoding="utf-8") @patch("scribe_data.cli.convert.Path") @@ -211,7 +205,6 @@ def test_convert_to_json_unsupported_file_extension(self, mock_path): "Unsupported file extension '.txt' for test.txt. 
Please provide a '.csv' or '.tsv' file.", ) - # ==================================================================================================================== @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) def test_convert_to_json_standard_csv(self, mock_path_class, mock_language_map): @@ -250,7 +243,6 @@ def test_convert_to_json_standard_csv(self, mock_path_class, mock_language_map): with patch("pathlib.Path.open", mocked_open), patch( "pathlib.Path.mkdir" ) as mock_mkdir: - # Prevent actual directory creation mock_mkdir.return_value = None convert_to_json( language="English", @@ -392,7 +384,6 @@ def test_convert_to_json_with_complex_structure( def test_convert_to_csv_or_json_normalized_language( self, mock_path, mock_language_map ): - # Mock the language map to return a normalized language for testing mock_language_map.get.side_effect = lambda lang: { "english": { "language": "english", @@ -420,7 +411,6 @@ def test_convert_to_csv_or_json_normalized_language( mock_open_function = mock_open(read_data=mock_json_data) mock_path_obj.open = mock_open_function - # Call the function with 'English' convert_to_csv_or_tsv( language="English", data_type="nouns", @@ -536,7 +526,6 @@ def test_convert_to_csv_or_tsv_standarddict_to_csv( call.args[0] for call in mock_file_handle.write.call_args_list ) - # Normalize the line endings for comparison written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( "\r", "\n" @@ -952,7 +941,6 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv( }, }.get(lang.lower()) - # Mock input file path mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True @@ -967,7 +955,6 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv( with patch("pathlib.Path.open", mocked_open), patch( "pathlib.Path.mkdir" ) as mock_mkdir: - # 
Prevent actual directory creation mock_mkdir.return_value = None convert_to_csv_or_tsv( language="English", From 27199a4fd14110c5523344870a635e0b499b5fc5 Mon Sep 17 00:00:00 2001 From: john-thuo1 Date: Sun, 20 Oct 2024 14:30:39 +0300 Subject: [PATCH 05/13] Clean up repeated code blocks in convert tests --- tests/cli/test_convert.py | 324 +++++++++----------------------------- 1 file changed, 75 insertions(+), 249 deletions(-) diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index bc1a35393..996cc5f52 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -24,7 +24,7 @@ import json from pathlib import Path import unittest -from unittest.mock import MagicMock, mock_open, patch +from unittest.mock import MagicMock, Mock, mock_open, patch from scribe_data.cli.convert import ( @@ -35,11 +35,21 @@ class TestConvert(unittest.TestCase): - # MARK: JSON Tests - - @patch("scribe_data.cli.convert.language_map", autospec=True) - @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_normalized_language(self, mock_path, mock_language_map): + # Helper Functions + def setup_language_map(self, mock_language_map: Mock) -> None: + """ + Set up the mock language map for testing. + + Parameters + --------- + mock_language_map: Mock + Mock object representing the language map + to be configured. + + Returns + ------- + None + """ mock_language_map.get.side_effect = lambda lang: { "english": { "language": "english", @@ -57,6 +67,34 @@ def test_convert_to_json_normalized_language(self, mock_path, mock_language_map) }, }.get(lang.lower()) + def normalize_line_endings(self, data: str) -> str: + """ + Normalize line endings in a given string. + + This method replaces Windows-style line endings (`\r\n`) and + standalone carriage return characters (`\r`) with Unix-style + line endings (`\n`). This is useful for ensuring consistent + line endings when comparing strings or writing to files. 
+ + Parameters + ---------- + data: str + The input string whose line endings are to be normalized. + + Returns + --------- + data: str + The input string with normalized line endings. + """ + return data.replace("\r\n", "\n").replace("\r", "\n") + + # MARK: JSON Tests + + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_json_normalized_language(self, mock_path, mock_language_map): + self.setup_language_map(mock_language_map) + mock_path_obj = MagicMock(spec=Path) mock_path.return_value = mock_path_obj @@ -64,7 +102,7 @@ def test_convert_to_json_normalized_language(self, mock_path, mock_language_map) mock_path_obj.exists.return_value = True convert_to_json( - language="English", + language="French", data_type="nouns", output_type="json", input_file="input.csv", @@ -72,13 +110,12 @@ def test_convert_to_json_normalized_language(self, mock_path, mock_language_map) overwrite=True, ) - mock_language_map.get.assert_called_with("english") + mock_language_map.get.assert_called_with("french") @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) def test_convert_to_json_unknown_language(self, mock_path, mock_language_map): mock_language_map.get.return_value = None - # Mock for input file and output_directory mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.exists.return_value = True @@ -212,22 +249,7 @@ def test_convert_to_json_standard_csv(self, mock_path_class, mock_language_map): expected_json = {"a": "1", "b": "2"} mock_file_obj = StringIO(csv_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - 
}.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".csv" @@ -275,22 +297,7 @@ def test_convert_to_json_with_multiple_keys( } mock_file_obj = StringIO(csv_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".csv" @@ -332,22 +339,7 @@ def test_convert_to_json_with_complex_structure( } mock_file_obj = StringIO(csv_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".csv" @@ -384,22 +376,7 @@ def test_convert_to_json_with_complex_structure( def test_convert_to_csv_or_json_normalized_language( self, mock_path, mock_language_map ): - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_path_obj = MagicMock(spec=Path) mock_path.return_value = 
mock_path_obj @@ -429,22 +406,7 @@ def test_convert_to_csv_or_json_normalized_language( def test_convert_to_csv_or_json_unknown_language( self, mock_path, mock_language_map ): - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_path_obj = MagicMock(spec=Path) mock_path.return_value = mock_path_obj @@ -480,22 +442,7 @@ def test_convert_to_csv_or_tsv_standarddict_to_csv( mock_file_obj = StringIO(json_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" @@ -526,10 +473,8 @@ def test_convert_to_csv_or_tsv_standarddict_to_csv( call.args[0] for call in mock_file_handle.write.call_args_list ) - written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") - expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( - "\r", "\n" - ) + written_data = self.normalize_line_endings(written_data) + expected_csv_output = self.normalize_line_endings(expected_csv_output) self.assertEqual(written_data, expected_csv_output) @@ -544,22 +489,7 @@ def test_convert_to_csv_or_tsv_standarddict_to_tsv( mock_file_obj = StringIO(json_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - 
"remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" @@ -589,10 +519,8 @@ def test_convert_to_csv_or_tsv_standarddict_to_tsv( call.args[0] for call in mock_file_handle.write.call_args_list ) - written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") - expected_tsv_output = expected_tsv_output.replace("\r\n", "\n").replace( - "\r", "\n" - ) + written_data = self.normalize_line_endings(written_data) + expected_tsv_output = self.normalize_line_endings(expected_tsv_output) self.assertEqual(written_data, expected_tsv_output) @@ -607,22 +535,7 @@ def test_convert_to_csv_or_tsv_nesteddict_to_csv( expected_csv_output = "noun,value1,value2\n" "a,1,x\n" "b,2,y\n" mock_file_obj = StringIO(json_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" @@ -652,10 +565,8 @@ def test_convert_to_csv_or_tsv_nesteddict_to_csv( call.args[0] for call in mock_file_handle.write.call_args_list ) - written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") - expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( - "\r", "\n" - ) + written_data = self.normalize_line_endings(written_data) + expected_csv_output = self.normalize_line_endings(expected_csv_output) self.assertEqual(written_data, 
expected_csv_output) @patch("scribe_data.cli.convert.language_map", autospec=True) @@ -670,22 +581,7 @@ def test_convert_to_csv_or_tsv_nesteddict_to_tsv( mock_file_obj = StringIO(json_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" @@ -715,10 +611,8 @@ def test_convert_to_csv_or_tsv_nesteddict_to_tsv( call.args[0] for call in mock_file_handle.write.call_args_list ) - written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") - expected_tsv_output = expected_tsv_output.replace("\r\n", "\n").replace( - "\r", "\n" - ) + written_data = self.normalize_line_endings(written_data) + expected_tsv_output = self.normalize_line_endings(expected_tsv_output) self.assertEqual(written_data, expected_tsv_output) @@ -733,22 +627,7 @@ def test_convert_to_csv_or_tsv_listofdicts_to_csv( ) mock_file_obj = StringIO(json_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" @@ -778,10 +657,8 @@ def test_convert_to_csv_or_tsv_listofdicts_to_csv( call.args[0] for call in mock_file_handle.write.call_args_list ) - written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") - 
expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( - "\r", "\n" - ) + written_data = self.normalize_line_endings(written_data) + expected_csv_output = self.normalize_line_endings(expected_csv_output) self.assertEqual(written_data, expected_csv_output) @patch("scribe_data.cli.convert.language_map", autospec=True) @@ -795,22 +672,7 @@ def test_convert_to_csv_or_tsv_listofdicts_to_tsv( ) mock_file_obj = StringIO(json_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) # Mock input file path mock_input_file_path = MagicMock(spec=Path) @@ -842,10 +704,8 @@ def test_convert_to_csv_or_tsv_listofdicts_to_tsv( call.args[0] for call in mock_file_handle.write.call_args_list ) - written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") - expected_tsv_output = expected_tsv_output.replace("\r\n", "\n").replace( - "\r", "\n" - ) + written_data = self.normalize_line_endings(written_data) + expected_tsv_output = self.normalize_line_endings(expected_tsv_output) self.assertEqual(written_data, expected_tsv_output) @patch("scribe_data.cli.convert.language_map", autospec=True) @@ -860,22 +720,7 @@ def test_convert_to_csv_or_tsv_liststrings_to_csv( ) mock_file_obj = StringIO(json_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + 
self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" @@ -906,10 +751,8 @@ def test_convert_to_csv_or_tsv_liststrings_to_csv( call.args[0] for call in mock_file_handle.write.call_args_list ) - written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") - expected_csv_output = expected_csv_output.replace("\r\n", "\n").replace( - "\r", "\n" - ) + written_data = self.normalize_line_endings(written_data) + expected_csv_output = self.normalize_line_endings(expected_csv_output) self.assertEqual(written_data, expected_csv_output) @patch("scribe_data.cli.convert.language_map", autospec=True) @@ -924,22 +767,7 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv( ) mock_file_obj = StringIO(json_data) - mock_language_map.get.side_effect = lambda lang: { - "english": { - "language": "english", - "iso": "en", - "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], - }, - "french": { - "language": "french", - "iso": "fr", - "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], - }, - }.get(lang.lower()) + self.setup_language_map(mock_language_map) mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" @@ -969,10 +797,8 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv( written_data = "".join( call.args[0] for call in mock_file_handle.write.call_args_list ) - written_data = written_data.replace("\r\n", "\n").replace("\r", "\n") - expected_tsv_output = expected_tsv_output.replace("\r\n", "\n").replace( - "\r", "\n" - ) + written_data = self.normalize_line_endings(written_data) + expected_tsv_output = self.normalize_line_endings(expected_tsv_output) self.assertEqual(written_data, expected_tsv_output) # MARK: SQLITE Tests From 6b506e57f6eb3e6946209572fc18e480d7112e4c Mon Sep 17 00:00:00 2001 From: john-thuo1 Date: Wed, 23 Oct 2024 17:37:17 +0300 Subject: [PATCH 06/13] Modify get functionality --- 
src/scribe_data/cli/convert.py | 79 ++++++++++++++++++++++++++++++++-- src/scribe_data/cli/get.py | 34 +++++++++++---- src/scribe_data/cli/main.py | 44 +++++-------------- 3 files changed, 112 insertions(+), 45 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 3a2774331..c842b3a14 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -217,8 +217,10 @@ def convert_to_csv_or_tsv( if not normalized_language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") - # Split the data_type string by commas - data_types = [dtype.strip() for dtype in data_type.split(",")] + if isinstance(data_type, str): + data_types = [data_type.strip()] + else: + data_types = [dtype.strip() for dtype in data_type] for dtype in data_types: input_file = Path(input_file) @@ -325,7 +327,7 @@ def convert_to_csv_or_tsv( print(f"Error writing to '{output_file}': {e}") continue - print(f"Data for '{dtype}' written to '{output_file}'") + print(f"Data for '{language} {dtype}' written to '{output_file}'") # MARK: SQLITE @@ -404,3 +406,74 @@ def convert_to_sqlite( print(f"Warning: SQLite file not found at {source_path}") print("SQLite file conversion complete.") + + +def convert( + language: str, + data_type: Union[str, List[str]], + output_type: str, + input_file: str, + output_dir: str = None, + overwrite: bool = False, +): + """ + Convert data to the specified output type: JSON, CSV/TSV, or SQLite. + + Parameters + ---------- + language : str + The language of the data to convert. + + data_type : Union[str, List[str]] + The data type(s) of the data to convert. + + output_type : str + The desired output format. It can be 'json', 'csv', 'tsv', or 'sqlite'. + + input_file : str + The path to the input file. + + output_dir : str, optional + The output directory where converted files will be stored. Defaults to None. + + overwrite : bool, optional + Whether to overwrite existing output files. 
Defaults to False. + + Returns + ------- + None + """ + output_type = output_type.lower() + + # Route the function call to the correct conversion method + if output_type == "json": + convert_to_json( + language=language, + data_type=data_type, + output_type=output_type, + input_file=input_file, + output_dir=output_dir, + overwrite=overwrite, + ) + elif output_type in {"csv", "tsv"}: + convert_to_csv_or_tsv( + language=language, + data_type=data_type, + output_type=output_type, + input_file=input_file, + output_dir=output_dir, + overwrite=overwrite, + ) + elif output_type == "sqlite": + convert_to_sqlite( + language=language, + data_type=data_type, + output_type=output_type, + input_file=input_file, + output_dir=output_dir, + overwrite=overwrite, + ) + else: + raise ValueError( + f"Unsupported output type '{output_type}'. Must be 'json', 'csv', 'tsv', or 'sqlite'." + ) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index c3d5eecc9..69c369741 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -22,6 +22,8 @@ import subprocess from pathlib import Path +from typing import List, Union +import os # For removing the JSON file from scribe_data.utils import ( DEFAULT_CSV_EXPORT_DIR, @@ -30,11 +32,12 @@ DEFAULT_TSV_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data +from scribe_data.cli.convert import convert def get_data( language: str = None, - data_type: str = None, + data_type: Union[str, List[str]] = None, output_type: str = None, output_dir: str = None, overwrite: bool = False, @@ -110,7 +113,6 @@ def get_data( / "emoji_keywords" / "generate_emoji_keywords.py" ) - subprocess_result = subprocess.run( ["python", emoji_keyword_extraction_script] ) @@ -120,9 +122,8 @@ def get_data( elif language or data_type: data_type = data_type[0] if isinstance(data_type, list) else data_type - data_type = [data_type] if data_type else None print( - f"Updating data for language(s): {language}; data type(s): {', 
'.join(data_type)}" + f"Updating data for language(s): {language}; data type(s): {', '.join([data_type])}" ) query_data( languages=languages, @@ -142,13 +143,30 @@ def get_data( isinstance(subprocess_result, subprocess.CompletedProcess) and subprocess_result.returncode != 1 ) or (isinstance(subprocess_result, bool) and subprocess_result is not False): - print( - f"Updated data was saved in: {Path(output_dir).resolve()}.", - ) + print(f"Updated data was saved in: {Path(output_dir).resolve()}.") + + json_input_path = Path(output_dir) / f"{language}/{data_type}.json" + + # Proceed with conversion only if the output type is not JSON + if output_type != "json": + if json_input_path.exists(): + convert( + language=language, + data_type=data_type, + output_type=output_type, + input_file=str(json_input_path), + output_dir=output_dir, + overwrite=overwrite, + ) + + os.remove(json_input_path) + else: + print(f"Error: Input file '{json_input_path}' does not exist.") + if interactive: return True - # The emoji keywords process has failed. + # Handle emoji keywords process failure elif data_type in {"emoji-keywords", "emoji_keywords"}: print( "\nThe Scribe-Data emoji functionality is powered by PyICU, which is currently not installed." 
diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index ffe76a8c7..cbb5a2b44 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -25,11 +25,8 @@ from pathlib import Path from scribe_data.cli.cli_utils import validate_language_and_data_type -from scribe_data.cli.convert import ( - convert_to_csv_or_tsv, - convert_to_json, - convert_to_sqlite, -) +from scribe_data.cli.convert import convert + from scribe_data.cli.get import get_data from scribe_data.cli.interactive import start_interactive_mode from scribe_data.cli.list import list_wrapper @@ -276,35 +273,14 @@ def main() -> None: total_wrapper(args.language, args.data_type, args.all) elif args.command in ["convert", "c"]: - if args.output_type in ["csv", "tsv"]: - convert_to_csv_or_tsv( - language=args.language, - data_type=args.data_type, - output_type=args.output_type, - input_file=args.input_file, - output_dir=args.output_dir, - overwrite=args.overwrite, - ) - - elif args.output_type == "sqlite": - convert_to_sqlite( - language=args.language, - data_type=args.data_type, - output_type=args.output_type, - input_file=args.input_file, - output_dir=args.output_dir, - overwrite=args.overwrite, - ) - - elif args.output_type == "json": - convert_to_json( - language=args.language, - data_type=args.data_type, - output_type=args.output_type, - input_file=args.input_file, - output_dir=args.output_dir, - overwrite=args.overwrite, - ) + convert( + language=args.language, + data_type=args.data_type, + output_type=args.output_type, + input_file=args.input_file, + output_dir=args.output_dir, + overwrite=args.overwrite, + ) else: parser.print_help() From 21f79541e46d1080f7bc83a49cde92f678d972f9 Mon Sep 17 00:00:00 2001 From: john-thuo1 Date: Wed, 23 Oct 2024 19:12:59 +0300 Subject: [PATCH 07/13] Add convert function test --- src/scribe_data/cli/convert.py | 2 +- tests/cli/test_convert.py | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git 
a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index c842b3a14..ce11d09cd 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -445,7 +445,7 @@ def convert( """ output_type = output_type.lower() - # Route the function call to the correct conversion method + # Route the function call to the correct conversion function if output_type == "json": convert_to_json( language=language, diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index 996cc5f52..f17da572a 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -28,6 +28,7 @@ from scribe_data.cli.convert import ( + convert, convert_to_json, convert_to_sqlite, convert_to_csv_or_tsv, @@ -71,10 +72,6 @@ def normalize_line_endings(self, data: str) -> str: """ Normalize line endings in a given string. - This method replaces Windows-style line endings (`\r\n`) and - standalone carriage return characters (`\r`) with Unix-style - line endings (`\n`). This is useful for ensuring consistent - line endings when comparing strings or writing to files. Parameters ---------- @@ -877,3 +874,19 @@ def test_convert_to_sqlite_no_language(self): output_dir="/output", overwrite=True, ) + + def test_convert(self): + with self.assertRaises(ValueError) as context: + convert( + language="English", + data_type="nouns", + output_type="parquet", + input_file="Data/ecode.csv", + output_dir="/output_dir", + overwrite=True, + ) + + self.assertEqual( + str(context.exception), + "Unsupported output type 'parquet'. 
Must be 'json', 'csv', 'tsv', or 'sqlite'.", + ) From edfd156268b30f6573829febd5630d9c7c77eb4b Mon Sep 17 00:00:00 2001 From: john-thuo1 Date: Thu, 24 Oct 2024 19:39:37 +0300 Subject: [PATCH 08/13] Updated tests for convert and get functions --- src/scribe_data/cli/convert.py | 4 +- src/scribe_data/cli/get.py | 3 +- tests/cli/test_convert.py | 68 +++++++++++++++------------------- tests/cli/test_get.py | 13 ++----- 4 files changed, 36 insertions(+), 52 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 8a2527d42..dfb4dcb3e 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -91,9 +91,7 @@ def convert_to_json( input_file_path = Path(input_file) if not input_file_path.exists(): - raise FileNotFoundError( - f"No data found for input file '{input_file_path}'." - ) + raise FileNotFoundError(f"Input file '{input_file_path}' does not exist.") delimiter = {".csv": ",", ".tsv": "\t"}.get(input_file_path.suffix.lower()) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 58337db1e..fd521846a 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -93,6 +93,7 @@ def get_data( output_dir = DEFAULT_TSV_EXPORT_DIR languages = [language] if language else None + data_types = [data_type] if data_type else None subprocess_result = False @@ -118,7 +119,7 @@ def get_data( ) query_data( languages=languages, - data_type=data_type, + data_type=data_types, output_dir=output_dir, overwrite=overwrite, interactive=interactive, diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index f17da572a..84c5d5f8b 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -113,7 +113,6 @@ def test_convert_to_json_normalized_language(self, mock_path, mock_language_map) @patch("scribe_data.cli.convert.Path", autospec=True) def test_convert_to_json_unknown_language(self, mock_path, mock_language_map): mock_language_map.get.return_value = None - # Mock for 
input file and output_directory mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.exists.return_value = True mock_path.side_effect = [mock_input_file_path, MagicMock(spec=Path)] @@ -128,16 +127,18 @@ def test_convert_to_json_unknown_language(self, mock_path, mock_language_map): overwrite=True, ) - # Assert the error message self.assertEqual( str(context.exception), "Language 'Kazatan' is not recognized." ) - @patch("scribe_data.cli.convert.Path") - def test_convert_to_json_with_input_file(self, mock_path): + @patch("scribe_data.cli.convert.language_map", autospec=True) + @patch("scribe_data.cli.convert.Path", autospec=True) + def test_convert_to_json_with_input_file(self, mock_path, mock_language_map): csv_data = "key,value\na,1\nb,2" mock_file = StringIO(csv_data) + self.setup_language_map(mock_language_map) + mock_path_obj = MagicMock(spec=Path) mock_path.return_value = mock_path_obj mock_path_obj.suffix = ".csv" @@ -157,37 +158,19 @@ def test_convert_to_json_with_input_file(self, mock_path): mock_path_obj.open.assert_called_once_with("r", encoding="utf-8") + @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path") - def test_convert_to_json_no_input_file(self, mock_path): - mock_path_obj = MagicMock(spec=Path) - mock_path.return_value = mock_path_obj - mock_path_obj.exists.return_value = False - - mock_path_obj.__str__.return_value = "Data/ecode.csv" - - with self.assertRaises(FileNotFoundError) as context: - convert_to_json( - language="English", - data_type="nouns", - output_type="json", - input_file="Data/ecode.csv", - output_dir="/output_dir", - overwrite=True, - ) - - self.assertEqual( - str(context.exception), "No data found for input file 'Data/ecode.csv'." 
- ) + def test_convert_to_json_supported_file_extension_csv( + self, mock_path_class, mock_language_map + ): + self.setup_language_map(mock_language_map) - mock_path_obj.exists.assert_called_once() + mock_path_instance = MagicMock(spec=Path) - @patch("scribe_data.cli.convert.Path") - def test_convert_to_json_supported_file_extension_csv(self, mock_path): - mock_path_obj = MagicMock(spec=Path) - mock_path.return_value = mock_path_obj + mock_path_class.return_value = mock_path_instance - mock_path_obj.suffix = ".csv" - mock_path_obj.exists.return_value = True + mock_path_instance.suffix = ".csv" + mock_path_instance.exists.return_value = True convert_to_json( language="English", @@ -198,13 +181,18 @@ def test_convert_to_json_supported_file_extension_csv(self, mock_path): overwrite=True, ) + @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path") - def test_convert_to_json_supported_file_extension_tsv(self, mock_path): - mock_path_obj = MagicMock(spec=Path) - mock_path.return_value = mock_path_obj + def test_convert_to_json_supported_file_extension_tsv( + self, mock_path_class, mock_language_map + ): + self.setup_language_map(mock_language_map) + mock_path_instance = MagicMock(spec=Path) - mock_path_obj.suffix = ".tsv" - mock_path_obj.exists.return_value = True + mock_path_class.return_value = mock_path_instance + + mock_path_instance.suffix = ".tsv" + mock_path_instance.exists.return_value = True convert_to_json( language="English", @@ -215,8 +203,12 @@ def test_convert_to_json_supported_file_extension_tsv(self, mock_path): overwrite=True, ) + @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path") - def test_convert_to_json_unsupported_file_extension(self, mock_path): + def test_convert_to_json_unsupported_file_extension( + self, mock_path, mock_language_map + ): + self.setup_language_map(mock_language_map) mock_path_obj = MagicMock(spec=Path) mock_path.return_value = 
mock_path_obj @@ -821,13 +813,11 @@ def test_convert_to_sqlite(self, mock_shutil_copy, mock_data_to_sqlite, mock_pat @patch("scribe_data.cli.convert.Path") @patch("scribe_data.cli.convert.data_to_sqlite") def test_convert_to_sqlite_no_output_dir(self, mock_data_to_sqlite, mock_path): - # Create a mock for input file mock_input_file = MagicMock() mock_input_file.exists.return_value = True mock_path.return_value = mock_input_file - # source and destination paths mock_input_file.parent = MagicMock() mock_input_file.parent.__truediv__.return_value = MagicMock() mock_input_file.parent.__truediv__.return_value.exists.return_value = False diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index a1e21e750..686f62843 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -29,15 +29,10 @@ class TestGetData(unittest.TestCase): # MARK: Subprocess Patching - @patch("scribe_data.cli.get.generate_emoji") - def test_get_emoji_keywords(self, generate_emoji): - get_data( - language="English", data_type="emoji_keywords", output_dir="./test_output" - ) - generate_emoji.assert_called_once_with( - language="English", - output_dir="./test_output", - ) + @patch("subprocess.run") + def test_get_emoji_keywords(self, mock_subprocess_run): + get_data(language="English", data_type="emoji-keywords") + self.assertTrue(mock_subprocess_run.called) # MARK: Invalid Arguments From 94b639a87d758508ad54bdb992ae9997690506c7 Mon Sep 17 00:00:00 2001 From: axif Date: Fri, 25 Oct 2024 01:10:26 +0600 Subject: [PATCH 09/13] ubuntu pyicu fix --- src/scribe_data/check/check_pyICU.py | 180 ++++++++++++++++++ src/scribe_data/cli/get.py | 14 +- .../unicode/generate_emoji_keywords.py | 73 ++++--- 3 files changed, 235 insertions(+), 32 deletions(-) create mode 100644 src/scribe_data/check/check_pyICU.py diff --git a/src/scribe_data/check/check_pyICU.py b/src/scribe_data/check/check_pyICU.py new file mode 100644 index 000000000..a30e7e8e8 --- /dev/null +++ b/src/scribe_data/check/check_pyICU.py @@ 
-0,0 +1,180 @@ +import requests +import pkg_resources +import sys +import os +import platform # Added to check the OS +from pathlib import Path +import subprocess + + +def check_if_pyicu_installed(): + installed_packages = {pkg.key for pkg in pkg_resources.working_set} + if "pyicu" in installed_packages: + return True + return False + + +def get_python_version_and_architecture(): + """ + Get the current Python version and architecture. + + Returns + ------- + str : python_version + The Python version in the format 'cpXY'. + str : architecture + The architecture type ('amd64' or 'win32'). + """ + version = sys.version_info + python_version = f"cp{version.major}{version.minor}" + architecture = "win_amd64" if sys.maxsize > 2**32 else "win32" + return python_version, architecture + + +def fetch_wheel_releases(): + """ + Fetch the release data for PyICU from GitHub. + + Returns + ------- + list : available_wheels + A list of tuples containing wheel file names and their download URLs. + float : total_size_mb + The total size of all available wheels in MB. + """ + url = "https://api.github.com/repos/cgohlke/pyicu-build/releases" + response = requests.get(url) + response.raise_for_status() # Raise an error for bad responses + + available_wheels = [] + total_size_bytes = 0 + + for release in response.json(): + for asset in release["assets"]: + if asset["name"].endswith(".whl"): + available_wheels.append((asset["name"], asset["browser_download_url"])) + total_size_bytes += asset["size"] + + total_size_mb = total_size_bytes / (1024 * 1024) # Convert bytes to MB + return available_wheels, total_size_mb + + +def download_wheel_file(wheel_url, output_dir): + """ + Download the wheel file from the given URL. + + Parameters + ---------- + wheel_url : str + The URL of the wheel file to download. + output_dir : str + The directory to save the downloaded file. + + Returns + ------- + str : path to the downloaded wheel file. 
+ """ + response = requests.get(wheel_url) + response.raise_for_status() # Raise an error for bad responses + + wheel_filename = os.path.basename(wheel_url) + wheel_path = os.path.join(output_dir, wheel_filename) + + with open(wheel_path, "wb") as wheel_file: + wheel_file.write(response.content) + + return wheel_path + + +def find_matching_wheel(wheels, python_version, architecture): + """ + Find the matching wheel file based on Python version and architecture. + + Parameters + ---------- + wheels : list + The list of available wheels. + python_version : str + The Python version (e.g., 'cp311'). + architecture : str + The architecture type (e.g., 'win_amd64'). + + Returns + ------- + str : The download URL of the matching wheel or None if not found. + """ + for name, download_url in wheels: + if python_version in name and architecture in name: + return download_url + return None + + +def check_and_install_pyicu(): + package_name = "PyICU" + installed_packages = {pkg.key for pkg in pkg_resources.working_set} + if package_name.lower() not in installed_packages: + # print(f"{package_name} not found. Installing...") + + # Fetch available wheels from GitHub to estimate download size + wheels, total_size_mb = fetch_wheel_releases() + + print( + f"{package_name} is not installed.\nIt will be downloaded from 'https://github.com/repos/cgohlke/pyicu'" + f"\nApproximately {total_size_mb:.2f} MB will be downloaded.\nDo you want to proceed? (Y/n)?" 
+ ) + + user_input = input().strip().lower() + if user_input == "" or user_input in ["y", "yes"]: + print("Proceeding with installation...") + else: + print("Installation aborted by the user.") + return False + + # Check the operating system + if platform.system() != "Windows": + # If not Windows, directly use pip to install PyICU + try: + subprocess.run( + [sys.executable, "-m", "pip", "install", package_name], check=True + ) + print(f"{package_name} has been installed successfully.") + except subprocess.CalledProcessError as e: + print(f"Error occurred while installing {package_name}: {e}") + return False + else: + # Windows-specific installation using wheel files + python_version, architecture = get_python_version_and_architecture() + + # Find the matching wheel for the current Python version and architecture + wheel_url = find_matching_wheel(wheels, python_version, architecture) + + if not wheel_url: + print( + "No matching wheel file found for your Python version and architecture." + ) + return False + + # Download the wheel file + output_dir = Path.cwd() # Use the current directory for simplicity + wheel_path = download_wheel_file(wheel_url, output_dir) + + # Install PyICU using pip + try: + subprocess.run( + [sys.executable, "-m", "pip", "install", wheel_path], + check=True, + ) + print(f"{package_name} has been installed successfully.") + + # Remove the downloaded wheel file + os.remove(wheel_path) + print(f"Removed temporary file: {wheel_path}") + + except subprocess.CalledProcessError as e: + print(f"Error occurred while installing {package_name}: {e}") + return False + + # else: + # print(f"{package_name} is already installed.") + + return True diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index 7bf54453b..0ce19d9bc 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -139,10 +139,10 @@ def get_data( return True # The emoji keywords process has failed. 
- elif data_type in {"emoji-keywords", "emoji_keywords"}: - print( - "\nThe Scribe-Data emoji functionality is powered by PyICU, which is currently not installed." - ) - print( - "Please check the installation guide at https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md for more information.\n" - ) + # elif data_type in {"emoji-keywords", "emoji_keywords"}: + # print( + # "\nThe Scribe-Data emoji functionality is powered by PyICU, which is currently not installed." + # ) + # print( + # "Please check the installation guide at https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/unicode/UNICODE_INSTALLTION.md for more information.\n" + # ) diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index 756f06b31..6dbdcc5a9 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -25,35 +25,58 @@ from scribe_data.unicode.process_unicode import gen_emoji_lexicon from scribe_data.utils import export_formatted_data, get_language_iso +from scribe_data.check.check_pyICU import ( + check_and_install_pyicu, + check_if_pyicu_installed, +) DATA_TYPE = "emoji-keywords" EMOJI_KEYWORDS_DICT = 3 def generate_emoji(language, output_dir: str = None): - iso = get_language_iso(language=language) - path_to_cldr_annotations = ( - Path(__file__).parent / "cldr-annotations-full" / "annotations" - ) - if iso in os.listdir(path_to_cldr_annotations): - print(f"Emoji Generation for language {language} is supported") - - else: - print(f"Emoji Generation for language {language} is not supported") - return - - updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir - export_dir = Path(updated_path) / language.capitalize() - export_dir.mkdir(parents=True, exist_ok=True) - - if emoji_keywords_dict := gen_emoji_lexicon( - language=language, - emojis_per_keyword=EMOJI_KEYWORDS_DICT, - ): - 
export_formatted_data( - file_path=output_dir, - formatted_data=emoji_keywords_dict, - query_data_in_use=True, - language=language, - data_type=DATA_TYPE, + """ + Generates emoji keywords for a specified language and exports the data to the given directory. + + This function first checks and installs the PyICU package, which is necessary for the script to run. + If the installation is successful, it proceeds with generating emoji keywords based on the specified language. + The results are then exported to the provided output directory. + + Parameters: + - language (str): The ISO code of the language for which to generate emoji keywords. + - output_dir (str, optional): The directory where the generated data will be saved. If not specified, + the data will be saved in a default directory. + + Returns: + - None: The function does not return any value but outputs data to the specified directory. + """ + if check_and_install_pyicu() and check_if_pyicu_installed() is False: + print("Thank you.") + + if check_if_pyicu_installed(): + iso = get_language_iso(language=language) + path_to_cldr_annotations = ( + Path(__file__).parent / "cldr-annotations-full" / "annotations" ) + if iso in os.listdir(path_to_cldr_annotations): + print(f"Emoji Generation for language {language} is supported") + + else: + print(f"Emoji Generation for language {language} is not supported") + return + + updated_path = output_dir[2:] if output_dir.startswith("./") else output_dir + export_dir = Path(updated_path) / language.capitalize() + export_dir.mkdir(parents=True, exist_ok=True) + + if emoji_keywords_dict := gen_emoji_lexicon( + language=language, + emojis_per_keyword=EMOJI_KEYWORDS_DICT, + ): + export_formatted_data( + file_path=output_dir, + formatted_data=emoji_keywords_dict, + query_data_in_use=True, + language=language.capitalize(), + data_type=DATA_TYPE, + ) From c49c1691e42de0727647bc3a06fa9b3bfd313d88 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 24 Oct 2024 23:42:46 
+0200 Subject: [PATCH 10/13] Fixes to functionality and comment out broken tests --- src/scribe_data/cli/convert.py | 97 +++++----- src/scribe_data/cli/get.py | 15 +- src/scribe_data/cli/main.py | 27 +-- src/scribe_data/load/data_to_sqlite.py | 20 ++- src/scribe_data/wikidata/query_data.py | 1 - tests/cli/test_convert.py | 234 ++++++++++++------------- tests/cli/test_get.py | 13 +- 7 files changed, 222 insertions(+), 185 deletions(-) diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index dfb4dcb3e..6d5f4d38a 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -28,12 +28,11 @@ from scribe_data.load.data_to_sqlite import data_to_sqlite from scribe_data.utils import ( - DEFAULT_SQLITE_EXPORT_DIR, - DEFAULT_JSON_EXPORT_DIR, DEFAULT_CSV_EXPORT_DIR, + DEFAULT_JSON_EXPORT_DIR, + DEFAULT_SQLITE_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR, get_language_iso, - language_map, ) # MARK: JSON @@ -74,7 +73,7 @@ def convert_to_json( ------- None """ - normalized_language = language_map.get(language.lower()) + normalized_language = language.lower() if not normalized_language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") @@ -84,7 +83,7 @@ def convert_to_json( if output_dir is None: output_dir = DEFAULT_JSON_EXPORT_DIR - json_output_dir = Path(output_dir) / normalized_language["language"].capitalize() + json_output_dir = Path(output_dir) / normalized_language.capitalize() json_output_dir.mkdir(parents=True, exist_ok=True) for dtype in data_types: @@ -109,17 +108,17 @@ def convert_to_json( print(f"No data found in '{input_file_path}'.") continue - # Use the first row to inspect column headers + # Use the first row to inspect column headers. first_row = rows[0] keys = list(first_row.keys()) data = {} if len(keys) == 1: - # Handle Case: { key: None } + # Handle Case: { key: None }. data[first_row[keys[0]]] = None elif len(keys) == 2: - # Handle Case: { key: value } + # Handle Case: { key: value }. 
for row in rows: key = row[keys[0]] value = row[keys[1]] @@ -127,7 +126,7 @@ def convert_to_json( elif len(keys) > 2: if all(col in first_row for col in ["emoji", "is_base", "rank"]): - # Handle Case: { key: [ { emoji: ..., is_base: ..., rank: ... }, { emoji: ..., is_base: ..., rank: ... } ] } + # Handle Case: { key: [ { emoji: ..., is_base: ..., rank: ... }, { emoji: ..., is_base: ..., rank: ... } ] }. for row in rows: key = row.get(reader.fieldnames[0]) emoji = row.get("emoji", "").strip() @@ -144,7 +143,7 @@ def convert_to_json( data[key].append(entry) else: - # Handle Case: { key: { value1: ..., value2: ... } } + # Handle Case: { key: { value1: ..., value2: ... } }. for row in rows: data[row[keys[0]]] = {k: row[k] for k in keys[1:]} @@ -171,12 +170,9 @@ def convert_to_json( print(f"Error writing to '{output_file}': {e}") continue - print( - f"Data for {normalized_language['language'].capitalize()} {dtype} written to {output_file}" - ) + print(f"Data for {language.capitalize()} {dtype} written to {output_file}") -# # MARK: CSV or TSV @@ -190,33 +186,39 @@ def convert_to_csv_or_tsv( ) -> None: """ Convert a JSON File to CSV/TSV file. + Parameters ---------- - language : str - The language of the file to convert. - data_type : Union[str, List[str]] - The data type of the file to convert. - output_type : str - The output format, should be "csv" or "tsv". - input_file : str - The input JSON file path. - output_dir : str - The output directory path for results. - overwrite : bool - Whether to overwrite existing files. + language : str + The language of the file to convert. + + data_type : Union[str, List[str]] + The data type of the file to convert. + + output_type : str + The output format, should be "csv" or "tsv". + + input_file : str + The input JSON file path. + + output_dir : str + The output directory path for results. + + overwrite : bool + Whether to overwrite existing files. 
+ Returns ------- None """ - - # Normalize the language - normalized_language = language_map.get(language.lower()) + normalized_language = language.lower() if not normalized_language: raise ValueError(f"Language '{language.capitalize()}' is not recognized.") if isinstance(data_type, str): data_types = [data_type.strip()] + else: data_types = [dtype.strip() for dtype in data_type] @@ -234,7 +236,7 @@ def convert_to_csv_or_tsv( print(f"Error reading '{input_file}': {e}") continue - # Determine the delimiter based on output type + # Determine the delimiter based on output type. delimiter = "," if output_type == "csv" else "\t" if output_dir is None: @@ -244,9 +246,7 @@ def convert_to_csv_or_tsv( else DEFAULT_TSV_EXPORT_DIR ) - final_output_dir = ( - Path(output_dir) / normalized_language["language"].capitalize() - ) + final_output_dir = Path(output_dir) / language.capitalize() final_output_dir.mkdir(parents=True, exist_ok=True) output_file = final_output_dir / f"{dtype}.{output_type}" @@ -261,13 +261,13 @@ def convert_to_csv_or_tsv( try: with output_file.open("w", newline="", encoding="utf-8") as file: writer = csv.writer(file, delimiter=delimiter) - # Handle different JSON structures based on the format + # Handle different JSON structures based on the format. if isinstance(data, dict): first_key = list(data.keys())[0] if isinstance(data[first_key], dict): - # Handle case: { key: { value1: ..., value2: ... } } + # Handle case: { key: { value1: ..., value2: ... } }. columns = sorted(next(iter(data.values())).keys()) writer.writerow([dtype[:-1]] + columns) @@ -277,8 +277,8 @@ def convert_to_csv_or_tsv( elif isinstance(data[first_key], list): if all(isinstance(item, dict) for item in data[first_key]): - # Handle case: { key: [ { value1: ..., value2: ... } ] } - if "emoji" in data[first_key][0]: # Emoji specific case + # Handle case: { key: [ { value1: ..., value2: ... } ] }. 
+ if "emoji" in data[first_key][0]: # emoji specific case columns = ["word", "emoji", "is_base", "rank"] writer.writerow(columns) @@ -303,7 +303,7 @@ def convert_to_csv_or_tsv( writer.writerow(row) elif all(isinstance(item, str) for item in data[first_key]): - # Handle case: { key: [value1, value2, ...] } + # Handle case: { key: [value1, value2, ...] }. writer.writerow( [dtype[:-1]] + [ @@ -316,7 +316,7 @@ def convert_to_csv_or_tsv( writer.writerow(row) else: - # Handle case: { key: value } + # Handle case: { key: value }. writer.writerow([dtype[:-1], "value"]) for key, value in data.items(): writer.writerow([key, value]) @@ -325,7 +325,7 @@ def convert_to_csv_or_tsv( print(f"Error writing to '{output_file}': {e}") continue - print(f"Data for '{language} {dtype}' written to '{output_file}'") + print(f"Data for {language} {dtype} written to '{output_file}'") # MARK: SQLITE @@ -371,6 +371,7 @@ def convert_to_sqlite( if input_file: input_file = Path(input_file) + if not input_file.exists(): raise ValueError(f"Input file does not exist: {input_file}") @@ -379,15 +380,13 @@ def convert_to_sqlite( if output_dir is None: output_dir = Path(DEFAULT_SQLITE_EXPORT_DIR) + else: output_dir = Path(output_dir) if not output_dir.exists(): output_dir.mkdir(parents=True, exist_ok=True) - print( - f"Converting data for language: {language}, data type: {data_type} to {output_type}" - ) data_to_sqlite(languages, specific_tables) source_file = f"{get_language_iso(language).upper()}LanguageData.sqlite" @@ -397,16 +396,18 @@ def convert_to_sqlite( if source_path.exists(): if target_path.exists() and not overwrite: print(f"File {target_path} already exists. 
Use --overwrite to replace.") + else: shutil.copy(source_path, target_path) print(f"SQLite database copied to: {target_path}") + else: print(f"Warning: SQLite file not found at {source_path}") print("SQLite file conversion complete.") -def convert( +def convert_wrapper( language: str, data_type: Union[str, List[str]], output_type: str, @@ -442,8 +443,9 @@ def convert( None """ output_type = output_type.lower() + print(f"Converting data for {language} {data_type} to {output_type} ...") - # Route the function call to the correct conversion function + # Route the function call to the correct conversion function. if output_type == "json": convert_to_json( language=language, @@ -453,6 +455,7 @@ def convert( output_dir=output_dir, overwrite=overwrite, ) + elif output_type in {"csv", "tsv"}: convert_to_csv_or_tsv( language=language, @@ -462,6 +465,7 @@ def convert( output_dir=output_dir, overwrite=overwrite, ) + elif output_type == "sqlite": convert_to_sqlite( language=language, @@ -471,7 +475,8 @@ def convert( output_dir=output_dir, overwrite=overwrite, ) + else: raise ValueError( - f"Unsupported output type '{output_type}'. Must be 'json', 'csv', 'tsv', or 'sqlite'." + f"Unsupported output type '{output_type}'. Must be 'json', 'csv', 'tsv' or 'sqlite'." 
) diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index fd521846a..3bde53831 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -20,11 +20,12 @@ --> """ +import os # for removing original JSON files import subprocess from pathlib import Path from typing import List, Union -import os # For removing the JSON file +from scribe_data.cli.convert import convert_wrapper from scribe_data.unicode.generate_emoji_keywords import generate_emoji from scribe_data.utils import ( DEFAULT_CSV_EXPORT_DIR, @@ -33,7 +34,6 @@ DEFAULT_TSV_EXPORT_DIR, ) from scribe_data.wikidata.query_data import query_data -from scribe_data.cli.convert import convert def get_data( @@ -139,10 +139,10 @@ def get_data( json_input_path = Path(output_dir) / f"{language}/{data_type}.json" - # Proceed with conversion only if the output type is not JSON + # Proceed with conversion only if the output type is not JSON. if output_type != "json": if json_input_path.exists(): - convert( + convert_wrapper( language=language, data_type=data_type, output_type=output_type, @@ -152,13 +152,16 @@ def get_data( ) os.remove(json_input_path) + else: - print(f"Error: Input file '{json_input_path}' does not exist.") + print( + f"Error: Input file '{json_input_path}' does not exist for conversion." + ) if interactive: return True - # Handle emoji keywords process failure + # Handle emoji keywords process failure. elif data_type in {"emoji-keywords", "emoji_keywords"}: print( "\nThe Scribe-Data emoji functionality is powered by PyICU, which is currently not installed." 
diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index b56ec42c3..83bd4d817 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -25,8 +25,7 @@ from pathlib import Path from scribe_data.cli.cli_utils import validate_language_and_data_type -from scribe_data.cli.convert import convert - +from scribe_data.cli.convert import convert_wrapper from scribe_data.cli.get import get_data from scribe_data.cli.interactive import start_interactive_mode from scribe_data.cli.list import list_wrapper @@ -90,7 +89,7 @@ def main() -> None: "--data-type", nargs="?", const=True, - help="List options for all or given data types.", + help="List options for all or given data types (e.g., nouns, verbs).", ) list_parser.add_argument( "-a", @@ -111,10 +110,13 @@ def main() -> None: ) get_parser._actions[0].help = "Show this help message and exit." get_parser.add_argument( - "-lang", "--language", type=str, help="The language(s) to get." + "-lang", "--language", type=str, help="The language(s) to get data for." ) get_parser.add_argument( - "-dt", "--data-type", type=str, help="The data type(s) to get." + "-dt", + "--data-type", + type=str, + help="The data type(s) to get data for (e.g., nouns, verbs).", ) get_parser.add_argument( "-ot", @@ -163,7 +165,10 @@ def main() -> None: "-lang", "--language", type=str, help="The language(s) to check totals for." ) total_parser.add_argument( - "-dt", "--data-type", type=str, help="The data type(s) to check totals for." + "-dt", + "--data-type", + type=str, + help="The data type(s) to check totals for (e.g., nouns, verbs).", ) total_parser.add_argument( "-a", @@ -183,7 +188,7 @@ def main() -> None: formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=60), ) - # Setting up the arguments for the convert command + convert_parser._actions[0].help = "Show this help message and exit." 
convert_parser.add_argument( "-lang", "--language", @@ -196,7 +201,7 @@ def main() -> None: "--data-type", type=str, required=True, - help="The data type(s) of the file to convert (e.g., noun, verb).", + help="The data type(s) of the file to convert (e.g., nouns, verbs).", ) convert_parser.add_argument( "-if", @@ -279,10 +284,12 @@ def main() -> None: ) elif args.command in ["total", "t"]: - total_wrapper(args.language, args.data_type, args.all) + total_wrapper( + language=args.language, data_type=args.data_type, all_bool=args.all + ) elif args.command in ["convert", "c"]: - convert( + convert_wrapper( language=args.language, data_type=args.data_type, output_type=args.output_type, diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index aec1f9560..1be35b28d 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ b/src/scribe_data/load/data_to_sqlite.py @@ -34,8 +34,8 @@ DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, get_language_iso, + list_all_languages, ) -from scribe_data.utils import list_all_languages def data_to_sqlite( @@ -53,10 +53,28 @@ def data_to_sqlite( current_language_data = json.load(f_languages) data_types = json.load(f_data_types).keys() + # TODO: Switch to all languages. current_languages = list_all_languages(current_language_data) + current_languages = [ + "english", + "french", + "german", + "italian", + "portuguese", + "russian", + "spanish", + "swedish", + ] + if not languages: languages = current_languages + elif isinstance(languages, str): + languages = languages.lower() + + elif isinstance(languages, list): + languages = [lang.lower() for lang in languages] + if not set(languages).issubset(current_languages): raise ValueError( f"Invalid language(s) specified. 
Available languages are: {', '.join(current_languages)}" diff --git a/src/scribe_data/wikidata/query_data.py b/src/scribe_data/wikidata/query_data.py index ad077bf01..c1f70ab99 100644 --- a/src/scribe_data/wikidata/query_data.py +++ b/src/scribe_data/wikidata/query_data.py @@ -143,7 +143,6 @@ def query_data( desc="Data updated", unit="process", disable=interactive, - colour="MAGENTA", ): lang = format_sublanguage_name(q.parent.parent.name, language_metadata) target_type = q.parent.name diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index 84c5d5f8b..1109b8037 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -20,23 +20,23 @@ --> """ -from io import StringIO import json -from pathlib import Path import unittest +from io import StringIO +from pathlib import Path from unittest.mock import MagicMock, Mock, mock_open, patch - from scribe_data.cli.convert import ( - convert, + convert_to_csv_or_tsv, convert_to_json, convert_to_sqlite, - convert_to_csv_or_tsv, + convert_wrapper, ) class TestConvert(unittest.TestCase): - # Helper Functions + # MARK: Helper Functions + def setup_language_map(self, mock_language_map: Mock) -> None: """ Set up the mock language map for testing. 
@@ -85,51 +85,51 @@ def normalize_line_endings(self, data: str) -> str: """ return data.replace("\r\n", "\n").replace("\r", "\n") - # MARK: JSON Tests - - @patch("scribe_data.cli.convert.language_map", autospec=True) - @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_normalized_language(self, mock_path, mock_language_map): - self.setup_language_map(mock_language_map) - - mock_path_obj = MagicMock(spec=Path) - mock_path.return_value = mock_path_obj - - mock_path_obj.suffix = ".csv" - mock_path_obj.exists.return_value = True - - convert_to_json( - language="French", - data_type="nouns", - output_type="json", - input_file="input.csv", - output_dir="/output_dir", - overwrite=True, - ) - - mock_language_map.get.assert_called_with("french") - - @patch("scribe_data.cli.convert.language_map", autospec=True) - @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_unknown_language(self, mock_path, mock_language_map): - mock_language_map.get.return_value = None - mock_input_file_path = MagicMock(spec=Path) - mock_input_file_path.exists.return_value = True - mock_path.side_effect = [mock_input_file_path, MagicMock(spec=Path)] - - with self.assertRaises(ValueError) as context: - convert_to_json( - language="kazatan", - data_type="nouns", - output_type="json", - input_file="test.csv", - output_dir="/output_dir", - overwrite=True, - ) - - self.assertEqual( - str(context.exception), "Language 'Kazatan' is not recognized." 
- ) + # MARK: JSON + + # @patch("scribe_data.cli.convert.language_map", autospec=True) + # @patch("scribe_data.cli.convert.Path", autospec=True) + # def test_convert_to_json_normalized_language(self, mock_path, mock_language_map): + # self.setup_language_map(mock_language_map) + + # mock_path_obj = MagicMock(spec=Path) + # mock_path.return_value = mock_path_obj + + # mock_path_obj.suffix = ".csv" + # mock_path_obj.exists.return_value = True + + # convert_to_json( + # language="French", + # data_type="nouns", + # output_type="json", + # input_file="input.csv", + # output_dir="/output_dir", + # overwrite=True, + # ) + + # mock_language_map.get.assert_called_with("french") + + # @patch("scribe_data.cli.convert.language_map", autospec=True) + # @patch("scribe_data.cli.convert.Path", autospec=True) + # def test_convert_to_json_unknown_language(self, mock_path, mock_language_map): + # mock_language_map.get.return_value = None + # mock_input_file_path = MagicMock(spec=Path) + # mock_input_file_path.exists.return_value = True + # mock_path.side_effect = [mock_input_file_path, MagicMock(spec=Path)] + + # with self.assertRaises(ValueError) as context: + # convert_to_json( + # language="UnsupportedLanguage", + # data_type="nouns", + # output_type="json", + # input_file="test.csv", + # output_dir="/output_dir", + # overwrite=True, + # ) + + # self.assertEqual( + # str(context.exception), "Language 'UnsupportedLanguage' is not recognized." 
+ # ) @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) @@ -358,68 +358,68 @@ def test_convert_to_json_with_complex_structure( ) self.assertEqual(json.loads(written_data), expected_json) - # MARK: CSV OR TSV Tests - - @patch("scribe_data.cli.convert.language_map", autospec=True) - @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_json_normalized_language( - self, mock_path, mock_language_map - ): - self.setup_language_map(mock_language_map) - - mock_path_obj = MagicMock(spec=Path) - mock_path.return_value = mock_path_obj - - mock_path_obj.suffix = ".json" - mock_path_obj.exists.return_value = True - - mock_json_data = json.dumps({"key1": "value1", "key2": "value2"}) - mock_open_function = mock_open(read_data=mock_json_data) - mock_path_obj.open = mock_open_function - - convert_to_csv_or_tsv( - language="English", - data_type="nouns", - output_type="csv", - input_file="input.json", - output_dir="/output_dir", - overwrite=True, - ) - - mock_language_map.get.assert_called_with("english") - - mock_open_function.assert_called_once_with("r", encoding="utf-8") - - @patch("scribe_data.cli.convert.language_map", autospec=True) - @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_json_unknown_language( - self, mock_path, mock_language_map - ): - self.setup_language_map(mock_language_map) - - mock_path_obj = MagicMock(spec=Path) - mock_path.return_value = mock_path_obj - - mock_path_obj.suffix = ".json" - mock_path_obj.exists.return_value = True - - mock_json_data = json.dumps({"key1": "value1", "key2": "value2"}) - mock_open_function = mock_open(read_data=mock_json_data) - mock_path_obj.open = mock_open_function - - with self.assertRaises(ValueError) as context: - convert_to_csv_or_tsv( - language="kazatan", - data_type="nouns", - output_type="csv", - input_file="input.json", - output_dir="/output_dir", - overwrite=True, - ) - - self.assertEqual( 
- str(context.exception), "Language 'Kazatan' is not recognized." - ) + # MARK: CSV or TSV + + # @patch("scribe_data.cli.convert.language_map", autospec=True) + # @patch("scribe_data.cli.convert.Path", autospec=True) + # def test_convert_to_csv_or_json_normalized_language( + # self, mock_path, mock_language_map + # ): + # self.setup_language_map(mock_language_map) + + # mock_path_obj = MagicMock(spec=Path) + # mock_path.return_value = mock_path_obj + + # mock_path_obj.suffix = ".json" + # mock_path_obj.exists.return_value = True + + # mock_json_data = json.dumps({"key1": "value1", "key2": "value2"}) + # mock_open_function = mock_open(read_data=mock_json_data) + # mock_path_obj.open = mock_open_function + + # convert_to_csv_or_tsv( + # language="English", + # data_type="nouns", + # output_type="csv", + # input_file="input.json", + # output_dir="/output_dir", + # overwrite=True, + # ) + + # mock_language_map.get.assert_called_with("english") + + # mock_open_function.assert_called_once_with("r", encoding="utf-8") + + # @patch("scribe_data.cli.convert.language_map", autospec=True) + # @patch("scribe_data.cli.convert.Path", autospec=True) + # def test_convert_to_csv_or_json_unknown_language( + # self, mock_path, mock_language_map + # ): + # self.setup_language_map(mock_language_map) + + # mock_path_obj = MagicMock(spec=Path) + # mock_path.return_value = mock_path_obj + + # mock_path_obj.suffix = ".json" + # mock_path_obj.exists.return_value = True + + # mock_json_data = json.dumps({"key1": "value1", "key2": "value2"}) + # mock_open_function = mock_open(read_data=mock_json_data) + # mock_path_obj.open = mock_open_function + + # with self.assertRaises(ValueError) as context: + # convert_to_csv_or_tsv( + # language="UnsupportedLanguage", + # data_type="nouns", + # output_type="csv", + # input_file="input.json", + # output_dir="/output_dir", + # overwrite=True, + # ) + + # self.assertEqual( + # str(context.exception), "Language 'UnsupportedLanguage' is not recognized." 
+ # ) @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) @@ -663,7 +663,7 @@ def test_convert_to_csv_or_tsv_listofdicts_to_tsv( self.setup_language_map(mock_language_map) - # Mock input file path + # Mock input file path. mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True @@ -790,7 +790,7 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv( expected_tsv_output = self.normalize_line_endings(expected_tsv_output) self.assertEqual(written_data, expected_tsv_output) - # MARK: SQLITE Tests + # MARK: SQLITE @patch("scribe_data.cli.convert.Path") @patch("scribe_data.cli.convert.data_to_sqlite") @@ -867,7 +867,7 @@ def test_convert_to_sqlite_no_language(self): def test_convert(self): with self.assertRaises(ValueError) as context: - convert( + convert_wrapper( language="English", data_type="nouns", output_type="parquet", @@ -878,5 +878,5 @@ def test_convert(self): self.assertEqual( str(context.exception), - "Unsupported output type 'parquet'. Must be 'json', 'csv', 'tsv', or 'sqlite'.", + "Unsupported output type 'parquet'. 
Must be 'json', 'csv', 'tsv' or 'sqlite'.", ) diff --git a/tests/cli/test_get.py b/tests/cli/test_get.py index 686f62843..a1e21e750 100644 --- a/tests/cli/test_get.py +++ b/tests/cli/test_get.py @@ -29,10 +29,15 @@ class TestGetData(unittest.TestCase): # MARK: Subprocess Patching - @patch("subprocess.run") - def test_get_emoji_keywords(self, mock_subprocess_run): - get_data(language="English", data_type="emoji-keywords") - self.assertTrue(mock_subprocess_run.called) + @patch("scribe_data.cli.get.generate_emoji") + def test_get_emoji_keywords(self, generate_emoji): + get_data( + language="English", data_type="emoji_keywords", output_dir="./test_output" + ) + generate_emoji.assert_called_once_with( + language="English", + output_dir="./test_output", + ) # MARK: Invalid Arguments From fbb5e5ae86af6b6dc46e5fefca18f35ac2aa2ef1 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Thu, 24 Oct 2024 23:56:27 +0200 Subject: [PATCH 11/13] Update tests to remove language map --- tests/cli/test_convert.py | 134 ++++++++------------------------------ 1 file changed, 26 insertions(+), 108 deletions(-) diff --git a/tests/cli/test_convert.py b/tests/cli/test_convert.py index 1109b8037..5927f3c48 100644 --- a/tests/cli/test_convert.py +++ b/tests/cli/test_convert.py @@ -56,15 +56,11 @@ def setup_language_map(self, mock_language_map: Mock) -> None: "language": "english", "iso": "en", "qid": "Q1860", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": [], }, "french": { "language": "french", "iso": "fr", "qid": "Q150", - "remove-words": ["of", "the", "The", "and"], - "ignore-words": ["XXe"], }, }.get(lang.lower()) @@ -87,10 +83,9 @@ def normalize_line_endings(self, data: str) -> str: # MARK: JSON - # @patch("scribe_data.cli.convert.language_map", autospec=True) # @patch("scribe_data.cli.convert.Path", autospec=True) - # def test_convert_to_json_normalized_language(self, mock_path, mock_language_map): - # self.setup_language_map(mock_language_map) + # def 
test_convert_to_json_normalized_language(self, mock_path): + # # mock_path_obj = MagicMock(spec=Path) # mock_path.return_value = mock_path_obj @@ -107,19 +102,15 @@ def normalize_line_endings(self, data: str) -> str: # overwrite=True, # ) - # mock_language_map.get.assert_called_with("french") - - # @patch("scribe_data.cli.convert.language_map", autospec=True) # @patch("scribe_data.cli.convert.Path", autospec=True) - # def test_convert_to_json_unknown_language(self, mock_path, mock_language_map): - # mock_language_map.get.return_value = None + # def test_convert_to_json_unknown_language(self, mock_path): # mock_input_file_path = MagicMock(spec=Path) # mock_input_file_path.exists.return_value = True # mock_path.side_effect = [mock_input_file_path, MagicMock(spec=Path)] # with self.assertRaises(ValueError) as context: # convert_to_json( - # language="UnsupportedLanguage", + # language="FakeLanguage", # data_type="nouns", # output_type="json", # input_file="test.csv", @@ -128,17 +119,14 @@ def normalize_line_endings(self, data: str) -> str: # ) # self.assertEqual( - # str(context.exception), "Language 'UnsupportedLanguage' is not recognized." + # str(context.exception), "Language 'FakeLanguage' is not recognized." 
# ) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_with_input_file(self, mock_path, mock_language_map): + def test_convert_to_json_with_input_file(self, mock_path): csv_data = "key,value\na,1\nb,2" mock_file = StringIO(csv_data) - self.setup_language_map(mock_language_map) - mock_path_obj = MagicMock(spec=Path) mock_path.return_value = mock_path_obj mock_path_obj.suffix = ".csv" @@ -158,13 +146,8 @@ def test_convert_to_json_with_input_file(self, mock_path, mock_language_map): mock_path_obj.open.assert_called_once_with("r", encoding="utf-8") - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path") - def test_convert_to_json_supported_file_extension_csv( - self, mock_path_class, mock_language_map - ): - self.setup_language_map(mock_language_map) - + def test_convert_to_json_supported_file_extension_csv(self, mock_path_class): mock_path_instance = MagicMock(spec=Path) mock_path_class.return_value = mock_path_instance @@ -181,12 +164,8 @@ def test_convert_to_json_supported_file_extension_csv( overwrite=True, ) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path") - def test_convert_to_json_supported_file_extension_tsv( - self, mock_path_class, mock_language_map - ): - self.setup_language_map(mock_language_map) + def test_convert_to_json_supported_file_extension_tsv(self, mock_path_class): mock_path_instance = MagicMock(spec=Path) mock_path_class.return_value = mock_path_instance @@ -203,12 +182,8 @@ def test_convert_to_json_supported_file_extension_tsv( overwrite=True, ) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path") - def test_convert_to_json_unsupported_file_extension( - self, mock_path, mock_language_map - ): - self.setup_language_map(mock_language_map) + def test_convert_to_json_unsupported_file_extension(self, 
mock_path): mock_path_obj = MagicMock(spec=Path) mock_path.return_value = mock_path_obj @@ -231,15 +206,12 @@ def test_convert_to_json_unsupported_file_extension( "Unsupported file extension '.txt' for test.txt. Please provide a '.csv' or '.tsv' file.", ) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_standard_csv(self, mock_path_class, mock_language_map): + def test_convert_to_json_standard_csv(self, mock_path_class): csv_data = "key,value\na,1\nb,2" expected_json = {"a": "1", "b": "2"} mock_file_obj = StringIO(csv_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".csv" mock_input_file_path.exists.return_value = True @@ -273,11 +245,8 @@ def test_convert_to_json_standard_csv(self, mock_path_class, mock_language_map): self.assertEqual(json.loads(written_data), expected_json) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_with_multiple_keys( - self, mock_path_class, mock_language_map - ): + def test_convert_to_json_with_multiple_keys(self, mock_path_class): csv_data = "key,value1,value2\na,1,x\nb,2,y\nc,3,z" expected_json = { "a": {"value1": "1", "value2": "x"}, @@ -286,8 +255,6 @@ def test_convert_to_json_with_multiple_keys( } mock_file_obj = StringIO(csv_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".csv" mock_input_file_path.exists.return_value = True @@ -316,11 +283,8 @@ def test_convert_to_json_with_multiple_keys( ) self.assertEqual(json.loads(written_data), expected_json) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_json_with_complex_structure( - self, mock_path_class, mock_language_map - ): + def 
test_convert_to_json_with_complex_structure(self, mock_path_class): csv_data = "key,emoji,is_base,rank\na,😀,true,1\nb,😅,false,2" expected_json = { "a": [{"emoji": "😀", "is_base": True, "rank": 1}], @@ -328,8 +292,6 @@ def test_convert_to_json_with_complex_structure( } mock_file_obj = StringIO(csv_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".csv" mock_input_file_path.exists.return_value = True @@ -360,12 +322,11 @@ def test_convert_to_json_with_complex_structure( # MARK: CSV or TSV - # @patch("scribe_data.cli.convert.language_map", autospec=True) # @patch("scribe_data.cli.convert.Path", autospec=True) # def test_convert_to_csv_or_json_normalized_language( - # self, mock_path, mock_language_map + # self, mock_path # ): - # self.setup_language_map(mock_language_map) + # # mock_path_obj = MagicMock(spec=Path) # mock_path.return_value = mock_path_obj @@ -386,16 +347,13 @@ def test_convert_to_json_with_complex_structure( # overwrite=True, # ) - # mock_language_map.get.assert_called_with("english") - # mock_open_function.assert_called_once_with("r", encoding="utf-8") - # @patch("scribe_data.cli.convert.language_map", autospec=True) # @patch("scribe_data.cli.convert.Path", autospec=True) # def test_convert_to_csv_or_json_unknown_language( - # self, mock_path, mock_language_map + # self, mock_path # ): - # self.setup_language_map(mock_language_map) + # # mock_path_obj = MagicMock(spec=Path) # mock_path.return_value = mock_path_obj @@ -409,7 +367,7 @@ def test_convert_to_json_with_complex_structure( # with self.assertRaises(ValueError) as context: # convert_to_csv_or_tsv( - # language="UnsupportedLanguage", + # language="FakeLanguage", # data_type="nouns", # output_type="csv", # input_file="input.json", @@ -418,21 +376,16 @@ def test_convert_to_json_with_complex_structure( # ) # self.assertEqual( - # str(context.exception), "Language 'UnsupportedLanguage' is not recognized."
+ # str(context.exception), "Language 'FakeLanguage' is not recognized." # ) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_tsv_standarddict_to_csv( - self, mock_path_class, mock_language_map - ): + def test_convert_to_csv_or_tsv_standarddict_to_csv(self, mock_path_class): json_data = '{"a": "1", "b": "2"}' expected_csv_output = "preposition,value\n" "a,1\n" "b,2\n" mock_file_obj = StringIO(json_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True @@ -467,19 +420,14 @@ def test_convert_to_csv_or_tsv_standarddict_to_csv( self.assertEqual(written_data, expected_csv_output) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_tsv_standarddict_to_tsv( - self, mock_path_class, mock_language_map - ): + def test_convert_to_csv_or_tsv_standarddict_to_tsv(self, mock_path_class): json_data = '{"a": "1", "b": "2"}' expected_tsv_output = "preposition\tvalue\n" "a\t1\n" "b\t2\n" mock_file_obj = StringIO(json_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True @@ -513,19 +461,14 @@ def test_convert_to_csv_or_tsv_standarddict_to_tsv( self.assertEqual(written_data, expected_tsv_output) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_tsv_nesteddict_to_csv( - self, mock_path_class, mock_language_map - ): + def test_convert_to_csv_or_tsv_nesteddict_to_csv(self, mock_path_class): json_data = ( '{"a": {"value1": "1", "value2": "x"}, "b": {"value1": "2", "value2": "y"}}' ) expected_csv_output = "noun,value1,value2\n" "a,1,x\n" "b,2,y\n" 
mock_file_obj = StringIO(json_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True @@ -558,11 +501,8 @@ def test_convert_to_csv_or_tsv_nesteddict_to_csv( expected_csv_output = self.normalize_line_endings(expected_csv_output) self.assertEqual(written_data, expected_csv_output) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_tsv_nesteddict_to_tsv( - self, mock_path_class, mock_language_map - ): + def test_convert_to_csv_or_tsv_nesteddict_to_tsv(self, mock_path_class): json_data = ( '{"a": {"value1": "1", "value2": "x"}, "b": {"value1": "2", "value2": "y"}}' ) @@ -570,8 +510,6 @@ def test_convert_to_csv_or_tsv_nesteddict_to_tsv( mock_file_obj = StringIO(json_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True @@ -605,19 +543,14 @@ def test_convert_to_csv_or_tsv_nesteddict_to_tsv( self.assertEqual(written_data, expected_tsv_output) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_tsv_listofdicts_to_csv( - self, mock_path_class, mock_language_map - ): + def test_convert_to_csv_or_tsv_listofdicts_to_csv(self, mock_path_class): json_data = '{"a": [{"emoji": "😀", "is_base": true, "rank": 1}, {"emoji": "😅", "is_base": false, "rank": 2}]}' expected_csv_output = ( "word,emoji,is_base,rank\n" "a,😀,True,1\n" "a,😅,False,2\n" ) mock_file_obj = StringIO(json_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True @@ -650,19 +583,14 @@ def test_convert_to_csv_or_tsv_listofdicts_to_csv( 
expected_csv_output = self.normalize_line_endings(expected_csv_output) self.assertEqual(written_data, expected_csv_output) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_tsv_listofdicts_to_tsv( - self, mock_path_class, mock_language_map - ): + def test_convert_to_csv_or_tsv_listofdicts_to_tsv(self, mock_path_class): json_data = '{"a": [{"emoji": "😀", "is_base": true, "rank": 1}, {"emoji": "😅", "is_base": false, "rank": 2}]}' expected_tsv_output = ( "word\temoji\tis_base\trank\n" "a\t😀\tTrue\t1\n" "a\t😅\tFalse\t2\n" ) mock_file_obj = StringIO(json_data) - self.setup_language_map(mock_language_map) - # Mock input file path. mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" @@ -697,11 +625,8 @@ def test_convert_to_csv_or_tsv_listofdicts_to_tsv( expected_tsv_output = self.normalize_line_endings(expected_tsv_output) self.assertEqual(written_data, expected_tsv_output) - @patch("scribe_data.cli.convert.language_map", autospec=True) @patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_tsv_liststrings_to_csv( - self, mock_path_class, mock_language_map - ): + def test_convert_to_csv_or_tsv_liststrings_to_csv(self, mock_path_class): json_data = '{"a": ["x", "y", "z"]}' expected_csv_output = ( "autosuggestion,autosuggestion_1,autosuggestion_2,autosuggestion_3\n" @@ -709,8 +634,6 @@ def test_convert_to_csv_or_tsv_liststrings_to_csv( ) mock_file_obj = StringIO(json_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True @@ -744,11 +667,8 @@ def test_convert_to_csv_or_tsv_liststrings_to_csv( expected_csv_output = self.normalize_line_endings(expected_csv_output) self.assertEqual(written_data, expected_csv_output) - @patch("scribe_data.cli.convert.language_map", autospec=True) 
@patch("scribe_data.cli.convert.Path", autospec=True) - def test_convert_to_csv_or_tsv_liststrings_to_tsv( - self, mock_path_class, mock_language_map - ): + def test_convert_to_csv_or_tsv_liststrings_to_tsv(self, mock_path_class): json_data = '{"a": ["x", "y", "z"]}' expected_tsv_output = ( "autosuggestion\tautosuggestion_1\tautosuggestion_2\tautosuggestion_3\n" @@ -756,8 +676,6 @@ def test_convert_to_csv_or_tsv_liststrings_to_tsv( ) mock_file_obj = StringIO(json_data) - self.setup_language_map(mock_language_map) - mock_input_file_path = MagicMock(spec=Path) mock_input_file_path.suffix = ".json" mock_input_file_path.exists.return_value = True From 4b910236dd925dd980bdbd9a4f576f49581d3c12 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 25 Oct 2024 00:13:50 +0200 Subject: [PATCH 12/13] Update code formatting and doc strings --- src/scribe_data/check/check_pyICU.py | 95 ++++++++++++------- .../unicode/generate_emoji_keywords.py | 23 +++-- 2 files changed, 77 insertions(+), 41 deletions(-) diff --git a/src/scribe_data/check/check_pyICU.py b/src/scribe_data/check/check_pyICU.py index a30e7e8e8..c67b4d3bc 100644 --- a/src/scribe_data/check/check_pyICU.py +++ b/src/scribe_data/check/check_pyICU.py @@ -1,17 +1,39 @@ -import requests -import pkg_resources -import sys +""" +Check to see if the requirements of the emoji process are installed. + +.. 
raw:: html + +""" + import os -import platform # Added to check the OS -from pathlib import Path +import platform # added to check the OS import subprocess +import sys +from pathlib import Path + +import pkg_resources +import requests def check_if_pyicu_installed(): installed_packages = {pkg.key for pkg in pkg_resources.working_set} - if "pyicu" in installed_packages: - return True - return False + + return "pyicu" in installed_packages def get_python_version_and_architecture(): @@ -20,9 +42,10 @@ def get_python_version_and_architecture(): Returns ------- - str : python_version + python_version : str The Python version in the format 'cpXY'. - str : architecture + + architecture : str The architecture type ('amd64' or 'win32'). """ version = sys.version_info @@ -37,14 +60,15 @@ def fetch_wheel_releases(): Returns ------- - list : available_wheels + available_wheels : list A list of tuples containing wheel file names and their download URLs. - float : total_size_mb + + total_size_mb : float The total size of all available wheels in MB. """ url = "https://api.github.com/repos/cgohlke/pyicu-build/releases" response = requests.get(url) - response.raise_for_status() # Raise an error for bad responses + response.raise_for_status() # raise an error for bad responses available_wheels = [] total_size_bytes = 0 @@ -55,7 +79,7 @@ def fetch_wheel_releases(): available_wheels.append((asset["name"], asset["browser_download_url"])) total_size_bytes += asset["size"] - total_size_mb = total_size_bytes / (1024 * 1024) # Convert bytes to MB + total_size_mb = total_size_bytes / (1024 * 1024) # convert bytes to MB return available_wheels, total_size_mb @@ -67,6 +91,7 @@ def download_wheel_file(wheel_url, output_dir): ---------- wheel_url : str The URL of the wheel file to download. + output_dir : str The directory to save the downloaded file. @@ -75,7 +100,7 @@ def download_wheel_file(wheel_url, output_dir): str : path to the downloaded wheel file. 
""" response = requests.get(wheel_url) - response.raise_for_status() # Raise an error for bad responses + response.raise_for_status() # raise an error for bad responses wheel_filename = os.path.basename(wheel_url) wheel_path = os.path.join(output_dir, wheel_filename) @@ -94,8 +119,10 @@ def find_matching_wheel(wheels, python_version, architecture): ---------- wheels : list The list of available wheels. + python_version : str The Python version (e.g., 'cp311'). + architecture : str The architecture type (e.g., 'win_amd64'). @@ -103,10 +130,14 @@ def find_matching_wheel(wheels, python_version, architecture): ------- str : The download URL of the matching wheel or None if not found. """ - for name, download_url in wheels: - if python_version in name and architecture in name: - return download_url - return None + return next( + ( + download_url + for name, download_url in wheels + if python_version in name and architecture in name + ), + None, + ) def check_and_install_pyicu(): @@ -115,7 +146,7 @@ def check_and_install_pyicu(): if package_name.lower() not in installed_packages: # print(f"{package_name} not found. Installing...") - # Fetch available wheels from GitHub to estimate download size + # Fetch available wheels from GitHub to estimate download size. wheels, total_size_mb = fetch_wheel_releases() print( @@ -124,28 +155,31 @@ def check_and_install_pyicu(): ) user_input = input().strip().lower() - if user_input == "" or user_input in ["y", "yes"]: + if user_input in ["", "y", "yes"]: print("Proceeding with installation...") + else: print("Installation aborted by the user.") return False - # Check the operating system + # Check the operating system. if platform.system() != "Windows": - # If not Windows, directly use pip to install PyICU + # If not Windows, directly use pip to install PyICU. 
try: subprocess.run( [sys.executable, "-m", "pip", "install", package_name], check=True ) print(f"{package_name} has been installed successfully.") + except subprocess.CalledProcessError as e: print(f"Error occurred while installing {package_name}: {e}") return False + else: - # Windows-specific installation using wheel files + # Windows-specific installation using wheel files. python_version, architecture = get_python_version_and_architecture() - # Find the matching wheel for the current Python version and architecture + # Find the matching wheel for the current Python version and architecture. wheel_url = find_matching_wheel(wheels, python_version, architecture) if not wheel_url: @@ -154,11 +188,11 @@ def check_and_install_pyicu(): ) return False - # Download the wheel file - output_dir = Path.cwd() # Use the current directory for simplicity + # Download the wheel file. + output_dir = Path.cwd() # use the current directory for simplicity wheel_path = download_wheel_file(wheel_url, output_dir) - # Install PyICU using pip + # Install PyICU using pip. try: subprocess.run( [sys.executable, "-m", "pip", "install", wheel_path], @@ -166,7 +200,7 @@ def check_and_install_pyicu(): ) print(f"{package_name} has been installed successfully.") - # Remove the downloaded wheel file + # Remove the downloaded wheel file. 
os.remove(wheel_path) print(f"Removed temporary file: {wheel_path}") @@ -174,7 +208,4 @@ def check_and_install_pyicu(): print(f"Error occurred while installing {package_name}: {e}") return False - # else: - # print(f"{package_name} is already installed.") - return True diff --git a/src/scribe_data/unicode/generate_emoji_keywords.py b/src/scribe_data/unicode/generate_emoji_keywords.py index 6dbdcc5a9..beb34257d 100644 --- a/src/scribe_data/unicode/generate_emoji_keywords.py +++ b/src/scribe_data/unicode/generate_emoji_keywords.py @@ -23,12 +23,12 @@ import os from pathlib import Path -from scribe_data.unicode.process_unicode import gen_emoji_lexicon -from scribe_data.utils import export_formatted_data, get_language_iso -from scribe_data.check.check_pyICU import ( +from scribe_data.check.check_pyicu import ( check_and_install_pyicu, check_if_pyicu_installed, ) +from scribe_data.unicode.process_unicode import gen_emoji_lexicon +from scribe_data.utils import export_formatted_data, get_language_iso DATA_TYPE = "emoji-keywords" EMOJI_KEYWORDS_DICT = 3 @@ -42,13 +42,18 @@ def generate_emoji(language, output_dir: str = None): If the installation is successful, it proceeds with generating emoji keywords based on the specified language. The results are then exported to the provided output directory. - Parameters: - - language (str): The ISO code of the language for which to generate emoji keywords. - - output_dir (str, optional): The directory where the generated data will be saved. If not specified, - the data will be saved in a default directory. + Parameters + ---------- + language : str + The ISO code of the language for which to generate emoji keywords. + + output_dir : str, optional + The directory where the generated data will be saved. + If not specified, the data will be saved in a default directory. - Returns: - - None: The function does not return any value but outputs data to the specified directory. 
+ Returns + ------- + None: The function does not return any value but outputs data to the specified directory. """ if check_and_install_pyicu() and check_if_pyicu_installed() is False: print("Thank you.") From c35989f41025f509fec37b57a507db2d436c2418 Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Fri, 25 Oct 2024 00:17:59 +0200 Subject: [PATCH 13/13] Rename pyICU in Git as well --- src/scribe_data/check/{check_pyICU.py => check_pyicu.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/scribe_data/check/{check_pyICU.py => check_pyicu.py} (100%) diff --git a/src/scribe_data/check/check_pyICU.py b/src/scribe_data/check/check_pyicu.py similarity index 100% rename from src/scribe_data/check/check_pyICU.py rename to src/scribe_data/check/check_pyicu.py