diff --git a/docs/source/scribe_data/cli.rst b/docs/source/scribe_data/cli.rst index 5df9ee92..d8274675 100644 --- a/docs/source/scribe_data/cli.rst +++ b/docs/source/scribe_data/cli.rst @@ -145,6 +145,7 @@ Options: - ``-o, --overwrite``: Whether to overwrite existing files (default: False). - ``-a, --all``: Get all languages and data types. Can be combined with `-dt` to get all languages for a specific data type, or with `-lang` to get all data types for a specific language. - ``-i, --interactive``: Run in interactive mode. +- ``-ic, --identifier-case``: The case format for identifiers in the output data (default: camel). Examples: diff --git a/src/scribe_data/cli/convert.py b/src/scribe_data/cli/convert.py index 1ef07061..7fbb98b4 100644 --- a/src/scribe_data/cli/convert.py +++ b/src/scribe_data/cli/convert.py @@ -33,8 +33,10 @@ DEFAULT_SQLITE_EXPORT_DIR, DEFAULT_TSV_EXPORT_DIR, get_language_iso, + camel_to_snake, ) + # MARK: JSON @@ -45,6 +47,7 @@ def convert_to_json( input_file: str, output_dir: str = None, overwrite: bool = False, + identifier_case: str = "camel", ) -> None: """ Convert a CSV/TSV file to JSON. @@ -69,6 +72,9 @@ def convert_to_json( overwrite : bool Whether to overwrite existing files. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- None @@ -118,7 +124,11 @@ def convert_to_json( elif len(keys) == 2: # Handle Case: { key: value }. for row in rows: - key = row[keys[0]] + key = ( + camel_to_snake(row[keys[0]]) + if identifier_case == "snake" + else row[keys[0]] + ) value = row[keys[1]] data[key] = value @@ -126,7 +136,10 @@ def convert_to_json( if all(col in first_row for col in ["emoji", "is_base", "rank"]): # Handle Case: { key: [ { emoji: ..., is_base: ..., rank: ... }, { emoji: ..., is_base: ..., rank: ... } ] }. 
for row in rows: - key = row.get(reader.fieldnames[0]) + if identifier_case == "snake": + key = camel_to_snake(row.get(reader.fieldnames[0])) + else: + key = row.get(reader.fieldnames[0]) emoji = row.get("emoji", "").strip() is_base = ( row.get("is_base", "false").strip().lower() == "true" @@ -143,7 +156,14 @@ def convert_to_json( else: # Handle Case: { key: { value1: ..., value2: ... } }. for row in rows: - data[row[keys[0]]] = {k: row[k] for k in keys[1:]} + data[row[keys[0]]] = { + ( + camel_to_snake(k) + if identifier_case == "snake" + else k + ): row[k] + for k in keys[1:] + } except (IOError, csv.Error) as e: print(f"Error reading '{input_file_path}': {e}") @@ -181,6 +201,7 @@ def convert_to_csv_or_tsv( input_file: str, output_dir: str = None, overwrite: bool = False, + identifier_case: str = "camel", ) -> None: """ Convert a JSON File to CSV/TSV file. @@ -205,6 +226,9 @@ def convert_to_csv_or_tsv( overwrite : bool Whether to overwrite existing files. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- None @@ -265,7 +289,16 @@ def convert_to_csv_or_tsv( if isinstance(data[first_key], dict): # Handle case: { key: { value1: ..., value2: ... } }. columns = sorted(next(iter(data.values())).keys()) - writer.writerow([dtype[:-1]] + columns) + header = [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1] + ] + header += [ + camel_to_snake(col) if identifier_case == "snake" else col + for col in columns + ] + writer.writerow(header) for key, value in data.items(): row = [key] + [value.get(col, "") for col in columns] @@ -276,7 +309,11 @@ def convert_to_csv_or_tsv( # Handle case: { key: [ { value1: ..., value2: ... } ] }. 
if "emoji" in data[first_key][0]: # emoji specific case columns = ["word", "emoji", "is_base", "rank"] - writer.writerow(columns) + writer.writerow( + [camel_to_snake(col) for col in columns] + if identifier_case == "snake" + else columns + ) for key, value in data.items(): for item in value: @@ -288,7 +325,13 @@ def convert_to_csv_or_tsv( ] writer.writerow(row) else: - columns = [dtype[:-1]] + list(data[first_key][0].keys()) + if identifier_case == "snake": + columns = [camel_to_snake(dtype[:-1])] + [ + camel_to_snake(col) + for col in data[first_key][0].keys() + ] + else: + writer.writerow(columns) writer.writerow(columns) for key, value in data.items(): @@ -300,20 +343,30 @@ def convert_to_csv_or_tsv( elif all(isinstance(item, str) for item in data[first_key]): # Handle case: { key: [value1, value2, ...] }. - writer.writerow( - [dtype[:-1]] - + [ - f"autosuggestion_{i+1}" - for i in range(len(data[first_key])) - ] - ) + header = [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1] + ] + header += [ + f"autosuggestion_{i+1}" + for i in range(len(data[first_key])) + ] + writer.writerow(header) for key, value in data.items(): row = [key] + value writer.writerow(row) else: # Handle case: { key: value }. - writer.writerow([dtype[:-1], "value"]) + writer.writerow( + [ + camel_to_snake(dtype[:-1]) + if identifier_case == "snake" + else dtype[:-1], + "value", + ] + ) for key, value in data.items(): writer.writerow([key, value]) @@ -334,6 +387,7 @@ def convert_to_sqlite( input_file: str = None, output_dir: str = None, overwrite: bool = False, + identifier_case: str = "snake", ) -> None: """ Converts a Scribe-Data output file to an SQLite file. @@ -358,6 +412,9 @@ def convert_to_sqlite( overwrite : bool Whether to overwrite existing files. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- A SQLite file saved in the given location. 
@@ -383,7 +440,7 @@ def convert_to_sqlite( if not output_dir.exists(): output_dir.mkdir(parents=True, exist_ok=True) - data_to_sqlite(languages, specific_tables) + data_to_sqlite(languages, specific_tables, identifier_case) source_file = f"{get_language_iso(language).capitalize()}LanguageData.sqlite" source_path = input_file.parent / source_file @@ -410,6 +467,7 @@ def convert_wrapper( input_file: str, output_dir: str = None, overwrite: bool = False, + identifier_case: str = "snake", ): """ Convert data to the specified output type: JSON, CSV/TSV, or SQLite. @@ -434,6 +492,9 @@ def convert_wrapper( overwrite : bool, optional Whether to overwrite existing output files. Defaults to False. + identifier_case : str + The case format for identifiers. Default is "snake". + Returns ------- None @@ -452,6 +513,7 @@ def convert_wrapper( input_file=input_file, output_dir=output_dir, overwrite=overwrite, + identifier_case=identifier_case, ) elif output_type in {"csv", "tsv"}: @@ -462,6 +524,7 @@ def convert_wrapper( input_file=input_file, output_dir=output_dir, overwrite=overwrite, + identifier_case=identifier_case, ) elif output_type == "sqlite": @@ -472,6 +535,7 @@ def convert_wrapper( input_file=input_file, output_dir=output_dir, overwrite=overwrite, + identifier_case=identifier_case, ) else: diff --git a/src/scribe_data/cli/get.py b/src/scribe_data/cli/get.py index c02908aa..9e9bcd54 100644 --- a/src/scribe_data/cli/get.py +++ b/src/scribe_data/cli/get.py @@ -45,6 +45,7 @@ def get_data( outputs_per_entry: int = None, all: bool = False, interactive: bool = False, + identifier_case: str = "camel", ) -> None: """ Function for controlling the data get process for the CLI. @@ -75,6 +76,9 @@ def get_data( interactive : bool (default: False) Whether it's running in interactive mode. + identifier_case : str + The case format for identifiers. Default is "camel". + Returns ------- The requested data saved locally given file type and location arguments. 
@@ -181,6 +185,7 @@ def get_data( input_file=str(json_input_path), output_dir=output_dir, overwrite=overwrite, + identifier_case=identifier_case, ) os.remove(json_input_path) diff --git a/src/scribe_data/cli/main.py b/src/scribe_data/cli/main.py index be6e760a..38615200 100644 --- a/src/scribe_data/cli/main.py +++ b/src/scribe_data/cli/main.py @@ -151,6 +151,14 @@ def main() -> None: get_parser.add_argument( "-i", "--interactive", action="store_true", help="Run in interactive mode" ) + get_parser.add_argument( + "-ic", + "--identifier-case", + type=str, + choices=["camel", "snake"], + default="camel", + help="The case format for identifiers in the output data (default: camel).", + ) # MARK: Total @@ -242,6 +250,14 @@ def main() -> None: default=True, help="Whether to keep the original file to be converted (default: True).", ) + convert_parser.add_argument( + "-ic", + "--identifier-case", + type=str, + choices=["camel", "snake"], + default="camel", + help="The case format for identifiers in the output data (default: camel).", + ) # MARK: Setup CLI @@ -287,6 +303,7 @@ def main() -> None: outputs_per_entry=args.outputs_per_entry, overwrite=args.overwrite, all=args.all, + identifier_case=args.identifier_case, ) elif args.command in ["total", "t"]: @@ -295,8 +312,12 @@ def main() -> None: else: total_wrapper( - language=args.language.lower() if args.language is not None else None, - data_type=args.data_type.lower() if args.data_type is not None else None, + language=args.language.lower() + if args.language is not None + else None, + data_type=args.data_type.lower() + if args.data_type is not None + else None, all_bool=args.all, ) @@ -308,6 +329,7 @@ def main() -> None: input_file=args.input_file, output_dir=args.output_dir, overwrite=args.overwrite, + identifier_case=args.identifier_case, ) else: diff --git a/src/scribe_data/load/data_to_sqlite.py b/src/scribe_data/load/data_to_sqlite.py index 1be35b28..99ee7ff0 100644 --- a/src/scribe_data/load/data_to_sqlite.py +++ 
b/src/scribe_data/load/data_to_sqlite.py @@ -33,13 +33,16 @@ from scribe_data.utils import ( DEFAULT_JSON_EXPORT_DIR, DEFAULT_SQLITE_EXPORT_DIR, + camel_to_snake, get_language_iso, list_all_languages, ) def data_to_sqlite( - languages: Optional[List[str]] = None, specific_tables: Optional[List[str]] = None + languages: Optional[List[str]] = None, + specific_tables: Optional[List[str]] = None, + identifier_case: str = "camel", ) -> None: PATH_TO_SCRIBE_DATA = Path(__file__).parent.parent @@ -108,11 +111,16 @@ def create_table(data_type, cols): Parameters ---------- data_type : str - The name of the table to be created + The name of the table to be created. cols : list of strings - The names of columns for the new table + The names of columns for the new table. """ + # Convert column names to snake_case if requested. + cols = [ + camel_to_snake(col) if identifier_case == "snake" else col for col in cols + ] + cursor.execute( f"CREATE TABLE IF NOT EXISTS {data_type} ({' Text, '.join(cols)} Text, UNIQUE({cols[0]}))" ) diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index fbd3db2b..919a5868 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -23,6 +23,7 @@ import ast import json +import re from importlib import resources from pathlib import Path from typing import Any, Optional @@ -649,3 +650,11 @@ def list_languages_with_metadata_for_data_type(language_metadata=_languages): ) return sorted(current_languages, key=lambda x: x["name"]) + + +# MARK: Case Conversion + + +def camel_to_snake(name: str) -> str: + """Convert camelCase to snake_case.""" + return re.sub(r"(?