Commit 9bbc0c6

Merge pull request #545 from axif0/all_in_one

Introduced QID search in translation.

andrewtavis authored Jan 11, 2025
2 parents 49df8d8 + 2fdd74b commit 9bbc0c6
Showing 8 changed files with 135 additions and 155 deletions.
7 changes: 7 additions & 0 deletions CONTRIBUTING.md
@@ -154,6 +154,13 @@ pre-commit install # install pre-commit hooks
 # pre-commit run --all-files # lint and fix common problems in the codebase
 ```

+> [!NOTE]
+> If you are having issues with pre-commit and want to send along your changes regardless, you can ignore the pre-commit hooks via the following:
+>
+> ```bash
+> git commit --no-verify -m "COMMIT_MESSAGE"
+> ```
+
 If you face any issues, consider reinstalling Scribe-data by running the following:

 ```bash
2 changes: 1 addition & 1 deletion src/scribe_data/cli/get.py
@@ -120,7 +120,7 @@ def prompt_user_download_all():
         parse_wd_lexeme_dump(
             language=language,
             wikidata_dump_type=["form"],
-            data_types=data_types,
+            data_types="all",
             type_output_dir=output_dir,
         )
     else:
35 changes: 1 addition & 34 deletions src/scribe_data/cli/total.py
@@ -24,7 +24,6 @@
 from typing import List, Union
 from urllib.error import HTTPError

-import requests
 from SPARQLWrapper import JSON

 from scribe_data.utils import (
@@ -34,6 +33,7 @@
     language_metadata,
     language_to_qid,
     list_all_languages,
+    check_qid_is_language,
 )
 from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump, sparql

@@ -124,39 +124,6 @@ def get_datatype_list(language):
     return data_type_metadata


-def check_qid_is_language(qid: str):
-    """
-    Parameters
-    ----------
-    qid : str
-        The QID to check Wikidata to see if it's a language and return its English label.
-
-    Outputs
-    -------
-    str
-        The English label of the Wikidata language entity.
-
-    Raises
-    ------
-    ValueError
-        An invalid QID that's not a language has been passed.
-    """
-    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
-    request_string = f"{api_endpoint}/entities/items/{qid}"
-
-    request = requests.get(request_string, timeout=5)
-    request_result = request.json()
-
-    if request_result["statements"]["P31"]:
-        instance_of_values = request_result["statements"]["P31"]
-        for val in instance_of_values:
-            if val["value"]["content"] == "Q34770":
-                print(f"{request_result['labels']['en']} ({qid}) is a language.\n")
-                return request_result["labels"]["en"]
-
-    raise ValueError("The passed Wikidata QID is not a language.")
-
-
 # MARK: Print
4 changes: 4 additions & 0 deletions src/scribe_data/resources/wikidata_qids_pids.json
@@ -0,0 +1,4 @@
+{
+  "instance_of": "P31",
+  "ietf_language_tag": "P305"
+}
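The new resource file gives the two Wikidata property IDs used for QID lookups a single home: P31 ("instance of") and P305 ("IETF language tag"). As a minimal sketch, the mapping can be loaded and read like this, assuming you run from the repository root (utils.py builds the same path relative to its own location, per the diff below):

```python
import json
from pathlib import Path

# Path assumes the repository layout above; adjust if running elsewhere.
resource = Path("src/scribe_data/resources/wikidata_qids_pids.json")

with resource.open("r", encoding="utf-8") as f:
    wikidata_qids_pids = json.load(f)

print(wikidata_qids_pids["instance_of"])  # P31
print(wikidata_qids_pids["ietf_language_tag"])  # P305
```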
80 changes: 80 additions & 0 deletions src/scribe_data/utils.py
@@ -32,6 +32,7 @@
 from typing import Any, Optional

 import questionary
+import requests
 from rich import print as rprint

 # MARK: Utils Variables
@@ -54,6 +55,9 @@
 LEXEME_FORM_METADATA_FILE = (
     Path(__file__).parent / "resources" / "lexeme_form_metadata.json"
 )
+WIKIDATA_QIDS_PIDS_FILE = (
+    Path(__file__).parent / "resources" / "wikidata_qids_pids.json"
+)
 DATA_DIR = Path(DEFAULT_JSON_EXPORT_DIR)

 try:
@@ -78,6 +82,13 @@
 except (IOError, json.JSONDecodeError) as e:
     print(f"Error reading lexeme form metadata: {e}")

+try:
+    with WIKIDATA_QIDS_PIDS_FILE.open("r", encoding="utf-8") as file:
+        wikidata_qids_pids = json.load(file)
+
+except (IOError, json.JSONDecodeError) as e:
+    print(f"Error reading Wikidata QID/PID metadata: {e}")
+

 language_map = {}
 language_to_qid = {}
@@ -736,3 +747,72 @@ def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool:
         return choice == "Skip process"

     return False
+
+
+def check_qid_is_language(qid: str):
+    """
+    Check whether a Wikidata QID is a language and return its English label.
+
+    Parameters
+    ----------
+    qid : str
+        The Wikidata QID to check.
+
+    Outputs
+    -------
+    str
+        The English label of the Wikidata language entity.
+
+    Raises
+    ------
+    ValueError
+        An invalid QID that's not a language has been passed.
+    """
+    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
+    request_string = f"{api_endpoint}/entities/items/{qid}"
+
+    request = requests.get(request_string, timeout=5)
+    request_result = request.json()
+
+    if request_result["statements"][wikidata_qids_pids["instance_of"]]:
+        instance_of_values = request_result["statements"][
+            wikidata_qids_pids["instance_of"]
+        ]
+        for val in instance_of_values:
+            if val["value"]["content"] == "Q34770":
+                print(f"{request_result['labels']['en']} ({qid}) is a language.\n")
+                return request_result["labels"]["en"]
+
+    raise ValueError("The passed Wikidata QID is not a language.")
+
+
+def get_language_iso_code(qid: str):
+    """
+    Get the ISO code of a language given its Wikidata QID.
+
+    Parameters
+    ----------
+    qid : str
+        The Wikidata QID of the language.
+
+    Outputs
+    -------
+    str
+        The ISO code of the language.
+
+    Raises
+    ------
+    ValueError
+        An invalid QID that's not a language has been passed.
+    KeyError
+        The ISO code for the language is not available.
+    """
+    api_endpoint = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={qid}&props=claims&format=json"
+    response = requests.get(api_endpoint, timeout=5)
+    data = response.json()
+    try:
+        return data["entities"][qid]["claims"][wikidata_qids_pids["ietf_language_tag"]][
+            0
+        ]["mainsnak"]["datavalue"]["value"]
+
+    except ValueError:
+        raise ValueError("The passed Wikidata QID is not a language.")
+    except KeyError:
+        raise KeyError("The ISO code for the language is not available.")
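For context, a hedged sketch of how the two new helpers can be exercised. Q1860 (English) is used as the example QID; both calls hit the live Wikidata APIs, and the success path assumes the entity's P31 statements include Q34770 ("language"):

```python
from scribe_data.utils import check_qid_is_language, get_language_iso_code

# Prints "English (Q1860) is a language." and returns the English label.
label = check_qid_is_language("Q1860")

# Reads the P305 (IETF language tag) claim, e.g. "en" for Q1860.
iso = get_language_iso_code("Q1860")

# A QID whose P31 values don't include Q34770 raises ValueError, e.g.:
# check_qid_is_language("Q5")  # Q5 ("human") is not a language
```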
138 changes: 24 additions & 114 deletions src/scribe_data/wiktionary/parse_dump.py
@@ -33,6 +33,8 @@
     check_index_exists,
     data_type_metadata,
     language_metadata,
+    get_language_iso_code,
+    check_qid_is_language,
 )
 from tqdm import tqdm

@@ -81,7 +83,6 @@ def __init__(

         # Build map from ISO to full language name.
         self.iso_to_name = self._build_iso_mapping()
-
         # For "total" usage.
         self.lexical_category_counts = defaultdict(Counter)
         self.translation_counts = defaultdict(Counter)
Expand All @@ -101,120 +102,18 @@ def _build_iso_mapping(self) -> dict:
if iso_code := data.get("iso"):
iso_mapping[iso_code] = lang_name

return iso_mapping

# MARK: process total
def _process_lexeme_total(self, lexeme: dict) -> None:
"""
Gather stats if 'total' is in parse_type: how many entries per language & category,
how many translations, etc.
"""
lexicalCategory = lexeme.get("lexicalCategory")
if not lexicalCategory or lexicalCategory not in data_type_metadata.values():
return

category_name = self._category_lookup.get(lexicalCategory)
if not category_name:
return

# Update counters.
lemmas = lexeme.get("lemmas", {})
for lemma in lemmas.values():
lang = lemma.get("language")

if lang in self.iso_to_name:
self.lexical_category_counts[lang][category_name] += 1
translation_count = sum(
len(sense.get("glosses", {})) for sense in lexeme.get("senses", [])
)
self.translation_counts[lang][category_name] += translation_count

break

# MARK: process translations
def _process_lexeme_translations(self, lexeme: dict) -> None:
"""
Process gloss-based translations if 'translations' is in parse_type.
Store them in self.translations_index.
"""
lemmas = lexeme.get("lemmas", {})
qid = lexeme.get("lexicalCategory")

if not (lemmas and qid):
return

category_name = self._category_lookup.get(qid)
if not category_name:
return

# Only store first valid lemma for translations.
for lang_code, lemma_data in lemmas.items():
if lang_code not in self.iso_to_name:
continue

word = lemma_data.get("value", "").lower()
if not word:
continue

# Build translations from sense glosses.
translations = {}
for sense in lexeme.get("senses", []):
for sense_lang_code, gloss in sense.get("glosses", {}).items():
if sense_lang_code in self.iso_to_name:
translations[sense_lang_code] = gloss["value"]
for language in self.target_iso:
if (
language.lower().startswith("q")
and language[1:].isdigit()
):
qid_to_lang = check_qid_is_language(language)
if qid_to_lang:
iso_code = get_language_iso_code(language.upper())
iso_mapping[iso_code] = qid_to_lang
print(f"ISO code for {language} is {iso_code}")

if translations:
self.translations_index[word][lang_code][category_name] = translations

break # only handle the first lemma

# MARK: process forms
def _process_lexeme_forms(self, lexeme: dict) -> None:
"""
Process forms for categories in self.data_types if 'form' is in parse_type.
Store them in self.forms_index.
"""
lemmas = lexeme.get("lemmas", {})
lexical_category = lexeme.get("lexicalCategory")

# Skip if category missing or not recognized.
if not lexical_category or lexical_category not in data_type_metadata.values():
return

# Convert Q1084 -> "nouns", etc.
category_name = self._category_lookup.get(lexical_category)
if not category_name:
return

# If the category_name is NOT in our data_types list, skip
# e.g., category_name = "nouns", but user didn't request "nouns" in data_types.
if category_name not in self.data_types:
return

# Process forms.
for lang_code, lemma_data in lemmas.items():
if lang_code not in self.iso_to_name:
continue

word = lemma_data.get("value", "").lower()
if not word:
continue

forms_data = defaultdict(list)
for form in lexeme.get("forms", []):
representations = form.get("representations", {})
grammatical_features = form.get("grammaticalFeatures", [])

for rep_lang, rep_data in representations.items():
if rep_lang == lang_code:
if form_value := rep_data.get("value"):
forms_data[form_value].extend(grammatical_features)

if forms_data:
self.forms_index[word][lang_code][category_name] = dict(forms_data)
self.forms_counts[lang_code][category_name] += len(forms_data)

break # only first valid lemma
return iso_mapping

# MARK: process lines
def process_lines(self, line: str) -> None:
@@ -385,6 +284,12 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> None:
                 for word, lang_data in self.translations_index.items()
                 if language_iso in lang_data
             }
+
+            # Check if filtered data is empty before saving.
+            if not filtered:
+                print(f"No translations found for {language_iso}, skipping export...")
+                return
+
             self._save_by_language(filtered, filepath, language_iso, "translations")

     # MARK: export forms
@@ -418,6 +323,11 @@ def export_forms_json(
                 else:
                     filtered[word] = {language_iso: lang_data[language_iso]}

+            # Check if filtered data is empty before saving.
+            if not filtered:
+                print(f"No forms found for {language_iso}, skipping export...")
+                return
+
             self._save_by_language(
                 filtered, filepath, language_iso, data_type or "forms"
             )
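The net effect in `_build_iso_mapping` is that a target language may now be given as a Wikidata QID rather than an ISO code: anything shaped like `Q<digits>` is validated with `check_qid_is_language` and resolved to an ISO code with `get_language_iso_code`. A standalone sketch of that detection step (the helper name is illustrative, not part of the codebase):

```python
def looks_like_qid(value: str) -> bool:
    # Mirrors the check in _build_iso_mapping: "Q"/"q" followed by digits only.
    return value.lower().startswith("q") and value[1:].isdigit()

assert looks_like_qid("Q1860")      # a QID target -> resolved via the Wikidata API
assert not looks_like_qid("en")     # an ISO code target -> used as before
assert not looks_like_qid("Q18a0")  # malformed -> not treated as a QID
```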
2 changes: 1 addition & 1 deletion tests/cli/test_get.py
@@ -81,7 +81,7 @@ def test_get_all_data_types_for_language_user_says_yes(
         mock_parse.assert_called_once_with(
             language="English",
             wikidata_dump_type=["form"],
-            data_types=None,  # because data_types = [data_type] if provided else None
+            data_types="all",  # because if only language given, data_types is None
             type_output_dir="scribe_data_json_export",  # default for JSON
         )
         mock_query_data.assert_not_called()