Commit 9bbc0c6

Merge pull request #545 from axif0/all_in_one

Introduced QID search in translation.

andrewtavis authored Jan 11, 2025
2 parents 49df8d8 + 2fdd74b commit 9bbc0c6
Showing 8 changed files with 135 additions and 155 deletions.
7 changes: 7 additions & 0 deletions CONTRIBUTING.md
@@ -154,6 +154,13 @@ pre-commit install # install pre-commit hooks
 # pre-commit run --all-files # lint and fix common problems in the codebase
 ```

+> [!NOTE]
+> If you are having issues with pre-commit and want to send along your changes regardless, you can ignore the pre-commit hooks via the following:
+>
+> ```bash
+> git commit --no-verify -m "COMMIT_MESSAGE"
+> ```
+
 If you face any issues, consider reinstalling Scribe-data by running the following:

 ```bash
2 changes: 1 addition & 1 deletion src/scribe_data/cli/get.py
@@ -120,7 +120,7 @@ def prompt_user_download_all():
         parse_wd_lexeme_dump(
             language=language,
             wikidata_dump_type=["form"],
-            data_types=data_types,
+            data_types="all",
             type_output_dir=output_dir,
         )
     else:
35 changes: 1 addition & 34 deletions src/scribe_data/cli/total.py
@@ -24,7 +24,6 @@
 from typing import List, Union
 from urllib.error import HTTPError

-import requests
 from SPARQLWrapper import JSON

 from scribe_data.utils import (
@@ -34,6 +33,7 @@
     language_metadata,
     language_to_qid,
     list_all_languages,
+    check_qid_is_language,
 )
 from scribe_data.wikidata.wikidata_utils import parse_wd_lexeme_dump, sparql

@@ -124,39 +124,6 @@ def get_datatype_list(language):
     return data_type_metadata


-def check_qid_is_language(qid: str):
-    """
-    Parameters
-    ----------
-    qid : str
-        The QID to check Wikidata to see if it's a language and return its English label.
-
-    Outputs
-    -------
-    str
-        The English label of the Wikidata language entity.
-
-    Raises
-    ------
-    ValueError
-        An invalid QID that's not a language has been passed.
-    """
-    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
-    request_string = f"{api_endpoint}/entities/items/{qid}"
-
-    request = requests.get(request_string, timeout=5)
-    request_result = request.json()
-
-    if request_result["statements"]["P31"]:
-        instance_of_values = request_result["statements"]["P31"]
-        for val in instance_of_values:
-            if val["value"]["content"] == "Q34770":
-                print(f"{request_result['labels']['en']} ({qid}) is a language.\n")
-                return request_result["labels"]["en"]
-
-    raise ValueError("The passed Wikidata QID is not a language.")
-
-
 # MARK: Print
4 changes: 4 additions & 0 deletions src/scribe_data/resources/wikidata_qids_pids.json
@@ -0,0 +1,4 @@
+{
+  "instance_of": "P31",
+  "ietf_language_tag": "P305"
+}
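The new resource file gives the two Wikidata property IDs used for QID lookups a single home: P31 ("instance of") and P305 ("IETF language tag"). As a minimal sketch, the mapping can be loaded and read like this, assuming you run from the repository root (utils.py builds the same path relative to its own location, per the diff below):

```python
import json
from pathlib import Path

# Path assumes the repository layout above; adjust if running elsewhere.
resource = Path("src/scribe_data/resources/wikidata_qids_pids.json")

with resource.open("r", encoding="utf-8") as f:
    wikidata_qids_pids = json.load(f)

print(wikidata_qids_pids["instance_of"])  # P31
print(wikidata_qids_pids["ietf_language_tag"])  # P305
```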
80 changes: 80 additions & 0 deletions src/scribe_data/utils.py
@@ -32,6 +32,7 @@
 from typing import Any, Optional

 import questionary
+import requests
 from rich import print as rprint

 # MARK: Utils Variables
@@ -54,6 +55,9 @@
 LEXEME_FORM_METADATA_FILE = (
     Path(__file__).parent / "resources" / "lexeme_form_metadata.json"
 )
+WIKIDATA_QIDS_PIDS_FILE = (
+    Path(__file__).parent / "resources" / "wikidata_qids_pids.json"
+)
 DATA_DIR = Path(DEFAULT_JSON_EXPORT_DIR)

 try:
@@ -78,6 +82,13 @@
 except (IOError, json.JSONDecodeError) as e:
     print(f"Error reading lexeme form metadata: {e}")

+try:
+    with WIKIDATA_QIDS_PIDS_FILE.open("r", encoding="utf-8") as file:
+        wikidata_qids_pids = json.load(file)
+
+except (IOError, json.JSONDecodeError) as e:
+    print(f"Error reading Wikidata QID/PID metadata: {e}")
+

 language_map = {}
 language_to_qid = {}
@@ -736,3 +747,72 @@ def check_index_exists(index_path: Path, overwrite_all: bool = False) -> bool:
         return choice == "Skip process"

     return False
+
+
+def check_qid_is_language(qid: str):
+    """
+    Check whether a Wikidata QID is a language and return its English label.
+
+    Parameters
+    ----------
+    qid : str
+        The Wikidata QID to check.
+
+    Outputs
+    -------
+    str
+        The English label of the Wikidata language entity.
+
+    Raises
+    ------
+    ValueError
+        An invalid QID that's not a language has been passed.
+    """
+    api_endpoint = "https://www.wikidata.org/w/rest.php/wikibase/v0"
+    request_string = f"{api_endpoint}/entities/items/{qid}"
+
+    request = requests.get(request_string, timeout=5)
+    request_result = request.json()
+
+    if request_result["statements"][wikidata_qids_pids["instance_of"]]:
+        instance_of_values = request_result["statements"][
+            wikidata_qids_pids["instance_of"]
+        ]
+        for val in instance_of_values:
+            if val["value"]["content"] == "Q34770":
+                print(f"{request_result['labels']['en']} ({qid}) is a language.\n")
+                return request_result["labels"]["en"]
+
+    raise ValueError("The passed Wikidata QID is not a language.")
+
+
+def get_language_iso_code(qid: str):
+    """
+    Get the ISO code of a language given its Wikidata QID.
+
+    Parameters
+    ----------
+    qid : str
+        The Wikidata QID of the language.
+
+    Outputs
+    -------
+    str
+        The ISO code of the language.
+
+    Raises
+    ------
+    ValueError
+        An invalid QID that's not a language has been passed.
+    KeyError
+        The ISO code for the language is not available.
+    """
+    api_endpoint = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={qid}&props=claims&format=json"
+    response = requests.get(api_endpoint, timeout=5)
+    data = response.json()
+    try:
+        return data["entities"][qid]["claims"][wikidata_qids_pids["ietf_language_tag"]][
+            0
+        ]["mainsnak"]["datavalue"]["value"]
+
+    except ValueError:
+        raise ValueError("The passed Wikidata QID is not a language.")
+    except KeyError:
+        raise KeyError("The ISO code for the language is not available.")
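For context, a hedged sketch of how the two new helpers can be exercised. Q1860 (English) is used as the example QID; both calls hit the live Wikidata APIs, and the success path assumes the entity's P31 statements include Q34770 ("language"):

```python
from scribe_data.utils import check_qid_is_language, get_language_iso_code

# Prints "English (Q1860) is a language." and returns the English label.
label = check_qid_is_language("Q1860")

# Reads the P305 (IETF language tag) claim, e.g. "en" for Q1860.
iso = get_language_iso_code("Q1860")

# A QID whose P31 values don't include Q34770 raises ValueError, e.g.:
# check_qid_is_language("Q5")  # Q5 ("human") is not a language
```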
138 changes: 24 additions & 114 deletions src/scribe_data/wiktionary/parse_dump.py
@@ -33,6 +33,8 @@
     check_index_exists,
     data_type_metadata,
     language_metadata,
+    get_language_iso_code,
+    check_qid_is_language,
 )
 from tqdm import tqdm

@@ -81,7 +83,6 @@ def __init__(

         # Build map from ISO to full language name.
         self.iso_to_name = self._build_iso_mapping()
-
         # For "total" usage.
         self.lexical_category_counts = defaultdict(Counter)
         self.translation_counts = defaultdict(Counter)
Expand All @@ -101,120 +102,18 @@ def _build_iso_mapping(self) -> dict:
if iso_code := data.get("iso"):
iso_mapping[iso_code] = lang_name

return iso_mapping

# MARK: process total
def _process_lexeme_total(self, lexeme: dict) -> None:
"""
Gather stats if 'total' is in parse_type: how many entries per language & category,
how many translations, etc.
"""
lexicalCategory = lexeme.get("lexicalCategory")
if not lexicalCategory or lexicalCategory not in data_type_metadata.values():
return

category_name = self._category_lookup.get(lexicalCategory)
if not category_name:
return

# Update counters.
lemmas = lexeme.get("lemmas", {})
for lemma in lemmas.values():
lang = lemma.get("language")

if lang in self.iso_to_name:
self.lexical_category_counts[lang][category_name] += 1
translation_count = sum(
len(sense.get("glosses", {})) for sense in lexeme.get("senses", [])
)
self.translation_counts[lang][category_name] += translation_count

break

# MARK: process translations
def _process_lexeme_translations(self, lexeme: dict) -> None:
"""
Process gloss-based translations if 'translations' is in parse_type.
Store them in self.translations_index.
"""
lemmas = lexeme.get("lemmas", {})
qid = lexeme.get("lexicalCategory")

if not (lemmas and qid):
return

category_name = self._category_lookup.get(qid)
if not category_name:
return

# Only store first valid lemma for translations.
for lang_code, lemma_data in lemmas.items():
if lang_code not in self.iso_to_name:
continue

word = lemma_data.get("value", "").lower()
if not word:
continue

# Build translations from sense glosses.
translations = {}
for sense in lexeme.get("senses", []):
for sense_lang_code, gloss in sense.get("glosses", {}).items():
if sense_lang_code in self.iso_to_name:
translations[sense_lang_code] = gloss["value"]
for language in self.target_iso:
if (
language.lower().startswith("q")
and language[1:].isdigit()
):
qid_to_lang = check_qid_is_language(language)
if qid_to_lang:
iso_code = get_language_iso_code(language.upper())
iso_mapping[iso_code] = qid_to_lang
print(f"ISO code for {language} is {iso_code}")

if translations:
self.translations_index[word][lang_code][category_name] = translations

break # only handle the first lemma

# MARK: process forms
def _process_lexeme_forms(self, lexeme: dict) -> None:
"""
Process forms for categories in self.data_types if 'form' is in parse_type.
Store them in self.forms_index.
"""
lemmas = lexeme.get("lemmas", {})
lexical_category = lexeme.get("lexicalCategory")

# Skip if category missing or not recognized.
if not lexical_category or lexical_category not in data_type_metadata.values():
return

# Convert Q1084 -> "nouns", etc.
category_name = self._category_lookup.get(lexical_category)
if not category_name:
return

# If the category_name is NOT in our data_types list, skip
# e.g., category_name = "nouns", but user didn't request "nouns" in data_types.
if category_name not in self.data_types:
return

# Process forms.
for lang_code, lemma_data in lemmas.items():
if lang_code not in self.iso_to_name:
continue

word = lemma_data.get("value", "").lower()
if not word:
continue

forms_data = defaultdict(list)
for form in lexeme.get("forms", []):
representations = form.get("representations", {})
grammatical_features = form.get("grammaticalFeatures", [])

for rep_lang, rep_data in representations.items():
if rep_lang == lang_code:
if form_value := rep_data.get("value"):
forms_data[form_value].extend(grammatical_features)

if forms_data:
self.forms_index[word][lang_code][category_name] = dict(forms_data)
self.forms_counts[lang_code][category_name] += len(forms_data)

break # only first valid lemma
return iso_mapping

# MARK: process lines
def process_lines(self, line: str) -> None:
@@ -385,6 +284,12 @@ def export_translations_json(self, filepath: str, language_iso: str = None) -> None:
                 for word, lang_data in self.translations_index.items()
                 if language_iso in lang_data
             }
+
+            # Check if filtered data is empty before saving.
+            if not filtered:
+                print(f"No translations found for {language_iso}, skipping export...")
+                return
+
             self._save_by_language(filtered, filepath, language_iso, "translations")

     # MARK: export forms
@@ -418,6 +323,11 @@ def export_forms_json(
                 else:
                     filtered[word] = {language_iso: lang_data[language_iso]}

+            # Check if filtered data is empty before saving.
+            if not filtered:
+                print(f"No forms found for {language_iso}, skipping export...")
+                return
+
             self._save_by_language(
                 filtered, filepath, language_iso, data_type or "forms"
             )
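The net effect in `_build_iso_mapping` is that a target language may now be given as a Wikidata QID rather than an ISO code: anything shaped like `Q<digits>` is validated with `check_qid_is_language` and resolved to an ISO code with `get_language_iso_code`. A standalone sketch of that detection step (the helper name is illustrative, not part of the codebase):

```python
def looks_like_qid(value: str) -> bool:
    # Mirrors the check in _build_iso_mapping: "Q"/"q" followed by digits only.
    return value.lower().startswith("q") and value[1:].isdigit()

assert looks_like_qid("Q1860")      # a QID target -> resolved via the Wikidata API
assert not looks_like_qid("en")     # an ISO code target -> used as before
assert not looks_like_qid("Q18a0")  # malformed -> not treated as a QID
```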
2 changes: 1 addition & 1 deletion tests/cli/test_get.py
@@ -81,7 +81,7 @@ def test_get_all_data_types_for_language_user_says_yes(
         mock_parse.assert_called_once_with(
             language="English",
             wikidata_dump_type=["form"],
-            data_types=None,  # because data_types = [data_type] if provided else None
+            data_types="all",  # because if only language given, data_types is None
             type_output_dir="scribe_data_json_export",  # default for JSON
         )
         mock_query_data.assert_not_called()