Skip to content

Commit

Permalink
Add project metadata workflow and minor edits to script
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Oct 24, 2024
1 parent 0fc2200 commit 464a8ee
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 54 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/check_project_metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# CI workflow that runs the project metadata check script and fails the job if the script exits non-zero.
name: Check Project Metadata
# Triggered by pushes to main and by pull requests targeting main (opened, reopened or updated).
on:
push:
branches: [main]
pull_request:
branches: [main]
types: [opened, reopened, synchronize]

jobs:
# NOTE(review): the job id says "structure-check" although this workflow runs the metadata check — presumably carried over from the project structure workflow; confirm before renaming.
structure-check:
strategy:
# Do not cancel the other matrix entries when one of them fails.
fail-fast: false
matrix:
os:
- ubuntu-latest
python-version:
- "3.9"

runs-on: ${{ matrix.os }}

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

# Writes PYTHONPATH=<workspace>/src to GITHUB_ENV so that the following steps see it.
- name: Add project root to PYTHONPATH
run: echo "PYTHONPATH=$(pwd)/src" >> $GITHUB_ENV

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Run check_project_metadata.py
working-directory: ./src/scribe_data/check
run: python check_project_metadata.py

# Runs only when a previous step in this job has failed.
- name: Post-run status
if: failure()
run: echo "Project metadata check failed. Please fix the reported errors."
Original file line number Diff line number Diff line change
@@ -1,13 +1,38 @@
"""
Check the Scribe-Data metadata files to make sure that all information is included.
Example
-------
python3 src/scribe_data/check/check_project_metadata.py
.. raw:: html
<!--
* Copyright (C) 2024 Scribe
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
-->
"""

import difflib
import sys

from scribe_data.cli.cli_utils import (
from scribe_data.utils import (
LANGUAGE_DATA_EXTRACTION_DIR,
_languages,
data_type_metadata,
)

from scribe_data.utils import _languages

all_data_types = tuple(data_type_metadata.keys())


Expand All @@ -23,34 +48,32 @@ def get_available_languages() -> dict[str, list[str]]:
available_languages = {}

for lang_folder in extraction_dir.iterdir():
if lang_folder.is_dir(): # Check if it's a directory
if lang_folder.is_dir(): # check if it's a directory
lang_name = (
lang_folder.name.lower()
) # Normalize keys to lowercase for case-insensitive comparison
) # normalize keys to lowercase for case-insensitive comparison
sub_languages = []

# Check if lang_folder contains subdirectories
# Check if lang_folder contains subdirectories.
for sub_folder in lang_folder.iterdir():
if sub_folder.is_dir():
sub_lang_name = (
sub_folder.name.lower()
) # Normalize to lowercase for case-insensitive comparison
) # normalize to lowercase for case-insensitive comparison.

# Check for almost similar keys using difflib
# Check for almost similar keys using difflib.
close_matches = difflib.get_close_matches( # verb, noun, etc.
sub_lang_name, all_data_types, n=1, cutoff=0.8
)

# Append sub-language name if no close match found (not a data type).
if not close_matches:
sub_languages.append(
sub_lang_name
) # Append sub-language name if no close match found (not a data type)
sub_languages.append(sub_lang_name)

if (
sub_languages
): # If we found sub-languages, add them to available_languages
# If we found sub-languages, add them to available_languages.
if sub_languages:
available_languages[lang_name] = {"sub_languages": sub_languages}
else: # No sub-languages found, initialize entry without them
else:
available_languages[lang_name] = {}

return available_languages
Expand All @@ -65,31 +88,32 @@ def get_missing_languages(
Parameters
----------
reference_languages : dict
A dictionary of languages from the reference source.
target_languages : dict
A dictionary of languages from the target source to check for missing entries.
reference_languages : dict
A dictionary of languages from the reference source.
target_languages : dict
A dictionary of languages from the target source to check for missing entries.
Returns
-------
list[str]
A list of languages and sub-languages that are in target_languages but not in reference_languages.
list[str]
A list of languages and sub-languages that are in target_languages but not in reference_languages.
"""
missing_languages = []
reference_keys = reference_languages.keys()

for lang, details in target_languages.items():
# Check if the parent language exists
# Check if the parent language exists.
if lang not in reference_keys:
# If it's a parent language, check for sub-languages and append them
# If it's a parent language, check for sub-languages and append them.
if "sub_languages" in details:
for sub_lang in details["sub_languages"]:
missing_languages.append(f"{lang}/{sub_lang}")
else:
# Individual language, append directly
# Individual language, append directly.
missing_languages.append(lang)
else:
# If the parent exists, only check for missing sub-languages
# If the parent exists, only check for missing sub-languages.
ref_sub_languages = reference_languages[lang].get("sub_languages", {})

if "sub_languages" in details:
Expand All @@ -106,33 +130,34 @@ def validate_language_properties(languages_dict: dict) -> dict:
Parameters
----------
languages_dict (dict): A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key.
languages_dict : dict
A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key.
Returns
-------
dict: A dictionary with two lists:
- "missing_qids": Languages or sub-languages missing the 'qid' property.
- "missing_isos": Languages or sub-languages missing the 'iso' property.
- "missing_qids": Languages or sub-languages missing the 'qid' property.
- "missing_isos": Languages or sub-languages missing the 'iso' property.
Each entry in these lists is in the format "parent_language - sub_language" for sub-languages,
or simply "parent_language" for the parent languages.
Each entry in these lists is in the format "parent_language - sub_language" for sub-languages,
or simply "parent_language" for the parent languages.
"""
missing_qids = []
missing_isos = []

for lang, details in languages_dict.items():
# Check if the language has sub-languages
# Check if the language has sub-languages.
if "sub_languages" in details:
sub_languages = details["sub_languages"]

# Validate each sub-language
# Validate each sub-language.
for sub_lang, sub_details in sub_languages.items():
if "qid" not in sub_details:
missing_qids.append(f"{lang}/{sub_lang}")
if "iso" not in sub_details:
missing_isos.append(f"{lang}/{sub_lang}")
else:
# Validate the parent language itself
# Validate the parent language itself.
if "qid" not in details:
missing_qids.append(lang)
if "iso" not in details:
Expand All @@ -146,9 +171,10 @@ def check_language_metadata():
Validates language metadata by performing the following checks:
1. Ensures that all languages listed in `language_data_extraction` are present in `language_metadata.json`, and vice versa.
2. Checks if each language in `language_metadata.json` has the required properties:
- 'qid' (a unique identifier)
- 'iso' (ISO language code)
- 'qid' (a unique identifier)
- 'iso' (ISO language code)
This function helps identify missing languages or missing properties, ensuring data consistency across both sources.
Expand All @@ -160,52 +186,45 @@ def check_language_metadata():
languages_in_metadata = {key.lower(): value for key, value in _languages.items()}

languages_in_directory = get_available_languages()
missing_languages_metadata = get_missing_languages(
languages_in_metadata, languages_in_directory
)

missing_languages_extraction = get_missing_languages(
languages_in_directory, languages_in_metadata
)

languages_with_missing_properties = validate_language_properties(
languages_in_metadata
)

if (
missing_languages_metadata
or missing_languages_extraction
missing_languages_extraction
or languages_with_missing_properties["missing_qids"]
or languages_with_missing_properties["missing_isos"]
):
if missing_languages_metadata or missing_languages_extraction:
if missing_languages_extraction:
print(
"There are missing languages or inconsistencies between language_metadata.json and language_data_extraction.\n"
)

if missing_languages_metadata:
print("Languages missing from language_metadata.json:")
for lang in missing_languages_metadata:
print(f" • {lang.title()}")

if missing_languages_extraction:
print("\nLanguages missing from language_data_extraction:")
for lang in missing_languages_extraction:
print(f" {lang.title()}")
print(f" - {lang.title()}")

if languages_with_missing_properties["missing_qids"]:
print("\nLanguages missing the `qid` property:")
for lang in languages_with_missing_properties["missing_qids"]:
print(f" {lang.title()}")
print(f" - {lang.title()}")

if languages_with_missing_properties["missing_isos"]:
print("\nLanguages missing the `iso` property:")
for lang in languages_with_missing_properties["missing_isos"]:
print(f" {lang.title()}")
print(f" - {lang.title()}")

# Exit with a non-zero status code to indicate failure
# Exit with a non-zero status code to indicate failure.
sys.exit(1)

print(
"All languages match between language_metadata.json and language_data_extraction; languages in language_metadata.json have the correct properties."
"All languages in language_metadata.json are included in Scribe-Data.\nLanguages in language_metadata.json have the correct properties."
)


Expand Down
6 changes: 3 additions & 3 deletions src/scribe_data/check/check_project_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def check_data_type_folders(
)


def validate_project_structure():
def check_project_structure():
"""
Validate that all directories follow the expected project structure and check for unexpected files and directories.
Also validate SPARQL query file names in data_type folders and SUBDIRECTORIES.
Expand Down Expand Up @@ -175,7 +175,7 @@ def validate_project_structure():
continue

if language not in LANGUAGES:
errors.append(f"Unexpected language directory: {language}")
errors.append(f"Unexpected language directory given: {language}")
continue

# Check for unexpected files in language directory.
Expand Down Expand Up @@ -249,4 +249,4 @@ def validate_project_structure():


if __name__ == "__main__":
validate_project_structure()
check_project_structure()

0 comments on commit 464a8ee

Please sign in to comment.