Add project metadata workflow and minor edits to script

scribe-org · Oct 24, 2024 · 464a8ee · 464a8ee
1 parent 0fc2200
commit 464a8ee
Show file tree

Hide file tree

Showing 3 changed files with 117 additions and 54 deletions.
diff --git a/.github/workflows/check_project_metadata.yaml b/.github/workflows/check_project_metadata.yaml
@@ -0,0 +1,44 @@
+name: Check Project Metadata
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+    types: [opened, reopened, synchronize]
+
+jobs:
+  structure-check:
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-latest
+        python-version:
+          - "3.9"
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Add project root to PYTHONPATH
+        run: echo "PYTHONPATH=$(pwd)/src" >> $GITHUB_ENV
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run check_project_metadata.py
+        working-directory: ./src/scribe_data/check
+        run: python check_project_metadata.py
+
+      - name: Post-run status
+        if: failure()
+        run: echo "Project metadata check failed. Please fix the reported errors."
diff --git a/...ibe_data/check/check_language_metadata.py → ...ribe_data/check/check_project_metadata.py b/...ibe_data/check/check_language_metadata.py → ...ribe_data/check/check_project_metadata.py
@@ -1,13 +1,38 @@
+"""
+Check the Scribe-Data metadata files to make sure that all information is included.
+
+Example
+-------
+    python3 src/scribe_data/check/check_project_metadata.py
+
+.. raw:: html
+    <!--
+    * Copyright (C) 2024 Scribe
+    *
+    * This program is free software: you can redistribute it and/or modify
+    * it under the terms of the GNU General Public License as published by
+    * the Free Software Foundation, either version 3 of the License, or
+    * (at your option) any later version.
+    *
+    * This program is distributed in the hope that it will be useful,
+    * but WITHOUT ANY WARRANTY; without even the implied warranty of
+    * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    * GNU General Public License for more details.
+    *
+    * You should have received a copy of the GNU General Public License
+    * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+    -->
+"""
+
 import difflib
 import sys
 
-from scribe_data.cli.cli_utils import (
+from scribe_data.utils import (
     LANGUAGE_DATA_EXTRACTION_DIR,
+    _languages,
     data_type_metadata,
 )
 
-from scribe_data.utils import _languages
-
 all_data_types = tuple(data_type_metadata.keys())
 
 
@@ -23,34 +48,32 @@ def get_available_languages() -> dict[str, list[str]]:
     available_languages = {}
 
     for lang_folder in extraction_dir.iterdir():
-        if lang_folder.is_dir():  # Check if it's a directory
+        if lang_folder.is_dir():  # check if it's a directory
             lang_name = (
                 lang_folder.name.lower()
-            )  # Normalize keys to lowercase for case-insensitive comparison
+            )  # normalize keys to lowercase for case-insensitive comparison
             sub_languages = []
 
-            # Check if lang_folder contains subdirectories
+            # Check if lang_folder contains subdirectories.
             for sub_folder in lang_folder.iterdir():
                 if sub_folder.is_dir():
                     sub_lang_name = (
                         sub_folder.name.lower()
-                    )  # Normalize to lowercase for case-insensitive comparison
+                    )  # normalize to lowercase for case-insensitive comparison
 
-                    # Check for almost similar keys using difflib
+                    # Check for almost similar keys using difflib.
                     close_matches = difflib.get_close_matches(  # verb, noun, etc.
                         sub_lang_name, all_data_types, n=1, cutoff=0.8
                     )
 
+                    # Append sub-language name if no close match found (not a data type).
                     if not close_matches:
-                        sub_languages.append(
-                            sub_lang_name
-                        )  # Append sub-language name if no close match found (not a data type)
+                        sub_languages.append(sub_lang_name)
 
-            if (
-                sub_languages
-            ):  # If we found sub-languages, add them to available_languages
+            # If we found sub-languages, add them to available_languages.
+            if sub_languages:
                 available_languages[lang_name] = {"sub_languages": sub_languages}
-            else:  # No sub-languages found, initialize entry without them
+            else:
                 available_languages[lang_name] = {}
 
     return available_languages
@@ -65,31 +88,32 @@ def get_missing_languages(
 
     Parameters
     ----------
-    reference_languages : dict
-        A dictionary of languages from the reference source.
-    target_languages : dict
-        A dictionary of languages from the target source to check for missing entries.
+        reference_languages : dict
+            A dictionary of languages from the reference source.
+
+        target_languages : dict
+            A dictionary of languages from the target source to check for missing entries.
 
     Returns
     -------
-    list[str]
-        A list of languages and sub-languages that are in target_languages but not in reference_languages.
+        list[str]
+            A list of languages and sub-languages that are in target_languages but not in reference_languages.
     """
     missing_languages = []
     reference_keys = reference_languages.keys()
 
     for lang, details in target_languages.items():
-        # Check if the parent language exists
+        # Check if the parent language exists.
         if lang not in reference_keys:
-            # If it's a parent language, check for sub-languages and append them
+            # If it's a parent language, check for sub-languages and append them.
             if "sub_languages" in details:
                 for sub_lang in details["sub_languages"]:
                     missing_languages.append(f"{lang}/{sub_lang}")
             else:
-                # Individual language, append directly
+                # Individual language, append directly.
                 missing_languages.append(lang)
         else:
-            # If the parent exists, only check for missing sub-languages
+            # If the parent exists, only check for missing sub-languages.
             ref_sub_languages = reference_languages[lang].get("sub_languages", {})
 
             if "sub_languages" in details:
@@ -106,33 +130,34 @@ def validate_language_properties(languages_dict: dict) -> dict:
 
     Parameters
     ----------
-        languages_dict (dict): A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key.
+        languages_dict : dict
+            A dictionary where each key is a language, and the value is another dictionary containing details about the language. If the language has sub-languages, they are stored under the 'sub_languages' key.
 
     Returns
     -------
         dict: A dictionary with two lists:
-              - "missing_qids": Languages or sub-languages missing the 'qid' property.
-              - "missing_isos": Languages or sub-languages missing the 'iso' property.
+            - "missing_qids": Languages or sub-languages missing the 'qid' property.
+            - "missing_isos": Languages or sub-languages missing the 'iso' property.
 
-              Each entry in these lists is in the format "parent_language - sub_language" for sub-languages,
-              or simply "parent_language" for the parent languages.
+            Each entry in these lists is in the format "parent_language - sub_language" for sub-languages,
+            or simply "parent_language" for the parent languages.
     """
     missing_qids = []
     missing_isos = []
 
     for lang, details in languages_dict.items():
-        # Check if the language has sub-languages
+        # Check if the language has sub-languages.
         if "sub_languages" in details:
             sub_languages = details["sub_languages"]
 
-            # Validate each sub-language
+            # Validate each sub-language.
             for sub_lang, sub_details in sub_languages.items():
                 if "qid" not in sub_details:
                     missing_qids.append(f"{lang}/{sub_lang}")
                 if "iso" not in sub_details:
                     missing_isos.append(f"{lang}/{sub_lang}")
         else:
-            # Validate the parent language itself
+            # Validate the parent language itself.
             if "qid" not in details:
                 missing_qids.append(lang)
             if "iso" not in details:
@@ -146,9 +171,10 @@ def check_language_metadata():
     Validates language metadata by performing the following checks:
 
     1. Ensures that all languages listed in `language_data_extraction` are present in `language_metadata.json`, and vice versa.
+
     2. Checks if each language in `language_metadata.json` has the required properties:
-       - 'qid' (a unique identifier)
-       - 'iso' (ISO language code)
+        - 'qid' (a unique identifier)
+        - 'iso' (ISO language code)
 
     This function helps identify missing languages or missing properties, ensuring data consistency across both sources.
 
@@ -160,52 +186,45 @@ def check_language_metadata():
     languages_in_metadata = {key.lower(): value for key, value in _languages.items()}
 
     languages_in_directory = get_available_languages()
-    missing_languages_metadata = get_missing_languages(
-        languages_in_metadata, languages_in_directory
-    )
+
     missing_languages_extraction = get_missing_languages(
         languages_in_directory, languages_in_metadata
     )
+
     languages_with_missing_properties = validate_language_properties(
         languages_in_metadata
     )
 
     if (
-        missing_languages_metadata
-        or missing_languages_extraction
+        missing_languages_extraction
         or languages_with_missing_properties["missing_qids"]
         or languages_with_missing_properties["missing_isos"]
     ):
-        if missing_languages_metadata or missing_languages_extraction:
+        if missing_languages_extraction:
             print(
                 "There are missing languages or inconsistencies between language_metadata.json and language_data_extraction.\n"
             )
 
-        if missing_languages_metadata:
-            print("Languages missing from language_metadata.json:")
-            for lang in missing_languages_metadata:
-                print(f"  • {lang.title()}")
-
         if missing_languages_extraction:
             print("\nLanguages missing from language_data_extraction:")
             for lang in missing_languages_extraction:
-                print(f"  • {lang.title()}")
+                print(f"  - {lang.title()}")
 
         if languages_with_missing_properties["missing_qids"]:
             print("\nLanguages missing the `qid` property:")
             for lang in languages_with_missing_properties["missing_qids"]:
-                print(f"  • {lang.title()}")
+                print(f"  - {lang.title()}")
 
         if languages_with_missing_properties["missing_isos"]:
             print("\nLanguages missing the `iso` property:")
             for lang in languages_with_missing_properties["missing_isos"]:
-                print(f"  • {lang.title()}")
+                print(f"  - {lang.title()}")
 
-        # Exit with a non-zero status code to indicate failure
+        # Exit with a non-zero status code to indicate failure.
         sys.exit(1)
 
     print(
-        "All languages match between language_metadata.json and language_data_extraction; languages in language_metadata.json have the correct properties."
+        "All languages in language_metadata.json are included in Scribe-Data.\nLanguages in language_metadata.json have the correct properties."
     )
 
 

diff --git a/src/scribe_data/check/check_project_structure.py b/src/scribe_data/check/check_project_structure.py
@@ -146,7 +146,7 @@ def check_data_type_folders(
                     )
 
 
-def validate_project_structure():
+def check_project_structure():
     """
     Validate that all directories follow the expected project structure and check for unexpected files and directories.
     Also validate SPARQL query file names in data_type folders and SUBDIRECTORIES.
@@ -175,7 +175,7 @@ def validate_project_structure():
             continue
 
         if language not in LANGUAGES:
-            errors.append(f"Unexpected language directory: {language}")
+            errors.append(f"Unexpected language directory given: {language}")
             continue
 
         # Check for unexpected files in language directory.
@@ -249,4 +249,4 @@ def validate_project_structure():
 
 
 if __name__ == "__main__":
-    validate_project_structure()
+    check_project_structure()