Skip to content

Commit

Permalink
continue work on file parsing in #2
Browse files Browse the repository at this point in the history
  • Loading branch information
karacolada committed Feb 5, 2024
1 parent d13ee50 commit 0e5f050
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 16 deletions.
57 changes: 57 additions & 0 deletions fuji_server/data/software_file.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"Jenkinsfile": {
"category": [
"automation"
],
"parse": "full",
"pattern": [
"**/Jenkinsfile"
]
},
"README": {
"category": [
"documentation"
],
"parse": "full",
"pattern": [
"**/README*"
]
},
"docs_directory": {
"category": [
"documentation"
],
"parse": "file_name",
"pattern": [
"**/docs/"
]
},
"github_actions": {
"category": [
"automation"
],
"parse": "full",
"pattern": [
".github/workflows/"
]
},
"mvn_pom": {
"category": [
"documentation",
"automation"
],
"parse": "full",
"pattern": [
"pom.xml"
]
},
"python_code": {
"category": [
"development"
],
"parse": "file_name",
"pattern": [
"*.py"
]
}
}
96 changes: 85 additions & 11 deletions fuji_server/evaluators/fair_evaluator_requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@ def __init__(self, fuji_instance):
}

def nestedDataContainsKeyword(self, data, key):
"""Recursively check whether text data in nested structures (such as list and dict) contains a keyword.
Args:
data (list | dict): nested structure containing text data
key (str): keyword to look for
Raises:
TypeError: argument data must be one of list or dict
Returns:
bool: True if key found somewhere in nested structure.
"""
values = None
if type(data) == list:
values = data
Expand All @@ -54,6 +66,31 @@ def nestedDataContainsKeyword(self, data, key):
self.logger.warning(f"{self.metric_identifier}: scan of nested data failed ({e.message}).")
return False

def scanForKeywords(self, keywords, locations):
    """Scan GitHub harvesting results for keywords.

    Args:
        keywords (list<str>): list of keywords to look for
        locations (list<str>): list of locations to scan, used as keys for GitHub harvesting results

    Returns:
        dict<str, bool>: dictionary with keywords as keys and a boolean as value indicating
            whether the keyword was found in some location.
    """
    hit_dict = {k: False for k in keywords}
    # check each location (if available) for keywords
    for location in locations:
        # Hoisted out of the keyword loop: the content is the same for every keyword.
        content = self.fuji.github_data.get(location)
        if content is None:
            continue
        for k in keywords:
            # BUG FIX: the original aliased `keys_to_check = keywords` and called
            # `keys_to_check.remove(k)` inside the loop, which (a) mutated the
            # caller's list and (b) skipped the element after each removal because
            # the list was modified while being iterated. We instead skip keywords
            # that were already found, which also prevents a later location from
            # overwriting a previous True with False in the nested-data branch.
            if hit_dict[k]:
                continue  # already found in an earlier location; stop looking
            if isinstance(content, str):
                if k in content.lower():
                    hit_dict[k] = True  # found keyword in location
            elif self.nestedDataContainsKeyword(content, k):
                hit_dict[k] = True
    return hit_dict

def testBuildInstructions(self):
"""The software has build, installation and/or execution instructions.
Expand All @@ -76,17 +113,7 @@ def testBuildInstructions(self):
self.logger.info(
f"{self.metric_identifier} : Looking for {required_modality} keywords {required_keywords} in {required_locations}."
)
hit_dict = {k: False for k in required_keywords}
# check each location (if available) for keywords
for location in required_locations:
for k in hit_dict.keys():
content = self.fuji.github_data.get(location)
if content is not None:
if type(content) == str:
if k in content.lower():
hit_dict[k] = True # found keyword in location
else:
hit_dict[k] = self.nestedDataContainsKeyword(content, k)
hit_dict = self.scanForKeywords(required_keywords, required_locations)
found_instructions = False
if required_modality == "all":
found_instructions = all(hit_dict.values())
Expand All @@ -102,6 +129,10 @@ def testBuildInstructions(self):
self.maturity = self.getTestConfigMaturity(test_id)
self.setEvaluationCriteriumScore(test_id, test_score, "pass")
self.score.earned += test_score
else: # does not pass
self.logger.warning(
f"{self.metric_identifier} : Did not find {required_modality} keywords {required_keywords} in {required_locations}."
)
return test_status

def testDependencies(self):
Expand All @@ -119,6 +150,42 @@ def testDependencies(self):
break
if test_defined:
self.logger.warning(f"{self.metric_identifier} : Test for dependencies is not implemented.")
test_score = self.getTestConfigScore(test_id)
# Check for presence of machine-readable dependency files
dependency_requirements = self.metric_tests[test_id].metric_test_requirements[0]
assert (
dependency_requirements["modality"] == "any"
), f"Found requirement modality {dependency_requirements['modality']}, please choose 'any' instead. Any other modality is too strict for this test layout."
required_dependency_files = dependency_requirements["required"]["dependency_file"]
self.logger.info(f"{self.metric_identifier} : Checking presence of any of {required_dependency_files}.")
dependency_present = not set(self.fuji.github_data.keys()).isdisjoint(required_dependency_files)
# Check for automated building and installation
automation_requirements = self.metric_tests[test_id].metric_test_requirements[1]
required_automation_locations = automation_requirements["required"]["automation_file"]
required_automation_keywords = automation_requirements["required"]["keywords"]
self.logger.warning(
f"{self.metric_identifier} : Looking for {automation_requirements['modality']} keywords {required_automation_keywords} in {required_automation_locations}."
)
automation_hit_dict = self.scanForKeywords(required_automation_keywords, required_automation_locations)
found_automation = False
if automation_requirements["modality"] == "all":
found_automation = all(automation_hit_dict.values())
elif automation_requirements["modality"] == "any":
found_automation = any(automation_hit_dict.values())
else:
self.logger.warning(
f"{self.metric_identifier} : Unknown modality {automation_requirements['modality']} in test requirements. Choose 'all' or 'any'."
)
if dependency_present and found_automation: # pass
test_status = True
self.logger.log(self.fuji.LOG_SUCCESS, f"{self.metric_identifier} : Found required keywords.")
self.maturity = self.getTestConfigMaturity(test_id)
self.setEvaluationCriteriumScore(test_id, test_score, "pass")
self.score.earned += test_score
else: # fail
self.logger.warning(
f"{self.metric_identifier} : Did not find {automation_requirements['modality']} keywords {required_automation_keywords} in {required_automation_locations}."
)
return test_status

def testDependenciesBuildAutomatedChecks(self):
Expand All @@ -139,6 +206,13 @@ def testDependenciesBuildAutomatedChecks(self):
self.logger.warning(
f"{self.metric_identifier} : Test for dependency information, build instructions and automated checks is not implemented."
)
test_score = self.getTestConfigScore(test_id)
test_requirements = self.metric_tests[test_id].metric_test_requirements[0]
# dependency info and build instruction in README
first_half = self.scanForKeywords(["dependency", "dependencies", "build"], ["README"])
# linting and other relevant checks present in automated build and test process
# TODO
print((test_score, test_requirements, first_half)) # fix linting error for now
return test_status

def testBadgeIncluded(self):
Expand Down
26 changes: 22 additions & 4 deletions fuji_server/harvester/github_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: MIT

import json
import os
from configparser import ConfigParser

Expand Down Expand Up @@ -32,6 +33,11 @@ def __init__(self, id, logger, host="https://github.com"):
self.handle = Github(auth=auth)
self.logger = logger
self.data = {} # dictionary with all info
fuji_server_dir = os.path.dirname(os.path.dirname(__file__)) # project_root
software_file_path = os.path.join(fuji_server_dir, "data", "software_file.json")
with open(software_file_path) as f:
self.files_map = json.load(f)
self.files_parse_fully = {k: v for (k, v) in self.files_map.items() if v["parse"] == "full"}

def harvest(self):
# check if it's a URL or repo ID
Expand Down Expand Up @@ -93,20 +99,32 @@ def harvest(self):

# TODO: parse README (full), wiki (page names?), docs (file names)
# NOTE: cannot retrieve wiki through API
self.data["readme"] = repo.get_readme().decoded_content
self.data["README"] = repo.get_readme().decoded_content
# see if there's a folder named docs/
try:
docs_folder = repo.get_contents("docs")
self.data["docs"] = []
self.data["docs_directory"] = []
# get docs/ content recursively
docs_folder = repo.get_contents("docs")
while docs_folder:
doc_file = docs_folder.pop(0)
if doc_file.type == "dir":
docs_folder.extend(repo.get_contents(doc_file.path))
else:
self.data["docs"].append({"name": doc_file.name})
self.data["docs_directory"].append({"name": doc_file.name})
except UnknownObjectException:
pass

# TODO: consider merging parts of the GitHub data with metadata?

def retrieve_all(self, repo):
    """Recursively list every file in the repository and record its name in self.data["contents"].

    Directories returned by the GitHub API are expanded in place; only regular
    files are recorded.

    Args:
        repo: PyGithub repository object whose full content tree is traversed.
    """
    self.data["contents"] = []
    repo_contents = repo.get_contents("")
    while repo_contents:
        entry = repo_contents.pop(0)
        if entry.type == "dir":
            repo_contents.extend(repo.get_contents(entry.path))
        else:
            # TODO: construct a regex string out of ors
            # (https://stackoverflow.com/questions/3040716/python-elegant-way-to-check-if-at-least-one-regex-in-list-matches-a-string)
            # and use named groups to return the dictionary key with m.groupdict()
            # BUG FIX: the original appended to self.data["docs_directory"]
            # (copy-pasted from the docs/ loop in harvest()), leaving the
            # "contents" list initialized above permanently empty and risking a
            # KeyError when "docs_directory" was never created.
            self.data["contents"].append({"name": entry.name})
17 changes: 16 additions & 1 deletion fuji_server/yaml/metrics_v0.7_software.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ metrics:
required:
location:
- README
- docs
- docs_directory
- wiki
keywords:
- build
Expand All @@ -350,6 +350,21 @@ metrics:
metric_test_name: Dependencies are provided in a machine-readable format and the building and installation of the software is automated.
metric_test_score: 1
metric_test_maturity: 2
metric_test_requirements:
- target: https://f-uji.net/vocab/metadata/standards
modality: any
required:
dependency_file:
- requirements.txt
- target: https://f-uji.net/vocab/metadata/standards
modality: all
required:
automation_file:
- Jenkinsfile
- github_actions
keywords:
- build
- install
created_by: FAIR4RS
date_created: 2024-01-18
date_updated: 2024-01-18
Expand Down

0 comments on commit 0e5f050

Please sign in to comment.