Skip to content

Commit

Permalink
continue work on file parsing in #2
Browse files Browse the repository at this point in the history
  • Loading branch information
karacolada committed Feb 5, 2024
1 parent d13ee50 commit 0e5f050
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 16 deletions.
57 changes: 57 additions & 0 deletions fuji_server/data/software_file.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"Jenkinsfile": {
"category": [
"automation"
],
"parse": "full",
"pattern": [
"**/Jenkinsfile"
]
},
"README": {
"category": [
"documentation"
],
"parse": "full",
"pattern": [
"**/README*"
]
},
"docs_directory": {
"category": [
"documentation"
],
"parse": "file_name",
"pattern": [
"**/docs/"
]
},
"github_actions": {
"category": [
"automation"
],
"parse": "full",
"pattern": [
".github/workflows/"
]
},
"mvn_pom": {
"category": [
"documentation",
"automation"
],
"parse": "full",
"pattern": [
"pom.xml"
]
},
"python_code": {
"category": [
"development"
],
"parse": "file_name",
"pattern": [
"*.py"
]
}
}
96 changes: 85 additions & 11 deletions fuji_server/evaluators/fair_evaluator_requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@ def __init__(self, fuji_instance):
}

def nestedDataContainsKeyword(self, data, key):
"""Recursively check whether text data in nested structures (such as list and dict) contains a keyword.
Args:
data (list | dict): nested structure containing text data
key (str): keyword to look for
Raises:
TypeError: argument data must be one of list or dict
Returns:
bool: True if key found somewhere in nested structure.
"""
values = None
if type(data) == list:
values = data
Expand All @@ -54,6 +66,31 @@ def nestedDataContainsKeyword(self, data, key):
self.logger.warning(f"{self.metric_identifier}: scan of nested data failed ({e.message}).")
return False

def scanForKeywords(self, keywords, locations):
    """Scan GitHub harvesting results for keywords.

    Args:
        keywords (list<str>): list of keywords to look for
        locations (list<str>): list of locations to scan, used as keys for GitHub harvesting results

    Returns:
        dict<str, bool>: dictionary with keywords as keys and a boolean as value indicating
            whether the keyword was found in some location.
    """
    hit_dict = {k: False for k in keywords}
    # check each location (if available) for keywords
    for location in locations:
        # Hoisted out of the keyword loop: the content is the same for every keyword.
        content = self.fuji.github_data.get(location)
        if content is None:
            continue
        for k in keywords:
            # BUG FIX: the original aliased `keys_to_check = keywords` and called
            # `keys_to_check.remove(k)` inside the loop, which (a) mutated the
            # caller's list and (b) skipped the element after each removal because
            # the list was modified while being iterated. We instead skip keywords
            # that were already found, which also prevents a later location from
            # overwriting a previous True with False in the nested-data branch.
            if hit_dict[k]:
                continue  # already found in an earlier location; stop looking
            if isinstance(content, str):
                if k in content.lower():
                    hit_dict[k] = True  # found keyword in location
            elif self.nestedDataContainsKeyword(content, k):
                hit_dict[k] = True
    return hit_dict

def testBuildInstructions(self):
"""The software has build, installation and/or execution instructions.
Expand All @@ -76,17 +113,7 @@ def testBuildInstructions(self):
self.logger.info(
f"{self.metric_identifier} : Looking for {required_modality} keywords {required_keywords} in {required_locations}."
)
hit_dict = {k: False for k in required_keywords}
# check each location (if available) for keywords
for location in required_locations:
for k in hit_dict.keys():
content = self.fuji.github_data.get(location)
if content is not None:
if type(content) == str:
if k in content.lower():
hit_dict[k] = True # found keyword in location
else:
hit_dict[k] = self.nestedDataContainsKeyword(content, k)
hit_dict = self.scanForKeywords(required_keywords, required_locations)
found_instructions = False
if required_modality == "all":
found_instructions = all(hit_dict.values())
Expand All @@ -102,6 +129,10 @@ def testBuildInstructions(self):
self.maturity = self.getTestConfigMaturity(test_id)
self.setEvaluationCriteriumScore(test_id, test_score, "pass")
self.score.earned += test_score
else: # does not pass
self.logger.warning(
f"{self.metric_identifier} : Did not find {required_modality} keywords {required_keywords} in {required_locations}."
)
return test_status

def testDependencies(self):
Expand All @@ -119,6 +150,42 @@ def testDependencies(self):
break
if test_defined:
self.logger.warning(f"{self.metric_identifier} : Test for dependencies is not implemented.")
test_score = self.getTestConfigScore(test_id)
# Check for presence of machine-readable dependency files
dependency_requirements = self.metric_tests[test_id].metric_test_requirements[0]
assert (
dependency_requirements["modality"] == "any"
), f"Found requirement modality {dependency_requirements['modality']}, please choose 'any' instead. Any other modality is too strict for this test layout."
required_dependency_files = dependency_requirements["required"]["dependency_file"]
self.logger.info(f"{self.metric_identifier} : Checking presence of any of {required_dependency_files}.")
dependency_present = not set(self.fuji.github_data.keys()).isdisjoint(required_dependency_files)
# Check for automated building and installation
automation_requirements = self.metric_tests[test_id].metric_test_requirements[1]
required_automation_locations = automation_requirements["required"]["automation_file"]
required_automation_keywords = automation_requirements["required"]["keywords"]
self.logger.warning(
f"{self.metric_identifier} : Looking for {automation_requirements['modality']} keywords {required_automation_keywords} in {required_automation_locations}."
)
automation_hit_dict = self.scanForKeywords(required_automation_keywords, required_automation_locations)
found_automation = False
if automation_requirements["modality"] == "all":
found_automation = all(automation_hit_dict.values())
elif automation_requirements["modality"] == "any":
found_automation = any(automation_hit_dict.values())
else:
self.logger.warning(
f"{self.metric_identifier} : Unknown modality {automation_requirements['modality']} in test requirements. Choose 'all' or 'any'."
)
if dependency_present and found_automation: # pass
test_status = True
self.logger.log(self.fuji.LOG_SUCCESS, f"{self.metric_identifier} : Found required keywords.")
self.maturity = self.getTestConfigMaturity(test_id)
self.setEvaluationCriteriumScore(test_id, test_score, "pass")
self.score.earned += test_score
else: # fail
self.logger.warning(
f"{self.metric_identifier} : Did not find {automation_requirements['modality']} keywords {required_automation_keywords} in {required_automation_locations}."
)
return test_status

def testDependenciesBuildAutomatedChecks(self):
Expand All @@ -139,6 +206,13 @@ def testDependenciesBuildAutomatedChecks(self):
self.logger.warning(
f"{self.metric_identifier} : Test for dependency information, build instructions and automated checks is not implemented."
)
test_score = self.getTestConfigScore(test_id)
test_requirements = self.metric_tests[test_id].metric_test_requirements[0]
# dependency info and build instruction in README
first_half = self.scanForKeywords(["dependency", "dependencies", "build"], ["README"])
# linting and other relevant checks present in automated build and test process
# TODO
print((test_score, test_requirements, first_half)) # fix linting error for now
return test_status

def testBadgeIncluded(self):
Expand Down
26 changes: 22 additions & 4 deletions fuji_server/harvester/github_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: MIT

import json
import os
from configparser import ConfigParser

Expand Down Expand Up @@ -32,6 +33,11 @@ def __init__(self, id, logger, host="https://github.com"):
self.handle = Github(auth=auth)
self.logger = logger
self.data = {} # dictionary with all info
fuji_server_dir = os.path.dirname(os.path.dirname(__file__)) # project_root
software_file_path = os.path.join(fuji_server_dir, "data", "software_file.json")
with open(software_file_path) as f:
self.files_map = json.load(f)
self.files_parse_fully = {k: v for (k, v) in self.files_map.items() if v["parse"] == "full"}

def harvest(self):
# check if it's a URL or repo ID
Expand Down Expand Up @@ -93,20 +99,32 @@ def harvest(self):

# TODO: parse README (full), wiki (page names?), docs (file names)
# NOTE: cannot retrieve wiki through API
self.data["readme"] = repo.get_readme().decoded_content
self.data["README"] = repo.get_readme().decoded_content
# see if there's a folder named docs/
try:
docs_folder = repo.get_contents("docs")
self.data["docs"] = []
self.data["docs_directory"] = []
# get docs/ content recursively
docs_folder = repo.get_contents("docs")
while docs_folder:
doc_file = docs_folder.pop(0)
if doc_file.type == "dir":
docs_folder.extend(repo.get_contents(doc_file.path))
else:
self.data["docs"].append({"name": doc_file.name})
self.data["docs_directory"].append({"name": doc_file.name})
except UnknownObjectException:
pass

# TODO: consider merging parts of the GitHub data with metadata?

def retrieve_all(self, repo):
    """Recursively list every file in the repository and record its name in self.data["contents"].

    Directories returned by the GitHub API are expanded in place; only regular
    files are recorded.

    Args:
        repo: PyGithub repository object whose full content tree is traversed.
    """
    self.data["contents"] = []
    repo_contents = repo.get_contents("")
    while repo_contents:
        entry = repo_contents.pop(0)
        if entry.type == "dir":
            repo_contents.extend(repo.get_contents(entry.path))
        else:
            # TODO: construct a regex string out of ors
            # (https://stackoverflow.com/questions/3040716/python-elegant-way-to-check-if-at-least-one-regex-in-list-matches-a-string)
            # and use named groups to return the dictionary key with m.groupdict()
            # BUG FIX: the original appended to self.data["docs_directory"]
            # (copy-pasted from the docs/ loop in harvest()), leaving the
            # "contents" list initialized above permanently empty and risking a
            # KeyError when "docs_directory" was never created.
            self.data["contents"].append({"name": entry.name})
17 changes: 16 additions & 1 deletion fuji_server/yaml/metrics_v0.7_software.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ metrics:
required:
location:
- README
- docs
- docs_directory
- wiki
keywords:
- build
Expand All @@ -350,6 +350,21 @@ metrics:
metric_test_name: Dependencies are provided in a machine-readable format and the building and installation of the software is automated.
metric_test_score: 1
metric_test_maturity: 2
metric_test_requirements:
- target: https://f-uji.net/vocab/metadata/standards
modality: any
required:
dependency_file:
- requirements.txt
- target: https://f-uji.net/vocab/metadata/standards
modality: all
required:
automation_file:
- Jenkinsfile
- github_actions
keywords:
- build
- install
created_by: FAIR4RS
date_created: 2024-01-18
date_updated: 2024-01-18
Expand Down

0 comments on commit 0e5f050

Please sign in to comment.