diff --git a/src/macaron/repo_finder/commit_finder.py b/src/macaron/repo_finder/commit_finder.py index a032ec06e..846214b1b 100644 --- a/src/macaron/repo_finder/commit_finder.py +++ b/src/macaron/repo_finder/commit_finder.py @@ -96,10 +96,11 @@ MAX_ZERO_DIGIT_EXTENSION = 4 split_pattern = re.compile("[^0-9a-z]", flags=re.IGNORECASE) -validation_pattern = re.compile("[0-9a-z]+$", flags=re.IGNORECASE) -alphabetic_only_pattern = re.compile("[a-z]+$", flags=re.IGNORECASE) -numeric_only_pattern = re.compile("[0-9]+$") -versioned_string = re.compile("[a-z]+[0-9]+$", flags=re.IGNORECASE) # e.g. RC1, M5, etc. +validation_pattern = re.compile("^[0-9a-z]+$", flags=re.IGNORECASE) +alphabetic_only_pattern = re.compile("^[a-z]+$", flags=re.IGNORECASE) +hex_only_pattern = re.compile("^[0-9a-f]+$", flags=re.IGNORECASE) +numeric_only_pattern = re.compile("^[0-9]+$") +versioned_string = re.compile("^[a-z]+[0-9]+$", flags=re.IGNORECASE) # e.g. RC1, M5, etc. class PurlType(Enum): @@ -133,7 +134,7 @@ def find_commit(git_obj: Git, purl: PackageURL) -> tuple[str, str]: The branch name and digest as a tuple. """ version = purl.version - if version is None: + if not version: logger.debug("Missing version for analysis target: %s", purl.name) return "", "" @@ -141,7 +142,7 @@ def find_commit(git_obj: Git, purl: PackageURL) -> tuple[str, str]: if repo_type == PurlType.REPOSITORY: return extract_commit_from_version(git_obj, version) if repo_type == PurlType.ARTIFACT: - return find_commit_from_version_and_name(git_obj, purl.name, version) + return find_commit_from_version_and_name(git_obj, re.escape(purl.name), version) logger.debug("Type of PURL is not supported for commit finding: %s", purl.type) return "", "" @@ -176,6 +177,10 @@ def abstract_purl_type(purl: PackageURL) -> PurlType: def extract_commit_from_version(git_obj: Git, version: str) -> tuple[str, str]: """Try to extract the commit from the PURL's version parameter. + E.g. + With commit: pkg:github/package-url/purl-spec@244fd47e07d1004f0aed9c. + With tag: pkg:github/apache/maven@maven-3.9.1. + Parameters ---------- git_obj: Git @@ -190,7 +195,7 @@ def extract_commit_from_version(git_obj: Git, version: str) -> tuple[str, str]: """ # A commit hash is 40 characters in length, but commits are often referenced using only some of those. commit: Commit | None = None - if 7 <= len(version) <= 40: + if 7 <= len(version) <= 40 and re.match(hex_only_pattern, version): try: commit = git_obj.get_commit(version) except BadName as error: @@ -200,7 +205,9 @@ def extract_commit_from_version(git_obj: Git, version: str) -> tuple[str, str]: # Treat the 'commit' as a tag. try: commit = git_obj.get_commit_from_tag(version) - except IndexError as error: + except (IndexError, ValueError) as error: + # If the tag exists but represents a tree or blob, a ValueError will be raised when trying to retrieve its + # commit. logger.debug("Failed to retrieve commit: %s", error) if not commit: @@ -291,7 +298,7 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> return branch_name, hexsha -def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, list[str], bool]: +def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, list[str]]: """Build a version pattern to match the passed version string. Parameters @@ -303,37 +310,36 @@ def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, lis Returns ------- - tuple[Pattern | None, list[str], bool] - The tuple of the regex pattern that will match the version, the list of version parts that were extracted, and - whether the version string has a non-numeric suffix. If an exception occurs from any regex operation, the - pattern will be returned as None. + tuple[Pattern | None, list[str]] + The tuple of the regex pattern that will match the version, and the list of version parts that were extracted. + If an exception occurs from any regex operation, the pattern will be returned as None. """ + if not version: + return None, [] + # The version is split on non-alphanumeric characters to separate the version parts from the non-version parts. # e.g. 1.2.3-DEV -> [1, 2, 3, DEV] - try: - split = split_pattern.split(version) - except TypeError as error: - logger.debug("Failed to perform regex split on version string %s -- %s", version, error) - return None, [], False - + split = split_pattern.split(version) logger.debug("Split version: %s", split) - if not split: - # If the version string contains no separators use it as is. - split = [version] - this_version_pattern = "" parts = [] - has_non_numeric_suffix = False - # Detect versions that end with a zero, so the zero can be made optional. - has_trailing_zero = len(split) > 2 and split[-1] == "0" - for count, part in enumerate(split): + for part in split: # Validate the split part by checking it is only comprised of alphanumeric characters. valid = validation_pattern.match(part) if not valid: continue parts.append(part) + if not parts: + logger.debug("Version contained no valid parts: %s", version) + return None, [] + + this_version_pattern = "" + has_non_numeric_suffix = False + # Detect versions that end with a zero, so the zero can be made optional. + has_trailing_zero = len(split) > 2 and split[-1] == "0" + for count, part in enumerate(parts): numeric_only = numeric_only_pattern.match(part) if not has_non_numeric_suffix and not numeric_only: @@ -374,23 +380,23 @@ def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, lis f"{this_version_pattern}){SUFFIX_SEPARATOR}{SUFFIX}$" ) try: - return re.compile(this_version_pattern, flags=re.IGNORECASE), parts, has_non_numeric_suffix + return re.compile(this_version_pattern, flags=re.IGNORECASE), parts except Exception as error: # pylint: disable=broad-exception-caught # The regex library uses an internal error that cannot be used here to satisfy pylint. logger.debug("Error while compiling version regex: %s", error) - return None, [], False + return None, [] -def match_tags(tag_list: list[str], artifact_name: str, artifact_version: str) -> list[str]: +def match_tags(tag_list: list[str], name: str, version: str) -> list[str]: """Return items of the passed tag list that match the passed artifact name and version. Parameters ---------- tag_list: list[str] The list of tags to check. - artifact_name: str + name: str The name of the analysis target. - artifact_version: str + version: str The version of the analysis target. Returns @@ -399,7 +405,7 @@ def match_tags(tag_list: list[str], artifact_name: str, artifact_version: str) - The list of tags that matched the pattern. """ # Create the pattern for the passed version. - pattern, parts, has_non_numeric_suffix = _build_version_pattern(artifact_name, artifact_version) + pattern, parts = _build_version_pattern(name, version) if not pattern: return [] @@ -425,23 +431,9 @@ def match_tags(tag_list: list[str], artifact_name: str, artifact_version: str) - return [_["tag"] for _ in matched_tags] # In the case of multiple matches, further work must be done. - # Firstly, combine matches with their suffixes as some version patterns will not include the required suffix in the - # version group. - if has_non_numeric_suffix: - filtered_tags = [] - for item in matched_tags: - # Discard tags with no suffix or with one that does not match the version. - suffix: str | None = item["suffix"] - if not suffix: - filtered_tags.append(item) - continue - if suffix == parts[-1]: - filtered_tags.append(item) - continue - - matched_tags = filtered_tags - - # If any of the matches contain a prefix that matches the target artifact name, remove those that don't. + + # If any of the matches contain a prefix that matches the target artifact name, and otherwise perfectly matches + # the version, remove those that don't. named_tags = [] for item in matched_tags: prefix: str | None = item["prefix"] @@ -450,7 +442,10 @@ def match_tags(tag_list: list[str], artifact_name: str, artifact_version: str) - if "/" in prefix: # Exclude prefix parts that exists before a forward slash, e.g. rel/ _, _, prefix = prefix.rpartition("/") - if prefix.lower() == artifact_name.lower(): + if ( + prefix.lower() == name.lower() + and _compute_tag_version_similarity(item["version"], item["suffix"], parts) == 0 + ): named_tags.append(item) if named_tags: @@ -515,7 +510,6 @@ def _compute_tag_version_similarity(tag_version: str, tag_suffix: str, version_p versioned_string_match = True else: count = count + 1 - if tag_suffix != last_part: count = count + 1 else: @@ -528,7 +522,8 @@ def _get_branch_of_commit(commit: Commit) -> str: """Get the branch of the passed commit as a string or return None.""" branches = commit.branches - if not branches: + if len(branches) == 1 and "" in branches: + # An 'empty' result for branches is a set containing a zero length string. logger.debug("No branch associated with commit: %s", commit.hash) return "" diff --git a/tests/e2e/repo_finder/commit_finder.py b/tests/e2e/repo_finder/commit_finder.py index d7489acdd..36dfd7ca3 100644 --- a/tests/e2e/repo_finder/commit_finder.py +++ b/tests/e2e/repo_finder/commit_finder.py @@ -55,11 +55,12 @@ def update_commit_finder_results() -> None: with open(java_tags_file_path, encoding="utf-8") as tag_file: json_data = json.load(tag_file) for item in json_data: - name = str(item["name"]) - name, version = name.split("@") - matched_tags = commit_finder.match_tags(item["tags"], name, version) - matched_tag = matched_tags[0] if matched_tags else "" - item["match"] = matched_tag + tags = item["tags"] + for artifact in item["artifacts"]: + purl = PackageURL.from_string(artifact["purl"]) + matched_tags = commit_finder.match_tags(tags, purl.name, purl.version or "") + matched_tag = matched_tags[0] if matched_tags else "" + artifact["match"] = matched_tag with open(java_tags_file_path, "w", encoding="utf-8") as tag_file: json.dump(json_data, tag_file, indent=4) diff --git a/tests/e2e/repo_finder/resources/java_tags.json b/tests/e2e/repo_finder/resources/java_tags.json index 8bbb27c73..5637c2335 100644 --- a/tests/e2e/repo_finder/resources/java_tags.json +++ b/tests/e2e/repo_finder/resources/java_tags.json @@ -52662,7 +52662,7 @@ { "purl": "pkg:maven/javax.xml.ws/jaxws-api@2.3.1", "repo": "https://github.com/javaee/jax-ws-spec", - "match": "jaxws-api-2.3.1-b170918.0311", + "match": "2.3.1", "comment": "" } ] @@ -286467,8 +286467,8 @@ { "purl": "pkg:maven/org.eclipse.sisu/org.eclipse.sisu.inject@0.9.0-SNAPSHOT", "repo": "https://github.com/eclipse/sisu.inject", - "match": "", - "comment": "Could possibly accept milestones/0.9.0.M1." + "match": "milestones/0.9.0.M1", + "comment": "This is the closest matching tag to the target version." } ] }, diff --git a/tests/repo_finder/test_commit_finder.py b/tests/repo_finder/test_commit_finder.py index 84e5ef563..c35674c3f 100644 --- a/tests/repo_finder/test_commit_finder.py +++ b/tests/repo_finder/test_commit_finder.py @@ -3,7 +3,9 @@ """This module tests the commit finder.""" import logging +import os import re +import shutil import hypothesis import pytest @@ -13,9 +15,13 @@ from macaron.repo_finder import commit_finder from macaron.repo_finder.commit_finder import PurlType +from tests.slsa_analyzer.mock_git_utils import commit_files, initiate_repo logger: logging.Logger = logging.getLogger(__name__) +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +REPO_DIR = os.path.join(BASE_DIR, "mock_repos", "commit_finder/sample_repo") + def test_get_commit_from_version() -> None: """Test resolving commits from version tags.""" @@ -81,8 +87,140 @@ def test_abstract_purl_type(purls: list[str], expected: PurlType) -> None: assert commit_finder.abstract_purl_type(PackageURL.from_string(purl)) == expected +def test_commit_finder() -> None: + """Test commit finder using mocked repository.""" + if os.path.exists(REPO_DIR): + shutil.rmtree(REPO_DIR) + git_obj = initiate_repo(REPO_DIR) + + # Create a commit from a newly created file. + with open(os.path.join(REPO_DIR, "file_1"), "w", encoding="utf-8") as file: + file.write("A") + commit_files(git_obj, ["file_1"]) + + # Create a commit with no associated branch. + git = git_obj.repo.git + commit_0 = git_obj.repo.index.commit(message="Commit_0") + git.checkout("HEAD", b="missing_branch") + commit_with_no_branch = git_obj.repo.index.commit(message="Commit_1") + git.checkout("master") + git.branch("-D", "missing_branch") + + # No version in PURL. + branch, _ = commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:maven/apache/maven")) + assert not branch + + # Unsupported PURL type. + branch, _ = commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:gem/ruby-artifact@1")) + assert not branch + + # Hash not present in repository, tests hash and tag. + branch, _ = commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:github/apache/maven@ab4ce3e")) + assert not branch + + # Hash present but no associated branch. + branch, _ = commit_finder.find_commit( + git_obj, PackageURL.from_string(f"pkg:github/apache/maven@{commit_with_no_branch.hexsha}") + ) + assert not branch + + # Valid PURL but repository has no tags yet. + branch, _ = commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:maven/apache/maven@1.0")) + assert not branch + + # Additional setup is done here to avoid tainting earlier tests. + + # Create a tag from a tree. + tag_tree_version = "1.0" + tree = git_obj.repo.heads.master.commit.tree + git_obj.repo.create_tag(tag_tree_version, ref=tree) + + # Add a new tag with an associated commit. This is the Japanese character for 'snow'. + bad_version = "雪" + git_obj.repo.create_tag(bad_version, commit_0.hexsha) + + # Create a more proper tag on the same commit. + tag_version = "2.3.4" + git_obj.repo.create_tag(tag_version, commit_0.hexsha) + + # Add an empty commit with some tags. + empty_commit = git_obj.repo.index.commit("Empty commit.") + tag_version_2 = "4.5.2" + git_obj.repo.create_tag(f"{tag_version_2}-DEV", ref=empty_commit.hexsha) + git_obj.repo.create_tag(f"{tag_version_2}_DEV_RC1_RELEASE", ref=empty_commit.hexsha) + git_obj.repo.create_tag(f"rel/prefix_name-{tag_version}", ref=empty_commit.hexsha) + + # Create a tag on the commit that has no branch. + tag_no_branch = "0.1.2" + git_obj.repo.create_tag(tag_no_branch, ref=commit_with_no_branch.hexsha) + + # Tag with no branch. + branch, _ = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_no_branch}")) + assert not branch + + # Version that fails to create a pattern. + branch, _ = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{bad_version}")) + assert not branch + + # Version with a suffix and no matching tag. + branch, _ = commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:maven/apache/maven@1-JRE")) + assert not branch + + # Version with only one digit and no matching tag. + branch, _ = commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:maven/apache/maven@1")) + assert not branch + + # Valid repository PURL. + branch, digest = commit_finder.find_commit( + git_obj, PackageURL.from_string(f"pkg:github/apache/maven@{commit_0.hexsha}") + ) + assert branch == "master" + assert digest == commit_0.hexsha + + # Valid artifact PURL. + branch, digest = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version}")) + assert branch == "master" + assert digest == commit_0.hexsha + + # Valid artifact PURL with an alphanumeric suffix. + branch, digest = commit_finder.find_commit( + git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version}-RC1") + ) + assert branch == "master" + assert digest == commit_0.hexsha + + # Valid artifact PURL that should match a tag with a name prefix. + branch, digest = commit_finder.find_commit( + git_obj, PackageURL.from_string(f"pkg:maven/apache/prefix_name@{tag_version}") + ) + assert branch == "master" + assert digest == empty_commit.hexsha + + # Valid artifact PURL that matches a version with a suffix, to a tag with the same suffix. + branch, digest = commit_finder.find_commit( + git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version_2}-DEV") + ) + assert branch == "master" + assert digest == empty_commit.hexsha + + # Valid artifact PURL that matches a version with a suffix, to a tag with the same suffix part in a multi-suffix. + branch, digest = commit_finder.find_commit( + git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version_2}_RELEASE") + ) + assert branch == "master" + assert digest == empty_commit.hexsha + + # Valid artifact PURL that matches a version with an alphanumeric suffix, to a tag with the same suffix part in a + # multi-suffix. + branch, digest = commit_finder.find_commit( + git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version_2}_RC1") + ) + assert branch == "master" + assert digest == empty_commit.hexsha + + @given(text()) -@settings(max_examples=1000) +@settings(max_examples=10000, deadline=None) def test_pattern_generation(version: str) -> None: """Test stability of pattern creation from user input.""" # pylint: disable=protected-access @@ -104,7 +242,7 @@ def test_pattern_generation(version: str) -> None: input_pattern = re.compile(r"[0-9]{1,3}(\.[0-9a-z]{1,3}){,5}([-+#][a-z0-9].+)?", flags=re.IGNORECASE) # These numbers should be kept low as the complex regex makes generation slow. VERSION_ITERATIONS = 50 # The number of times to iterate the test_version_to_tag_matching test. -TAG_ITERATIONS = 10 # The number of tags to generate per version iteration. +TAG_ITERATIONS = 1 # The number of tags to generate per version iteration. @given(data()) @@ -123,7 +261,7 @@ def test_version_to_tag_matching(_data: DataObject) -> None: # noqa: PT019 if not purl.version: return # Build the pattern from the version. - pattern, parts, _ = commit_finder._build_version_pattern(purl.name, purl.version) + pattern, parts = commit_finder._build_version_pattern(purl.name, purl.version) if not pattern: return # Generate the tag from a pattern that is very similar to how version patterns are made.