diff --git a/docs/source/pages/developers_guide/apidoc/macaron.repo_finder.rst b/docs/source/pages/developers_guide/apidoc/macaron.repo_finder.rst index 1fc7aca3b..724c2614f 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.repo_finder.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.repo_finder.rst @@ -57,6 +57,14 @@ macaron.repo\_finder.repo\_finder\_deps\_dev module :undoc-members: :show-inheritance: +macaron.repo\_finder.repo\_finder\_enums module +----------------------------------------------- + +.. automodule:: macaron.repo_finder.repo_finder_enums + :members: + :undoc-members: + :show-inheritance: + macaron.repo\_finder.repo\_finder\_java module ---------------------------------------------- diff --git a/src/macaron/database/table_definitions.py b/src/macaron/database/table_definitions.py index 035df8f31..669a650d5 100644 --- a/src/macaron/database/table_definitions.py +++ b/src/macaron/database/table_definitions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """ @@ -36,6 +36,7 @@ from macaron.database.database_manager import ORMBase from macaron.database.db_custom_types import RFC3339DateTime from macaron.errors import InvalidPURLError +from macaron.repo_finder.repo_finder_enums import CommitFinderOutcome, RepoFinderOutcome from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, ProvenanceSubjectPURLMatcher from macaron.slsa_analyzer.slsa_req import ReqName @@ -177,7 +178,12 @@ class Component(PackageURLMixin, ORMBase): lazy="immediate", ) - def __init__(self, purl: str, analysis: Analysis, repository: "Repository | None"): + #: The one-to-one relationship with Repo Finder metadata. + repo_finder_metadata: Mapped["RepoFinderMetadata"] = relationship(back_populates="component", lazy="immediate") + + def __init__( + self, purl: str, analysis: Analysis, repository: "Repository | None", repo_finder_metadata: "RepoFinderMetadata" + ): """ Instantiate the software component using PURL identifier. @@ -204,7 +210,13 @@ def __init__(self, purl: str, analysis: Analysis, repository: "Repository | None # TODO: Explore the ``dbm`` or ``shelve`` packages to support dict type, which are part of the Python standard library. purl_kwargs = purl_parts.to_dict(encode=True) - super().__init__(purl=purl, analysis=analysis, repository=repository, **purl_kwargs) + super().__init__( + purl=purl, + analysis=analysis, + repository=repository, + repo_finder_metadata=repo_finder_metadata, + **purl_kwargs, + ) @property def report_file_name(self) -> str: @@ -605,3 +617,34 @@ def from_purl_and_provenance( return cls(sha256=sha256) return None + + +class RepoFinderMetadata(ORMBase): + """Metadata from the Repo Finder and Commit Finder runs for an associated Component.""" + + __tablename__ = "_repo_finder_metadata" + + #: The primary key. + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) # noqa: A003 + + #: The foreign key to the software component. + component_id: Mapped[int] = mapped_column(Integer, ForeignKey(Component.id), nullable=False) + + #: A one-to-one relationship with software components. + component: Mapped["Component"] = relationship(back_populates="repo_finder_metadata") + + #: The outcome of the Repo Finder. + repo_finder_outcome: Mapped[Enum] = mapped_column( + Enum(RepoFinderOutcome), nullable=False # pylint: disable=protected-access,no-member + ) + + #: The outcome of the Commit Finder. + commit_finder_outcome: Mapped[Enum] = mapped_column( + Enum(CommitFinderOutcome), nullable=False # pylint: disable=protected-access,no-member + ) + + #: The URL found by the Repo Finder (if applicable). + found_url: Mapped[str] = mapped_column(String) + + #: The commit of the tag matched by the Commit Finder. + found_commit: Mapped[str] = mapped_column(String) diff --git a/src/macaron/dependency_analyzer/cyclonedx.py b/src/macaron/dependency_analyzer/cyclonedx.py index d1bd93eac..1ebc930d2 100644 --- a/src/macaron/dependency_analyzer/cyclonedx.py +++ b/src/macaron/dependency_analyzer/cyclonedx.py @@ -30,6 +30,7 @@ from macaron.errors import CycloneDXParserError, DependencyAnalyzerError from macaron.output_reporter.scm import SCMStatus from macaron.repo_finder.repo_finder import find_repo +from macaron.repo_finder.repo_finder_enums import RepoFinderOutcome from macaron.repo_finder.repo_validator import find_valid_repository_url logger: logging.Logger = logging.getLogger(__name__) @@ -465,12 +466,12 @@ def _resolve_more_dependencies(dependencies: dict[str, DependencyInfo]) -> None: for item in dependencies.values(): if item["available"] != SCMStatus.MISSING_SCM: continue - - item["url"] = find_repo(item["purl"]) - if item["url"] == "": + url, outcome = find_repo(item["purl"]) + if outcome not in {RepoFinderOutcome.FOUND, RepoFinderOutcome.FOUND_FROM_PARENT}: logger.debug("Failed to find url for purl: %s", item["purl"]) else: # TODO decide how to handle possible duplicates here + item["url"] = url item["available"] = SCMStatus.AVAILABLE item["note"] = "" diff --git a/src/macaron/repo_finder/commit_finder.py b/src/macaron/repo_finder/commit_finder.py index a637c2aaf..2850ab998 100644 --- a/src/macaron/repo_finder/commit_finder.py +++ b/src/macaron/repo_finder/commit_finder.py @@ -13,6 +13,7 @@ from pydriller import Commit, Git from macaron.repo_finder import repo_finder_deps_dev, to_domain_from_known_purl_types +from macaron.repo_finder.repo_finder_enums import CommitFinderOutcome from macaron.slsa_analyzer.git_service import GIT_SERVICES logger: logging.Logger = logging.getLogger(__name__) @@ -121,7 +122,7 @@ class AbstractPurlType(Enum): UNSUPPORTED = (2,) -def find_commit(git_obj: Git, purl: PackageURL) -> str | None: +def find_commit(git_obj: Git, purl: PackageURL) -> tuple[str | None, CommitFinderOutcome]: """Try to find the commit matching the passed PURL. The PURL may be a repository type, e.g. GitHub, in which case the commit might be in its version part. @@ -137,13 +138,13 @@ def find_commit(git_obj: Git, purl: PackageURL) -> str | None: Returns ------- - str | None - The digest, or None if the commit cannot be correctly retrieved. + tuple[str | None, CommitFinderOutcome] + The digest, or None if the commit cannot be correctly retrieved, and the outcome to report. """ version = purl.version if not version: logger.debug("Missing version for analysis target: %s", purl.name) - return None + return None, CommitFinderOutcome.NO_VERSION_PROVIDED repo_type = determine_abstract_purl_type(purl) if repo_type == AbstractPurlType.REPOSITORY: @@ -151,7 +152,7 @@ def find_commit(git_obj: Git, purl: PackageURL) -> str | None: if repo_type == AbstractPurlType.ARTIFACT: return find_commit_from_version_and_name(git_obj, purl.name, version) logger.debug("Type of PURL is not supported for commit finding: %s", purl.type) - return None + return None, CommitFinderOutcome.UNSUPPORTED_PURL_TYPE def determine_abstract_purl_type(purl: PackageURL) -> AbstractPurlType: @@ -181,7 +182,7 @@ def determine_abstract_purl_type(purl: PackageURL) -> AbstractPurlType: return AbstractPurlType.UNSUPPORTED -def extract_commit_from_version(git_obj: Git, version: str) -> str | None: +def extract_commit_from_version(git_obj: Git, version: str) -> tuple[str | None, CommitFinderOutcome]: """Try to extract the commit from the PURL's version parameter. E.g. @@ -197,8 +198,8 @@ def extract_commit_from_version(git_obj: Git, version: str) -> str | None: Returns ------- - str | None - The digest, or None if the commit cannot be correctly retrieved. + tuple[str | None, CommitFinderOutcome] + The digest, or None if the commit cannot be correctly retrieved, and the outcome to report. """ # A commit hash is 40 characters in length, but commits are often referenced using only some of those. commit: Commit | None = None @@ -218,12 +219,12 @@ def extract_commit_from_version(git_obj: Git, version: str) -> str | None: logger.debug("Failed to retrieve commit: %s", error) if not commit: - return None + return None, CommitFinderOutcome.REPO_PURL_FAILURE - return commit.hash if commit else None + return commit.hash if commit else None, CommitFinderOutcome.MATCHED -def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> str | None: +def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> tuple[str | None, CommitFinderOutcome]: """Try to find the matching commit in a repository of a given version (and name) via tags. The passed version is used to match with the tags in the target repository. The passed name is used in cases where @@ -240,14 +241,19 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> Returns ------- - str | None - The digest, or None if the commit cannot be correctly retrieved. + tuple[str | None, CommitFinderOutcome] + The digest, or None if the commit cannot be correctly retrieved, and the outcome to report. """ logger.debug("Searching for commit of artifact version using tags: %s@%s", name, version) # Only consider tags that have a commit. + repo_tags = git_obj.repo.tags + if not repo_tags: + logger.debug("No tags found for %s", name) + return None, CommitFinderOutcome.NO_TAGS + valid_tags = {} - for tag in git_obj.repo.tags: + for tag in repo_tags: commit = _get_tag_commit(tag) if not commit: logger.debug("No commit found for tag: %s", tag) @@ -258,14 +264,14 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> if not valid_tags: logger.debug("No tags with commits found for %s", name) - return None + return None, CommitFinderOutcome.NO_TAGS_WITH_COMMITS # Match tags. - matched_tags = match_tags(list(valid_tags.keys()), name, version) + matched_tags, outcome = match_tags(list(valid_tags.keys()), name, version) if not matched_tags: logger.debug("No tags matched for %s", name) - return None + return None, outcome if len(matched_tags) > 1: logger.debug("Tags found for %s: %s", name, len(matched_tags)) @@ -282,7 +288,7 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> hexsha = tag.commit.hexsha except ValueError: logger.debug("Error trying to retrieve digest of commit: %s", tag.commit) - return None + return None, CommitFinderOutcome.NO_TAG_COMMIT logger.debug( "Found tag %s with commit %s for artifact version %s@%s", @@ -291,7 +297,7 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> name, version, ) - return hexsha if hexsha else None + return hexsha if hexsha else None, CommitFinderOutcome.MATCHED def _split_name(name: str) -> list[str]: @@ -349,7 +355,7 @@ def _split_separators(version: str) -> list[str]: return [item for item in split if item] -def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, list[str]]: +def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, list[str], CommitFinderOutcome]: """Build a version pattern to match the passed version string. Parameters @@ -362,12 +368,12 @@ def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, lis Returns ------- tuple[Pattern | None, list[str]] - The tuple of the regex pattern that will match the version, and the list of version parts that were extracted. - If an exception occurs from any regex operation, the pattern will be returned as None. + The tuple of the regex pattern that will match the version, the list of version parts that were extracted, and + the outcome to report. If an exception occurs from any regex operation, the pattern will be returned as None. """ if not version: - return None, [] + return None, [], CommitFinderOutcome.NO_VERSION_PROVIDED # Escape input to prevent it being treated as regex. name = re.escape(name) @@ -376,7 +382,7 @@ def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, lis if not parts: logger.debug("Version contained no valid parts: %s", version) - return None, [] + return None, [], CommitFinderOutcome.INVALID_PURL logger.debug("Final version parts: %s", parts) @@ -470,14 +476,14 @@ def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, lis # Compile the pattern. try: - return re.compile(this_version_pattern, flags=re.IGNORECASE), parts + return re.compile(this_version_pattern, flags=re.IGNORECASE), parts, CommitFinderOutcome.MATCHED except Exception as error: # pylint: disable=broad-exception-caught # The regex library uses an internal error that cannot be used here to satisfy pylint. logger.debug("Error while compiling version regex: %s", error) - return None, [] + return None, [], CommitFinderOutcome.REGEX_COMPILE_FAILURE -def match_tags(tag_list: list[str], name: str, version: str) -> list[str]: +def match_tags(tag_list: list[str], name: str, version: str) -> tuple[list[str], CommitFinderOutcome]: """Return items of the passed tag list that match the passed artifact name and version. Parameters @@ -491,8 +497,8 @@ def match_tags(tag_list: list[str], name: str, version: str) -> list[str]: Returns ------- - list[str] - The list of tags that matched the pattern. + tuple[list[str], CommitFinderOutcome] + The list of tags that matched the pattern, if any, and the outcome to report. """ logger.debug("Tag Sample: %s", tag_list[:5]) @@ -518,14 +524,14 @@ def match_tags(tag_list: list[str], name: str, version: str) -> list[str]: if match.group(1): prefix_match = tag if prefix_match: - return [prefix_match] + return [prefix_match], CommitFinderOutcome.MATCHED if last_match: - return [last_match] + return [last_match], CommitFinderOutcome.MATCHED # Create the more complicated pattern for the passed version. - pattern, parts = _build_version_pattern(name, version) + pattern, parts, outcome = _build_version_pattern(name, version) if not pattern: - return [] + return [], outcome # Match the tags. matched_tags = [] @@ -546,8 +552,12 @@ def match_tags(tag_list: list[str], name: str, version: str) -> list[str]: matched_tags = _fix_misaligned_tag_matches(matched_tags, version) - if len(matched_tags) <= 1: - return [_["tag"] for _ in matched_tags] + if not matched_tags: + logger.debug("Failed to match any tags.") + return [], CommitFinderOutcome.NO_TAGS_MATCHED + + if len(matched_tags) == 1: + return [_["tag"] for _ in matched_tags], CommitFinderOutcome.MATCHED # In the case of multiple matches, further work must be done. @@ -588,7 +598,7 @@ def match_tags(tag_list: list[str], name: str, version: str) -> list[str]: ) ) - return [_["tag"] for _ in matched_tags] + return [_["tag"] for _ in matched_tags], CommitFinderOutcome.MATCHED def _fix_misaligned_tag_matches(matched_tags: list[dict[str, str]], version: str) -> list[dict[str, str]]: diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py index 7b446c00e..84c57ed3b 100644 --- a/src/macaron/repo_finder/provenance_extractor.py +++ b/src/macaron/repo_finder/provenance_extractor.py @@ -323,7 +323,7 @@ def check_if_input_purl_provenance_conflict( # Check the PURL commit against the provenance. if not digest_input and provenance_commit_digest and purl.version: - purl_commit = extract_commit_from_version(git_obj, purl.version) + purl_commit, _ = extract_commit_from_version(git_obj, purl.version) if purl_commit and purl_commit != provenance_commit_digest: logger.debug( "The commit digest passed via purl input does not match what exists in the " diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py index 29b114a11..8be3d219f 100644 --- a/src/macaron/repo_finder/repo_finder.py +++ b/src/macaron/repo_finder/repo_finder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """ @@ -47,6 +47,7 @@ from macaron.repo_finder.commit_finder import find_commit, match_tags from macaron.repo_finder.repo_finder_base import BaseRepoFinder from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder +from macaron.repo_finder.repo_finder_enums import CommitFinderOutcome, RepoFinderOutcome from macaron.repo_finder.repo_finder_java import JavaRepoFinder from macaron.repo_finder.repo_utils import ( check_repo_urls_are_equivalent, @@ -69,7 +70,7 @@ logger: logging.Logger = logging.getLogger(__name__) -def find_repo(purl: PackageURL, check_latest_version: bool = True) -> str: +def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, RepoFinderOutcome]: """Retrieve the repository URL that matches the given PURL. Parameters @@ -81,8 +82,8 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> str: Returns ------- - str : - The repository URL found for the passed package. + tuple[str, RepoFinderOutcome] : + The repository URL for the passed package, if found, and the outcome to report. """ repo_finder: BaseRepoFinder if purl.type == "maven": @@ -96,26 +97,26 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> str: repo_finder = DepsDevRepoFinder() else: logger.debug("No Repo Finder found for package type: %s of %s", purl.type, purl) - return "" + return "", RepoFinderOutcome.UNSUPPORTED_PACKAGE_TYPE # Call Repo Finder and return first valid URL logger.debug("Analyzing %s with Repo Finder: %s", purl, type(repo_finder)) - found_repo = repo_finder.find_repo(purl) + found_repo, outcome = repo_finder.find_repo(purl) if found_repo or not check_latest_version: - return found_repo + return found_repo, outcome # Try to find the latest version repo. logger.error("Could not find repo for PURL: %s", purl) latest_version_purl = get_latest_purl_if_different(purl) if not latest_version_purl: logger.debug("Could not find newer PURL than provided: %s", purl) - return "" + return "", RepoFinderOutcome.NO_NEWER_VERSION - found_repo = DepsDevRepoFinder().find_repo(latest_version_purl) + found_repo, outcome = DepsDevRepoFinder().find_repo(latest_version_purl) if not found_repo: logger.debug("Could not find repo from latest version of PURL: %s", latest_version_purl) - return found_repo + return found_repo, outcome def to_repo_path(purl: PackageURL, available_domains: list[str]) -> str | None: @@ -147,7 +148,7 @@ def to_repo_path(purl: PackageURL, available_domains: list[str]) -> str | None: """ domain = to_domain_from_known_purl_types(purl.type) or (purl.type if purl.type in available_domains else None) if not domain: - logger.info("The PURL type of %s is not valid as a repository type. Trying to find the repository...", purl) + logger.info("The PURL type of %s is not valid as a repository type.", purl) return None if not purl.namespace: @@ -205,7 +206,7 @@ def find_source(purl_string: str, input_repo: str | None, latest_version_fallbac found_repo = input_repo if not found_repo: logger.debug("Searching for repo of PURL: %s", purl) - found_repo = find_repo(purl) + found_repo, _ = find_repo(purl) if not found_repo: logger.error("Could not find repo for PURL: %s", purl) @@ -214,31 +215,29 @@ def find_source(purl_string: str, input_repo: str | None, latest_version_fallbac # Disable other loggers for cleaner output. logging.getLogger("macaron.slsa_analyzer.analyzer").disabled = True - digest = None if defaults.getboolean("repofinder", "find_source_should_clone"): # Clone the repo to retrieve the tags. logger.debug("Preparing repo: %s", found_repo) repo_dir = os.path.join(global_config.output_path, GIT_REPOS_DIR) logging.getLogger("macaron.slsa_analyzer.git_url").disabled = True # The prepare_repo function will also check the latest version of the artifact if required. - git_obj = prepare_repo(repo_dir, found_repo, purl=purl, latest_version_fallback=not checked_latest_purl) - - if git_obj: - try: - digest = git_obj.get_head().hash - except ValueError: - logger.debug("Could not retrieve commit hash from repository.") + _, _, digest = prepare_repo(repo_dir, found_repo, purl=purl, latest_version_fallback=not checked_latest_purl) if not digest: return False else: # Retrieve the tags using a remote git operation. tags = get_tags_via_git_remote(found_repo) - if tags: - matches = match_tags(list(tags.keys()), purl.name, purl.version) - if matches: - matched_tag = matches[0] - digest = tags[matched_tag] + if not tags: + return False + + matches, _ = match_tags(list(tags.keys()), purl.name, purl.version) + + if not matches: + return False + + matched_tag = matches[0] + digest = tags[matched_tag] if not digest: logger.error("Could not find commit for purl / repository: %s / %s", purl, found_repo) @@ -286,7 +285,7 @@ def get_latest_purl_if_different(purl: PackageURL) -> PackageURL | None: else: no_version_purl = purl - latest_version_purl = DepsDevRepoFinder.get_latest_version(no_version_purl) + latest_version_purl, _ = DepsDevRepoFinder.get_latest_version(no_version_purl) if not latest_version_purl: logger.error("Latest version PURL could not be found.") return None @@ -314,7 +313,7 @@ def get_latest_repo_if_different(latest_version_purl: PackageURL, original_repo: str The latest repository, or an empty string if not found. """ - latest_repo = find_repo(latest_version_purl, False) + latest_repo, _ = find_repo(latest_version_purl, False) if not latest_repo: logger.error("Could not find repository from latest PURL: %s", latest_version_purl) return "" @@ -379,7 +378,7 @@ def prepare_repo( digest: str = "", purl: PackageURL | None = None, latest_version_fallback: bool = True, -) -> Git | None: +) -> tuple[Git | None, CommitFinderOutcome, str]: """Prepare the target repository for analysis. If ``repo_path`` is a remote path, the target repo is cloned to ``{target_dir}/{unique_path}``. @@ -407,8 +406,9 @@ def prepare_repo( Returns ------- - Git | None - The pydriller.Git object of the repository or None if error. + tuple[Git | None, CommitFinderOutcome, str] + The pydriller.Git object of the repository or None if error; the outcome of the Commit Finder; and the final + digest. """ # TODO: separate the logic for handling remote and local repos instead of putting them into this method. logger.info( @@ -418,15 +418,15 @@ def prepare_repo( digest, ) - resolved_local_path = "" is_remote = is_remote_repo(repo_path) + commit_finder_outcome = CommitFinderOutcome.NOT_USED if is_remote: logger.info("The path to repo %s is a remote path.", repo_path) resolved_remote_path = get_remote_vcs_url(repo_path) if not resolved_remote_path: logger.error("The provided path to repo %s is not a valid remote path.", repo_path) - return None + return None, commit_finder_outcome, digest git_service = get_git_service(resolved_remote_path) repo_unique_path = get_repo_dir_name(resolved_remote_path) @@ -436,7 +436,7 @@ def prepare_repo( git_service.clone_repo(resolved_local_path, resolved_remote_path) except CloneError as error: logger.error("Cannot clone %s: %s", resolved_remote_path, str(error)) - return None + return None, commit_finder_outcome, digest else: logger.info("Checking if the path to repo %s is a local path.", repo_path) resolved_local_path = resolve_local_path(get_local_repos_path(), repo_path) @@ -446,29 +446,29 @@ def prepare_repo( git_obj = Git(resolved_local_path) except InvalidGitRepositoryError: logger.error("No git repo exists at %s.", resolved_local_path) - return None + return None, commit_finder_outcome, digest else: logger.error("Error happened while preparing the repo.") - return None + return None, commit_finder_outcome, digest if is_empty_repo(git_obj): logger.error("The target repository does not have any commit.") - return None + return None, commit_finder_outcome, digest # Find the digest and branch if a version has been specified if not digest and purl and purl.version: - found_digest = find_commit(git_obj, purl) + found_digest, commit_finder_outcome = find_commit(git_obj, purl) if not found_digest: logger.error("Could not map the input purl string to a specific commit in the corresponding repository.") if not latest_version_fallback: - return None + return None, commit_finder_outcome, digest # If the commit could not be found, check if the latest version of the artifact has a different repository. latest_purl = get_latest_purl_if_different(purl) if not latest_purl: - return None + return None, commit_finder_outcome, digest latest_repo = get_latest_repo_if_different(latest_purl, repo_path) if not latest_repo: - return None + return None, commit_finder_outcome, digest return prepare_repo(latest_repo, latest_repo, target_dir, latest_version_fallback=False) digest = found_digest @@ -490,15 +490,15 @@ def prepare_repo( # ``git_url.check_out_repo_target``. if not check_out_repo_target(git_obj, branch_name, digest, not is_remote): logger.error("Cannot checkout the specific branch or commit of the target repo.") - return None + return None, commit_finder_outcome, digest - return git_obj + return git_obj, commit_finder_outcome, digest try: git_service.check_out_repo(git_obj, branch_name, digest, not is_remote) except RepoCheckOutError as error: logger.error("Failed to check out repository at %s", resolved_local_path) logger.error(error) - return None + return None, commit_finder_outcome, digest - return git_obj + return git_obj, commit_finder_outcome, digest diff --git a/src/macaron/repo_finder/repo_finder_base.py b/src/macaron/repo_finder/repo_finder_base.py index ba177c89f..1e82aa475 100644 --- a/src/macaron/repo_finder/repo_finder_base.py +++ b/src/macaron/repo_finder/repo_finder_base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the base class for the repo finders.""" @@ -7,12 +7,14 @@ from packageurl import PackageURL +from macaron.repo_finder.repo_finder_enums import RepoFinderOutcome + class BaseRepoFinder(ABC): """This abstract class is used to represent Repository Finders.""" @abstractmethod - def find_repo(self, purl: PackageURL) -> str: + def find_repo(self, purl: PackageURL) -> tuple[str, RepoFinderOutcome]: """ Generate iterator from _find_repo that attempts to retrieve a repository URL that matches the passed artifact. @@ -23,6 +25,6 @@ def find_repo(self, purl: PackageURL) -> str: Returns ------- - str : - The URL of the found repository. + tuple[str, RepoFinderOutcome] : + A tuple of the found URL (or an empty string), and the outcome of the Repo Finder. """ diff --git a/src/macaron/repo_finder/repo_finder_deps_dev.py b/src/macaron/repo_finder/repo_finder_deps_dev.py index d66aaaebf..c2aaf61ef 100644 --- a/src/macaron/repo_finder/repo_finder_deps_dev.py +++ b/src/macaron/repo_finder/repo_finder_deps_dev.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the PythonRepoFinderDD class to be used for finding repositories using deps.dev.""" @@ -12,6 +12,7 @@ from macaron.json_tools import json_extract from macaron.repo_finder.repo_finder_base import BaseRepoFinder +from macaron.repo_finder.repo_finder_enums import RepoFinderOutcome from macaron.repo_finder.repo_validator import find_valid_repository_url from macaron.slsa_analyzer.git_url import clean_url from macaron.util import send_get_http_raw @@ -39,7 +40,7 @@ class DepsDevRepoFinder(BaseRepoFinder): # See https://docs.deps.dev/api/v3alpha/ BASE_URL = "https://api.deps.dev/v3alpha/purl/" - def find_repo(self, purl: PackageURL) -> str: + def find_repo(self, purl: PackageURL) -> tuple[str, RepoFinderOutcome]: """ Attempt to retrieve a repository URL that matches the passed artifact. @@ -50,31 +51,31 @@ def find_repo(self, purl: PackageURL) -> str: Returns ------- - str : - The URL of the found repository. + tuple[str, RepoFinderOutcome] : + A tuple of the found URL (or an empty string), and the outcome of the Repo Finder. """ - request_urls = self._create_urls(purl) + request_urls, outcome = self._create_urls(purl) if not request_urls: logger.debug("No urls found for: %s", purl) - return "" + return "", outcome json_data = self._retrieve_json(request_urls[0]) if not json_data: logger.debug("Failed to retrieve json data for: %s", purl) - return "" + return "", RepoFinderOutcome.DDEV_JSON_FETCH_ERROR - urls = self._read_json(json_data) + urls, outcome = self._read_json(json_data) if not urls: logger.debug("Failed to extract repository URLs from json data: %s", purl) - return "" + return "", outcome logger.debug("Found %s urls: %s", len(urls), urls) url = find_valid_repository_url(urls) if url: logger.debug("Found valid url: %s", url) - return url + return url, RepoFinderOutcome.FOUND - return "" + return "", RepoFinderOutcome.DDEV_NO_URLS @staticmethod def get_project_info(project_url: str) -> dict[str, Any] | None: @@ -112,7 +113,7 @@ def get_project_info(project_url: str) -> dict[str, Any] | None: return response_json @staticmethod - def get_latest_version(purl: PackageURL) -> PackageURL | None: + def get_latest_version(purl: PackageURL) -> tuple[PackageURL | None, RepoFinderOutcome]: """Return a PURL representing the latest version of the passed artifact. Parameters @@ -122,8 +123,8 @@ def get_latest_version(purl: PackageURL) -> PackageURL | None: Returns ------- - PackageURL | None - The latest version of the PURL, or None if it could not be found. + tuple[PackageURL | None, RepoFinderOutcome] + The latest version of the PURL, or None if it could not be found, and the outcome to report. """ if purl.version: namespace = purl.namespace + "/" if purl.namespace else "" @@ -133,26 +134,29 @@ def get_latest_version(purl: PackageURL) -> PackageURL | None: response = send_get_http_raw(url) if not response: - return None + return None, RepoFinderOutcome.DDEV_BAD_RESPONSE try: metadata: dict = json.loads(response.text) except ValueError as error: logger.debug("Failed to parse response from deps.dev: %s", error) - return None + return None, RepoFinderOutcome.DDEV_JSON_FETCH_ERROR versions_keys = ["package", "versions"] if "package" in metadata else ["version"] versions = json_extract(metadata, versions_keys, list) if not versions: - return None + return None, RepoFinderOutcome.DDEV_JSON_INVALID latest_version = json_extract(versions[-1], ["versionKey", "version"], str) if not latest_version: - return None + return None, RepoFinderOutcome.DDEV_JSON_INVALID namespace = purl.namespace + "/" if purl.namespace else "" - return PackageURL.from_string(f"pkg:{purl.type}/{namespace}{purl.name}@{latest_version}") + return ( + PackageURL.from_string(f"pkg:{purl.type}/{namespace}{purl.name}@{latest_version}"), + RepoFinderOutcome.FOUND_FROM_LATEST, + ) - def _create_urls(self, purl: PackageURL) -> list[str]: + def _create_urls(self, purl: PackageURL) -> tuple[list[str], RepoFinderOutcome]: """ Create the urls to search for the metadata relating to the passed artifact. @@ -168,13 +172,14 @@ def _create_urls(self, purl: PackageURL) -> list[str]: list[str] The list of created URLs. """ + outcome = None if not purl.version: - latest_purl = DepsDevRepoFinder.get_latest_version(purl) + latest_purl, outcome = DepsDevRepoFinder.get_latest_version(purl) if not latest_purl: - return [] + return [], outcome purl = latest_purl - return [f"{DepsDevRepoFinder.BASE_URL}{encode(str(purl), safe='')}"] + return [f"{DepsDevRepoFinder.BASE_URL}{encode(str(purl), safe='')}"], outcome or RepoFinderOutcome.FOUND def _retrieve_json(self, url: str) -> str: """ @@ -197,7 +202,7 @@ def _retrieve_json(self, url: str) -> str: return response.text - def _read_json(self, json_data: str) -> list[str]: + def _read_json(self, json_data: str) -> tuple[list[str], RepoFinderOutcome]: """ Parse the deps.dev json file and extract the repository links. @@ -208,20 +213,20 @@ def _read_json(self, json_data: str) -> list[str]: Returns ------- - list[str] : - The extracted contents as a list of strings. + tuple[list[str], RepoFinderOutcome] : + The extracted contents as a list, and the outcome to report. """ try: parsed = json.loads(json_data) except ValueError as error: logger.debug("Failed to parse response from deps.dev: %s", error) - return [] + return [], RepoFinderOutcome.DDEV_JSON_FETCH_ERROR links_keys = ["version", "links"] if "version" in parsed else ["links"] links = json_extract(parsed, links_keys, list) if not links: logger.debug("Could not extract 'version' or 'links' from deps.dev response.") - return [] + return [], RepoFinderOutcome.DDEV_JSON_INVALID result = [] for item in links: @@ -229,4 +234,4 @@ def _read_json(self, json_data: str) -> list[str]: if url and isinstance(url, str): result.append(url) - return result + return result, RepoFinderOutcome.FOUND diff --git a/src/macaron/repo_finder/repo_finder_enums.py b/src/macaron/repo_finder/repo_finder_enums.py new file mode 100644 index 000000000..24299f097 --- /dev/null +++ b/src/macaron/repo_finder/repo_finder_enums.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This module contains Enums used to represent the outcome of Repo Finder or Commit Finder executions.""" +from enum import Enum + + +class RepoFinderOutcome(Enum): + """An Enum of all outcomes of the Repo Finder being run for a software component.""" + + # States that relate to problems with user input. + NO_MAVEN_HOST_PROVIDED = "No maven host provided" + NO_POM_TAGS_PROVIDED = "No POM tags provided" + NO_VERSION_PROVIDED = "No version provided" + UNSUPPORTED_PACKAGE_TYPE = "Unsupported package type" + + # States that relate to the target POM (Java). + POM_READ_ERROR = "POM read error" + + # States that relate to the SCM in the POM (Java). + SCM_NO_URLS = "SCM no URLs" + SCM_NO_VALID_URLS = "SCM no valid URLs" + + # States that relate to HTTP requests. + HTTP_INVALID = "HTTP invalid" + HTTP_NOT_FOUND = "HTTP not found" + HTTP_FORBIDDEN = "HTTP forbidden" + HTTP_OTHER = "HTTP other" + + # States that relate to deps.dev (Non-Java). + DDEV_BAD_RESPONSE = "deps.dev bad response" + DDEV_JSON_FETCH_ERROR = "deps.dev fetch error" + DDEV_JSON_INVALID = "deps.dev JSON invalid" + DDEV_NO_URLS = "deps.dev no URLs" + + # Version related states. + NO_NEWER_VERSION = "No newer version than provided which failed" + + # Success states. + FOUND = "Found" + FOUND_FROM_PARENT = "Found from parent" + FOUND_FROM_LATEST = "Found form latest" + + # Default state. + NOT_USED = "Not used" + + +class CommitFinderOutcome(Enum): + """An Enum of all outcomes of the Commit Finder being run for a software component.""" + + # States that relate to problems with user input. + NO_VERSION_PROVIDED = "No version provided" + UNSUPPORTED_PURL_TYPE = "Unsupported PURL type" + + # States that relate to repository type PURLs. + REPO_PURL_FAILURE = "Repository PURL failure" + + # States that relate to artifact type PURLs. + NO_TAGS = "No tags" + NO_TAGS_WITH_COMMITS = "No tags with commits" + NO_TAG_COMMIT = "No tag commit" + INVALID_PURL = "No valid parts" + REGEX_COMPILE_FAILURE = "Regex compile failure" + NO_TAGS_MATCHED = "No tags matched" + + # Success state. + MATCHED = "Matched" + + # Default state. + NOT_USED = "Not used" diff --git a/src/macaron/repo_finder/repo_finder_java.py b/src/macaron/repo_finder/repo_finder_java.py index e6f349d3b..5fd100b68 100644 --- a/src/macaron/repo_finder/repo_finder_java.py +++ b/src/macaron/repo_finder/repo_finder_java.py @@ -1,9 +1,10 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the JavaRepoFinder class to be used for finding Java repositories.""" import logging import re +import urllib.parse from xml.etree.ElementTree import Element # nosec from packageurl import PackageURL @@ -12,6 +13,7 @@ from macaron.parsers.pomparser import parse_pom_string from macaron.repo_finder.repo_finder_base import BaseRepoFinder from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder +from macaron.repo_finder.repo_finder_enums import RepoFinderOutcome from macaron.repo_finder.repo_validator import find_valid_repository_url from macaron.util import send_get_http_raw @@ -25,7 +27,7 @@ def __init__(self) -> None: """Initialise the Java repository finder instance.""" self.pom_element: Element | None = None - def find_repo(self, purl: PackageURL) -> str: + def find_repo(self, purl: PackageURL) -> tuple[str, RepoFinderOutcome]: """ Attempt to retrieve a repository URL that matches the passed artifact. @@ -36,52 +38,68 @@ def find_repo(self, purl: PackageURL) -> str: Yields ------ - str : - The URL of the found repository. + tuple[str, RepoFinderOutcome] : + A tuple of the found URL (or an empty string), and the outcome of the Repo Finder. """ + # Check POM tags exist. + tags = defaults.get_list("repofinder.java", "repo_pom_paths") + if not tags: + logger.debug("No POM tags found for URL discovery.") + return "", RepoFinderOutcome.NO_POM_TAGS_PROVIDED + + group = purl.namespace or "" + artifact = purl.name + version = purl.version or "" + + if not version: + logger.debug("Version missing for maven artifact: %s:%s", group, artifact) + # TODO add support for Java artifacts without a version + return "", RepoFinderOutcome.NO_VERSION_PROVIDED + # Perform the following in a loop: # - Create URLs for the current artifact POM # - Parse the POM # - Try to extract SCM metadata and return URLs # - Try to extract parent information and change current artifact to it # - Repeat - group = purl.namespace or "" - artifact = purl.name - version = purl.version or "" limit = defaults.getint("repofinder.java", "parent_limit", fallback=10) + initial_limit = limit + last_outcome = RepoFinderOutcome.FOUND + check_parents = defaults.getboolean("repofinder.java", "find_parents") if not version: logger.info("Version missing for maven artifact: %s:%s", group, artifact) - latest_purl = DepsDevRepoFinder().get_latest_version(purl) + latest_purl, outcome = DepsDevRepoFinder().get_latest_version(purl) if not latest_purl or not latest_purl.version: logger.debug("Could not find version for artifact: %s:%s", purl.namespace, purl.name) - return "" + return "", outcome group = latest_purl.namespace or "" artifact = latest_purl.name version = latest_purl.version while group and artifact and version and limit > 0: - # Create the URLs for retrieving the artifact's POM + # Create the URLs for retrieving the artifact's POM. group = group.replace(".", "/") request_urls = self._create_urls(group, artifact, version) if not request_urls: - # Abort if no URLs were created + # Abort if no URLs were created. logger.debug("Failed to create request URLs for %s:%s:%s", group, artifact, version) - return "" + return "", RepoFinderOutcome.NO_MAVEN_HOST_PROVIDED - # Try each POM URL in order, terminating early if a match is found + # Try each POM URL in order, terminating early if a match is found. pom = "" + pom_outcome = RepoFinderOutcome.FOUND for request_url in request_urls: - pom = self._retrieve_pom(request_url) + pom, pom_outcome = self._retrieve_pom(request_url) if pom != "": break if pom == "": - # Abort if no POM was found + # Abort if no POM was found. logger.debug("No POM found for %s:%s:%s", group, artifact, version) - return "" + return "", pom_outcome - urls = self._read_pom(pom) + urls, read_outcome = self._read_pom(pom, tags) if urls: # If the found URLs fail to validate, finding can continue on to the next parent POM @@ -89,18 +107,25 @@ def find_repo(self, purl: PackageURL) -> str: url = find_valid_repository_url(urls) if url: logger.debug("Found valid url: %s", url) - return url + return url, ( + RepoFinderOutcome.FOUND if initial_limit == limit else RepoFinderOutcome.FOUND_FROM_PARENT + ) - if defaults.getboolean("repofinder.java", "find_parents") and self.pom_element is not None: - # Attempt to extract parent information from POM + # No valid URLs were found from this POM. + last_outcome = RepoFinderOutcome.SCM_NO_VALID_URLS + else: + last_outcome = read_outcome + + if check_parents and self.pom_element is not None: + # Attempt to extract parent information from POM. group, artifact, version = self._find_parent(self.pom_element) else: break limit = limit - 1 - # Nothing found - return "" + # Nothing found. + return "", last_outcome def _create_urls(self, group: str, artifact: str, version: str) -> list[str]: """ @@ -127,10 +152,22 @@ def _create_urls(self, group: str, artifact: str, version: str) -> list[str]: ) urls = [] for repo in repositories: - urls.append(f"{repo}/{group}/{artifact}/{version}/{artifact}-{version}.pom") + repo_url = urllib.parse.urlparse(repo) + pom_url = urllib.parse.ParseResult( + scheme=repo_url.scheme, + netloc=repo_url.netloc, + path=( + ((repo_url.path + "/") if repo_url.path else "") + + "/".join([group, artifact, version, f"{artifact}-{version}.pom"]) + ), + params="", + query="", + fragment="", + ).geturl() + urls.append(pom_url) return urls - def _retrieve_pom(self, url: str) -> str: + def _retrieve_pom(self, url: str) -> tuple[str, RepoFinderOutcome]: """ Attempt to retrieve the file located at the passed URL. @@ -141,18 +178,26 @@ def _retrieve_pom(self, url: str) -> str: Returns ------- - str : - The retrieved file data or an empty string. + tuple[str, RepoFinderOutcome] : + The retrieved file data or an empty string, and the outcome to report. """ - response = send_get_http_raw(url, {}) + response = send_get_http_raw(url, always_return_response=True) if not response: - return "" + return "", RepoFinderOutcome.HTTP_INVALID + + if response.status_code == 404: + return "", RepoFinderOutcome.HTTP_NOT_FOUND + if response.status_code == 403: + return "", RepoFinderOutcome.HTTP_FORBIDDEN + if response.status_code != 200: + logger.debug("Failed to retrieve POM: HTTP %s", response.status_code) + return "", RepoFinderOutcome.HTTP_OTHER logger.debug("Found artifact POM at: %s", url) - return response.text + return response.text, RepoFinderOutcome.FOUND - def _read_pom(self, pom: str) -> list[str]: + def _read_pom(self, pom: str, tags: list[str]) -> tuple[list[str], RepoFinderOutcome]: """ Parse the passed pom and extract the relevant tags. @@ -163,23 +208,18 @@ def _read_pom(self, pom: str) -> list[str]: Returns ------- - list[str] : - The extracted contents as a list of strings. + tuple[list[str], RepoFinderOutcome] : + A tuple of the found URLs, or an empty list, and the outcome to report. """ - # Retrieve tags - tags = defaults.get_list("repofinder.java", "repo_pom_paths") - if not any(tags): - logger.debug("No POM tags found for URL discovery.") - return [] - - # Parse POM using defusedxml + # Parse POM using defusedxml. pom_element = parse_pom_string(pom) if pom_element is None: - return [] + return [], RepoFinderOutcome.POM_READ_ERROR self.pom_element = pom_element - # Attempt to extract SCM data and return URL - return self._find_scm(pom_element, tags) + # Attempt to extract SCM data and return URL. + results = self._find_scm(pom_element, tags) + return results, RepoFinderOutcome.FOUND if results else RepoFinderOutcome.SCM_NO_URLS def _find_scm(self, pom: Element, tags: list[str], resolve_properties: bool = True) -> list[str]: """ @@ -206,8 +246,8 @@ def _find_scm(self, pom: Element, tags: list[str], resolve_properties: bool = Tr element: Element | None = pom if tag.startswith("properties."): - # Tags under properties are often "." separated - # These can be safely split into two resulting tags as nested tags are not allowed here + # Tags under properties are often "." separated. + # These can be safely split into two resulting tags as nested tags are not allowed here. tag_parts = ["properties", tag[11:]] else: # Other tags can be split into distinct elements via "." @@ -218,10 +258,10 @@ def _find_scm(self, pom: Element, tags: list[str], resolve_properties: bool = Tr if element is None: break if index == len(tag_parts) - 1 and element.text: - # Add the contents of the final tag + # Add the contents of the final tag. results.append(element.text.strip()) - # Resolve any Maven properties within the results + # Resolve any Maven properties within the results. if resolve_properties: results = self._resolve_properties(pom, results) @@ -281,20 +321,20 @@ def _resolve_properties(self, pom: Element, values: list[str]) -> list[str]: resolved_values = [] for value in values: replacements: list = [] - # Calculate replacements - matches any number of ${...} entries in the current value + # Calculate replacements - matches any number of ${...} entries in the current value. for match in re.finditer("\\$\\{[^}]+}", value): text = match.group().replace("$", "").replace("{", "").replace("}", "") if text.startswith("project."): text = text.replace("project.", "") else: text = f"properties.{text}" - # Call find_scm with property resolution flag set to False to prevent the possibility of endless looping + # Call find_scm with property resolution flag as False to prevent the possibility of endless looping. result = self._find_scm(pom, [text], False) if not result: break replacements.append([match.start(), result[0], match.end()]) - # Apply replacements in reverse order + # Apply replacements in reverse order. # E.g. # git@github.com:owner/project${javac.src.version}-${project.inceptionYear}.git # -> diff --git a/src/macaron/repo_finder/repo_utils.py b/src/macaron/repo_finder/repo_utils.py index 467776673..0f9ca2683 100644 --- a/src/macaron/repo_finder/repo_utils.py +++ b/src/macaron/repo_finder/repo_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the utility functions for repo and commit finder operations.""" diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index 894c82134..47e073b3f 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module handles the cloning and analyzing a Git repo.""" @@ -24,7 +24,7 @@ from macaron.config.global_config import global_config from macaron.config.target_config import Configuration from macaron.database.database_manager import DatabaseManager, get_db_manager, get_db_session -from macaron.database.table_definitions import Analysis, Component, ProvenanceSubject, Repository +from macaron.database.table_definitions import Analysis, Component, ProvenanceSubject, RepoFinderMetadata, Repository from macaron.dependency_analyzer.cyclonedx import DependencyAnalyzer, DependencyInfo from macaron.errors import ( DuplicateError, @@ -44,6 +44,7 @@ ) from macaron.repo_finder.provenance_finder import ProvenanceFinder, find_provenance_from_ci from macaron.repo_finder.repo_finder import prepare_repo +from macaron.repo_finder.repo_finder_enums import CommitFinderOutcome, RepoFinderOutcome from macaron.repo_finder.repo_utils import get_git_service from macaron.repo_verifier.repo_verifier import verify_repo from macaron.slsa_analyzer import git_url @@ -378,8 +379,10 @@ def run_single( # Prepare the repo. git_obj = None + commit_finder_outcome = CommitFinderOutcome.NOT_USED + final_digest = analysis_target.digest if analysis_target.repo_path: - git_obj = prepare_repo( + git_obj, commit_finder_outcome, final_digest = prepare_repo( os.path.join(self.output_path, GIT_REPOS_DIR), analysis_target.repo_path, analysis_target.branch, @@ -387,6 +390,13 @@ def run_single( analysis_target.parsed_purl, ) + repo_finder_metadata = RepoFinderMetadata( + repo_finder_outcome=analysis_target.repo_finder_outcome, + commit_finder_outcome=commit_finder_outcome, + found_url=analysis_target.repo_path, + found_commit=final_digest, + ) + # Check if only one of the repo or digest came from direct input. if git_obj and (provenance_repo_url or provenance_commit_digest) and parsed_purl: if check_if_input_purl_provenance_conflict( @@ -410,6 +420,7 @@ def run_single( analysis, analysis_target, git_obj, + repo_finder_metadata, existing_records, provenance_payload, ) @@ -614,11 +625,15 @@ class AnalysisTarget(NamedTuple): #: The digest of the commit to analyze. digest: str + #: The outcome of the Repo Finder on this analysis target. + repo_finder_outcome: RepoFinderOutcome + def add_component( self, analysis: Analysis, analysis_target: AnalysisTarget, git_obj: Git | None, + repo_finder_metadata: RepoFinderMetadata, existing_records: dict[str, Record] | None = None, provenance_payload: InTotoPayload | None = None, ) -> Component: @@ -635,6 +650,8 @@ def add_component( The target of this analysis. git_obj: Git | None The pydriller.Git object of the repository. + repo_finder_metadata: RepoFinderMetadata + The Repo Finder metadata for this component. existing_records : dict[str, Record] | None The mapping of existing records that the analysis has run successfully. provenance_payload: InTotoVPayload | None @@ -694,6 +711,7 @@ def add_component( purl=str(purl), analysis=analysis, repository=repository, + repo_finder_metadata=repo_finder_metadata, ) if provenance_payload: @@ -777,6 +795,7 @@ def to_analysis_target( repo_path_input: str = config.get_value("path") input_branch: str = config.get_value("branch") input_digest: str = config.get_value("digest") + repo_finder_outcome = RepoFinderOutcome.NOT_USED match (parsed_purl, repo_path_input): case (None, ""): @@ -797,19 +816,21 @@ def to_analysis_target( repo_path=provenance_repo_url or "", branch="", digest=provenance_commit_digest or "", + repo_finder_outcome=repo_finder_outcome, ) # As there is no repo or commit from provenance, use the Repo Finder to find the repo. converted_repo_path = repo_finder.to_repo_path(parsed_purl, available_domains) if converted_repo_path is None: # Try to find repo from PURL - repo = repo_finder.find_repo(parsed_purl) + repo, repo_finder_outcome = repo_finder.find_repo(parsed_purl) return Analyzer.AnalysisTarget( parsed_purl=parsed_purl, repo_path=converted_repo_path or repo or "", branch=input_branch, digest=input_digest, + repo_finder_outcome=repo_finder_outcome, ) case (_, _) | (None, _): @@ -828,6 +849,7 @@ def to_analysis_target( repo_path=repo_path_input, branch=input_branch, digest=input_digest, + repo_finder_outcome=repo_finder_outcome, ) return Analyzer.AnalysisTarget( @@ -835,6 +857,7 @@ def to_analysis_target( repo_path=repo_path_input, branch=input_branch, digest=provenance_commit_digest or "", + repo_finder_outcome=repo_finder_outcome, ) case _: diff --git a/src/macaron/util.py b/src/macaron/util.py index 047d14125..c90f534e7 100644 --- a/src/macaron/util.py +++ b/src/macaron/util.py @@ -126,7 +126,11 @@ def send_head_http_raw( def send_get_http_raw( - url: str, headers: dict | None = None, timeout: int | None = None, allow_redirects: bool = True + url: str, + headers: dict | None = None, + timeout: int | None = None, + allow_redirects: bool = True, + always_return_response: bool = False, ) -> Response | None: """Send the GET HTTP request with the given url and headers. @@ -179,7 +183,7 @@ def send_get_http_raw( if response.status_code == 403: check_rate_limit(response) else: - return None + return None if not always_return_response else response retry_counter = retry_counter - 1 response = requests.get( url=url, diff --git a/tests/conftest.py b/tests/conftest.py index d6b83bd78..b47aa7269 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,7 @@ import macaron from macaron.code_analyzer.call_graph import BaseNode, CallGraph from macaron.config.defaults import create_defaults, defaults, load_defaults -from macaron.database.table_definitions import Analysis, Component, Repository +from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata, Repository from macaron.parsers.bashparser import BashScriptType, create_bash_node from macaron.parsers.github_workflow_model import Identified, Job, NormalJob, RunStep, Workflow from macaron.slsa_analyzer.analyze_context import AnalyzeContext @@ -413,6 +413,7 @@ def __init__( # Must match test_provenance_finder.MockGit.MockTag.commit. commit_sha="dig", ), + repo_finder_metadata=RepoFinderMetadata(), ) super().__init__(component, *args, **kwargs) diff --git a/tests/dependency_analyzer/cyclonedx/test_cyclonedx.py b/tests/dependency_analyzer/cyclonedx/test_cyclonedx.py index df1eeacb0..ce421f56c 100644 --- a/tests/dependency_analyzer/cyclonedx/test_cyclonedx.py +++ b/tests/dependency_analyzer/cyclonedx/test_cyclonedx.py @@ -9,7 +9,7 @@ from cyclonedx.model.component import Component as CDXComponent from macaron.config.defaults import defaults, load_defaults -from macaron.database.table_definitions import Analysis, Component, Repository +from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata, Repository from macaron.dependency_analyzer.cyclonedx import CycloneDXParserError, DependencyInfo, deserialize_bom_json from macaron.dependency_analyzer.cyclonedx_mvn import CycloneDxMaven from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool @@ -65,6 +65,7 @@ def test_get_dep_components_java( purl="pkg:maven/io.micronaut.aws/aws-parent@4.0.0-SNAPSHOT?type=pom", analysis=Analysis(), repository=Repository(complete_name="github.com/micronaut-projects/micronaut-aws", fs_path=""), + repo_finder_metadata=RepoFinderMetadata(), ) # Path to the sub-project bom.json files. @@ -107,6 +108,7 @@ def test_get_dep_components_python( purl="pkg:pypi/requests@2.31.0", analysis=Analysis(), repository=Repository(complete_name="github.com/psf/requests", fs_path=""), + repo_finder_metadata=RepoFinderMetadata(), ) # Path to the sub-project bom.json files. @@ -144,6 +146,7 @@ def test_convert_components_to_artifacts_java( purl="pkg:maven/io.micronaut.aws/aws-parent@4.0.0-SNAPSHOT?type=pom", analysis=Analysis(), repository=Repository(complete_name="github.com/micronaut-projects/micronaut-aws", fs_path=""), + repo_finder_metadata=RepoFinderMetadata(), ) # Path to the sub-project bom.json files. @@ -177,6 +180,7 @@ def test_convert_components_to_artifacts_python( purl="pkg:pypi/requests@2.31.0", analysis=Analysis(), repository=Repository(complete_name="github.com/psf/requests", fs_path=""), + repo_finder_metadata=RepoFinderMetadata(), ) # Pass the root bom.json. @@ -210,6 +214,7 @@ def test_low_quality_bom( purl="pkg:maven/com.amazonaws/aws-lambda-java-events@3.11.0?type=jar", analysis=Analysis(), repository=Repository(complete_name="github.com/aws/aws-lambda-java-libs", fs_path=""), + repo_finder_metadata=RepoFinderMetadata(), ) result = dep_analyzer.get_deps_from_sbom(bom_path, target_component=component) assert snapshot == result @@ -236,6 +241,7 @@ def test_multiple_versions( purl="pkg:maven/com.amazonaws/aws-lambda-java-events@3.11.0?type=jar", analysis=Analysis(), repository=Repository(complete_name="github.com/aws/aws-lambda-java-libs", fs_path=""), + repo_finder_metadata=RepoFinderMetadata(), ) result = dep_analyzer.get_deps_from_sbom(bom_path, target_component=component) assert snapshot == result @@ -250,6 +256,7 @@ def test_custom_sbom_name_with_maven() -> None: purl="pkg:maven/com.example/cyclonedx-test@1.0-SNAPSHOT?type=jar", analysis=Analysis(), repository=None, + repo_finder_metadata=RepoFinderMetadata(), ) custom_bom_dir = RESOURCES_DIR.joinpath("sbom_name_tests") assert cyclonedx.collect_dependencies(str(custom_bom_dir.joinpath("single_named_sbom")), target_component=component) diff --git a/tests/integration/cases/commit_finder_tag_matching_functionality/commit_finder.py b/tests/integration/cases/commit_finder_tag_matching_functionality/commit_finder.py index 105af78b4..857acb0d0 100644 --- a/tests/integration/cases/commit_finder_tag_matching_functionality/commit_finder.py +++ b/tests/integration/cases/commit_finder_tag_matching_functionality/commit_finder.py @@ -30,7 +30,7 @@ def test_commit_finder() -> int: artifacts = item["artifacts"] for artifact in artifacts: purl = PackageURL.from_string(artifact["purl"]) - matched_tags = commit_finder.match_tags(item["tags"], purl.name, purl.version or "") + matched_tags, _ = commit_finder.match_tags(item["tags"], purl.name, purl.version or "") matched_tag = matched_tags[0] if matched_tags else "" expected = str(artifact["match"]) if matched_tag != expected: diff --git a/tests/integration/cases/repo_finder_remote_calls/repo_finder.py b/tests/integration/cases/repo_finder_remote_calls/repo_finder.py index f529cb771..d6ba2081b 100644 --- a/tests/integration/cases/repo_finder_remote_calls/repo_finder.py +++ b/tests/integration/cases/repo_finder_remote_calls/repo_finder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This script tests the functionality of the repo finder's remote API calls.""" @@ -13,6 +13,7 @@ from macaron.repo_finder import repo_validator from macaron.repo_finder.repo_finder import find_repo from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder +from macaron.repo_finder.repo_finder_enums import RepoFinderOutcome from macaron.slsa_analyzer.git_url import clean_url logger: logging.Logger = logging.getLogger(__name__) @@ -43,27 +44,33 @@ def test_repo_finder() -> int: defaults.set("git_service.gitlab", "hostname", "gitlab.com") # Test Java package with SCM metadata in artifact POM. - if not find_repo(PackageURL.from_string("pkg:maven/com.fasterxml.jackson.core/jackson-databind@2.14.2")): + match, outcome = find_repo(PackageURL.from_string("pkg:maven/com.fasterxml.jackson.core/jackson-databind@2.14.2")) + if not match or outcome != RepoFinderOutcome.FOUND: return os.EX_UNAVAILABLE # Test Java package with SCM metadata in artifact's parent POM. - if not find_repo(PackageURL.from_string("pkg:maven/commons-cli/commons-cli@1.5.0")): + match, outcome = find_repo(PackageURL.from_string("pkg:maven/commons-cli/commons-cli@1.5.0")) + if not match or outcome != RepoFinderOutcome.FOUND: return os.EX_UNAVAILABLE # Test deps.dev API for a Python package. - if not find_repo(PackageURL.from_string("pkg:pypi/packageurl-python@0.11.1")): + match, outcome = find_repo(PackageURL.from_string("pkg:pypi/packageurl-python@0.11.1")) + if not match or outcome != RepoFinderOutcome.FOUND: return os.EX_UNAVAILABLE # Test deps.dev API for a Nuget package. - if not find_repo(PackageURL.from_string("pkg:nuget/azure.core")): + match, outcome = find_repo(PackageURL.from_string("pkg:nuget/azure.core")) + if not match or outcome != RepoFinderOutcome.FOUND: return os.EX_UNAVAILABLE # Test deps.dev API for an NPM package. - if not find_repo(PackageURL.from_string("pkg:npm/@colors/colors")): + match, outcome = find_repo(PackageURL.from_string("pkg:npm/@colors/colors")) + if not match or outcome != RepoFinderOutcome.FOUND: return os.EX_UNAVAILABLE # Test deps.dev API for Cargo package. - if not find_repo(PackageURL.from_string("pkg:cargo/rand_core")): + match, outcome = find_repo(PackageURL.from_string("pkg:cargo/rand_core")) + if not match or outcome != RepoFinderOutcome.FOUND: return os.EX_UNAVAILABLE # Test redirecting URL from Apache commons-io package. @@ -73,17 +80,18 @@ def test_repo_finder() -> int: # Test Java package whose SCM metadata only points to the repo in later versions than is provided here. purl = PackageURL.from_string("pkg:maven/io.vertx/vertx-auth-common@3.8.0") - repo = find_repo(purl) + repo, _ = find_repo(purl) if repo == "https://github.com/eclipse-vertx/vertx-auth": return os.EX_UNAVAILABLE - latest_purl = DepsDevRepoFinder().get_latest_version(purl) + latest_purl, _ = DepsDevRepoFinder().get_latest_version(purl) assert latest_purl - repo = find_repo(latest_purl) + repo, _ = find_repo(latest_purl) if repo != "https://github.com/eclipse-vertx/vertx-auth": return os.EX_UNAVAILABLE # Test Java package that has no version. - if not find_repo(PackageURL.from_string("pkg:maven/io.vertx/vertx-auth-common")): + match, outcome = find_repo(PackageURL.from_string("pkg:maven/io.vertx/vertx-auth-common")) + if not match or outcome != RepoFinderOutcome.FOUND: return os.EX_UNAVAILABLE return os.EX_OK diff --git a/tests/malware_analyzer/pypi/conftest.py b/tests/malware_analyzer/pypi/conftest.py index a5f775531..4a583fda3 100644 --- a/tests/malware_analyzer/pypi/conftest.py +++ b/tests/malware_analyzer/pypi/conftest.py @@ -7,7 +7,7 @@ import pytest -from macaron.database.table_definitions import Analysis, Component +from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry @@ -23,5 +23,7 @@ def pypi_package_json() -> MagicMock: pypi_registry = MagicMock(spec=PyPIRegistry) pypi_package = MagicMock(spec=PyPIPackageJsonAsset) pypi_package.pypi_registry = pypi_registry - pypi_package.component = Component(purl="pkg:pypi/package", analysis=Analysis(), repository=None) + pypi_package.component = Component( + purl="pkg:pypi/package", analysis=Analysis(), repository=None, repo_finder_metadata=RepoFinderMetadata() + ) return pypi_package diff --git a/tests/repo_finder/test_commit_finder.py b/tests/repo_finder/test_commit_finder.py index 45fa15aea..47c5ee6a2 100644 --- a/tests/repo_finder/test_commit_finder.py +++ b/tests/repo_finder/test_commit_finder.py @@ -6,48 +6,52 @@ import os import re import shutil +from typing import Any import hypothesis import pytest from hypothesis import given, settings from hypothesis.strategies import DataObject, data, text from packageurl import PackageURL +from pydriller.git import Git from macaron.repo_finder import commit_finder from macaron.repo_finder.commit_finder import AbstractPurlType +from macaron.repo_finder.repo_finder_enums import CommitFinderOutcome from tests.slsa_analyzer.mock_git_utils import commit_files, initiate_repo logger: logging.Logger = logging.getLogger(__name__) BASE_DIR = os.path.dirname(os.path.abspath(__file__)) REPO_DIR = os.path.join(BASE_DIR, "mock_repos", "commit_finder/sample_repo") +UNICODE_VERSION = "雪" # The Japanese character for "snow". +TAG_VERSION = "2.3.4" +TAG_VERSION_2 = "4.5.2" -def test_get_commit_from_version() -> None: +@pytest.fixture(name="tag_list") +def tag_list_() -> list[str]: + """Return a list of tags.""" + return ["test-name-v1.0.1-A", "v1.0.3+test", "v_1.0.5", "50_0_2", "r78rv109", "1.0.5-JRE"] + + +@pytest.mark.parametrize( + ("version", "name", "tag_list_index"), + [ + ("1.0.1-A", "test-name-1", 0), + ("1.0.3+test", "test-name-2", 1), + ("1.0.5", "test-name-3", 2), + ("50.0.2", "test-name-4", 3), + ("78.109", "test-name-5", 4), + ("1.0.5-JRE", "test-name-6", 5), + ], +) +def test_get_commit_from_version(version: str, name: str, tag_list_index: int, tag_list: list[str]) -> None: """Test resolving commits from version tags.""" - versions = [ - "1.0.1-A", # To match a tag with a named suffix. - "1.0.3+test", # To match a tag with a '+' suffix. - "1.0.5", # To match a tag with a 'v_' prefix. - "50.0.2", # To match a tag separated by '_'. - "78.109", # To match a tag separated by characters 'r' 'rv'. - "1.0.5-JRE", # To NOT match the similar tag without the 'JRE' suffix. - ] - - tags = ["test-name-v1.0.1-A", "v1.0.3+test", "v_1.0.5", "50_0_2", "r78rv109", "1.0.5-JRE"] - - # Perform tests - purl_name = "test-name" - for count, value in enumerate(versions): - _test_version(tags, purl_name, value, tags[count]) - purl_name = "test-name" + "-" + str(count + 1) - - -def _test_version(tags: list[str], name: str, version: str, target_tag: str) -> None: - """Retrieve tag matching version and check it is correct.""" - matched_tags = commit_finder.match_tags(tags, name, version) + matched_tags, outcome = commit_finder.match_tags(tag_list, name, version) assert matched_tags - assert matched_tags[0] == target_tag + assert matched_tags[0] == tag_list[tag_list_index] + assert outcome == CommitFinderOutcome.MATCHED @pytest.mark.parametrize( @@ -87,8 +91,9 @@ def test_abstract_purl_type(purls: list[str], expected: AbstractPurlType) -> Non assert commit_finder.determine_abstract_purl_type(PackageURL.from_string(purl)) == expected -def test_commit_finder() -> None: - """Test commit finder using mocked repository.""" +@pytest.fixture(name="mocked_repo") +def mocked_repo_() -> Git: + """Create a mocked repository.""" if os.path.exists(REPO_DIR): shutil.rmtree(REPO_DIR) git_obj = initiate_repo( @@ -106,82 +111,141 @@ def test_commit_finder() -> None: file.write("A") commit_files(git_obj, ["file_1"]) - # Create a commit with no associated branch. - commit_0 = git_obj.repo.index.commit(message="Commit_0") + return git_obj - # No version in PURL. - assert not commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:maven/apache/maven")) - # Unsupported PURL type. - assert not commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:gem/ruby-artifact@1")) +@pytest.fixture(name="mocked_repo_commit") +def mocked_repo_commit_(mocked_repo: Git) -> Any: + """Add a commit to the mocked repository.""" + return mocked_repo.repo.index.commit(message="Commit_0") - # Hash not present in repository, tests hash and tag. - assert not commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:github/apache/maven@ab4ce3e")) - # Valid PURL but repository has no tags yet. - assert not commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:maven/apache/maven@1.0")) +@pytest.fixture(name="mocked_repo_empty_commit") +def mocked_repo_empty_commit_(mocked_repo: Git) -> Any: + """Add an empty commit to the mocked repository.""" + return mocked_repo.repo.index.commit(message="Empty_Commit") - # Additional setup is done here to avoid tainting earlier tests. + +@pytest.fixture(name="mocked_repo_expanded") +def mocked_repo_expanded_(mocked_repo: Git, mocked_repo_commit: Any, mocked_repo_empty_commit: Any) -> Any: + """Add tags to the mocked repository.""" + mocked_repo.repo.create_tag("4.5", mocked_repo_commit.hexsha) # Create a tag from a tree. - tag_tree_version = "1.0" - tree = git_obj.repo.heads.master.commit.tree - git_obj.repo.create_tag(tag_tree_version, ref=tree) + mocked_repo.repo.create_tag("1.0", ref=mocked_repo.repo.heads.master.commit.tree) - # Add a new tag with an associated commit. This is the Japanese character for 'snow'. - unicode_version = "雪" - git_obj.repo.create_tag(unicode_version, commit_0.hexsha) + # Add a tag with unicode version. + mocked_repo.repo.create_tag(UNICODE_VERSION, mocked_repo_commit.hexsha) # Create a more typical tag on the same commit. - tag_version = "2.3.4" - git_obj.repo.create_tag(tag_version, commit_0.hexsha) + mocked_repo.repo.create_tag(TAG_VERSION, mocked_repo_commit.hexsha) - # Add an empty commit with some tags. - empty_commit = git_obj.repo.index.commit("Empty commit.") - tag_version_2 = "4.5.2" - git_obj.repo.create_tag(f"{tag_version_2}-DEV", ref=empty_commit.hexsha) - git_obj.repo.create_tag(f"{tag_version_2}_DEV_RC1_RELEASE", ref=empty_commit.hexsha) - git_obj.repo.create_tag(f"rel/prefix_name-{tag_version}", ref=empty_commit.hexsha) + # Add more tags. + mocked_repo.repo.create_tag(f"{TAG_VERSION_2}-DEV", ref=mocked_repo_empty_commit.hexsha) + mocked_repo.repo.create_tag(f"{TAG_VERSION_2}_DEV_RC1_RELEASE", ref=mocked_repo_empty_commit.hexsha) + mocked_repo.repo.create_tag(f"rel/prefix_name-{TAG_VERSION}", ref=mocked_repo_empty_commit.hexsha) - # Version with a suffix and no matching tag. - assert not commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:maven/apache/maven@1-JRE")) + return mocked_repo - # Version with only one digit and no matching tag. - assert not commit_finder.find_commit(git_obj, PackageURL.from_string("pkg:maven/apache/maven@1")) - # Unicode version. - assert commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{unicode_version}")) +@pytest.mark.parametrize( + ("purl_string", "expected_outcome"), + [ + # No version in PURL. + ("pkg:maven/apache/maven", CommitFinderOutcome.NO_VERSION_PROVIDED), + # Unsupported PURL type. + ("pkg:gem/ruby-artifact@1", CommitFinderOutcome.UNSUPPORTED_PURL_TYPE), + # Hash not present in repository. + ("pkg:github/apache/maven@ab4ce3e", CommitFinderOutcome.REPO_PURL_FAILURE), + # Valid PURL but repository has no tags yet. + ("pkg:maven/apache/maven@1.0", CommitFinderOutcome.NO_TAGS), + ], +) +def test_commit_finder_tagless_failure( + mocked_repo: Git, purl_string: str, expected_outcome: CommitFinderOutcome +) -> None: + """Test commit finder using mocked repository with no tags.""" + match, outcome = commit_finder.find_commit(mocked_repo, PackageURL.from_string(purl_string)) + assert not match + assert outcome == expected_outcome - # Valid repository PURL. - digest = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:github/apache/maven@{commit_0.hexsha}")) - assert digest == commit_0.hexsha - # Valid artifact PURL. - digest = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version}")) - assert digest == commit_0.hexsha +@pytest.mark.parametrize( + ("purl_string", "expected_outcome"), + [ + # Invalid PURL. + ("pkg:maven/[]@()", CommitFinderOutcome.INVALID_PURL), + # Version with a suffix and no matching tag. + ("pkg:maven/apache/maven@1-JRE", CommitFinderOutcome.NO_TAGS_MATCHED), + # Version with only one digit and no matching tag. + ("pkg:maven/apache/maven@1", CommitFinderOutcome.NO_TAGS_MATCHED), + ], +) +def test_commit_finder_tag_failure( + mocked_repo_expanded: Git, purl_string: str, expected_outcome: CommitFinderOutcome +) -> None: + """Test commit finder using mocked repository with tags.""" + match, outcome = commit_finder.find_commit(mocked_repo_expanded, PackageURL.from_string(purl_string)) + assert not match + assert outcome == expected_outcome - # Valid artifact PURL with an alphanumeric suffix. - digest = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version}-RC1")) - assert digest == commit_0.hexsha - # Valid artifact PURL that should match a tag with a name prefix. - digest = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/prefix_name@{tag_version}")) - assert digest == empty_commit.hexsha +@pytest.mark.parametrize( + "purl_string", + [ + f"pkg:maven/apache/maven@{UNICODE_VERSION}", + f"pkg:maven/apache/maven@{TAG_VERSION}", + f"pkg:maven/apache/maven@{TAG_VERSION}-RC1", + ], +) +def test_commit_finder_success_commit( + mocked_repo_expanded: Git, + mocked_repo_commit: Any, + purl_string: str, +) -> None: + """Test Commit Finder on mocked repository that should match valid PURLs.""" + match, outcome = commit_finder.find_commit(mocked_repo_expanded, PackageURL.from_string(purl_string)) + assert match == mocked_repo_commit.hexsha + assert outcome == CommitFinderOutcome.MATCHED - # Valid artifact PURL that matches a version with a suffix, to a tag with the same suffix. - digest = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version_2}-DEV")) - assert digest == empty_commit.hexsha - # Valid artifact PURL that matches a version with a suffix, to a tag with the same suffix part in a multi-suffix. - digest = commit_finder.find_commit( - git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version_2}_RELEASE") +@pytest.mark.parametrize( + "purl_string", + [ + # Match name prefix. + f"pkg:maven/apache/prefix_name@{TAG_VERSION}", + # Match suffix. + f"pkg:maven/apache/maven@{TAG_VERSION_2}-DEV", + # Match suffix in multi-suffix. + f"pkg:maven/apache/maven@{TAG_VERSION_2}_RELEASE", + # Match alphanumeric suffix in multi-suffix. + f"pkg:maven/apache/maven@{TAG_VERSION_2}_RC1", + ], +) +def test_commit_finder_success_empty_commit( + mocked_repo_expanded: Git, mocked_repo_empty_commit: Any, purl_string: str +) -> None: + """Test Commit Finder on mocked repository that should match value PURLs.""" + match, outcome = commit_finder.find_commit(mocked_repo_expanded, PackageURL.from_string(purl_string)) + assert match == mocked_repo_empty_commit.hexsha + assert outcome == CommitFinderOutcome.MATCHED + + +def test_commit_finder_repo_purl_success(mocked_repo_expanded: Git, mocked_repo_commit: Any) -> None: + """Test Commit Finder on mocked repository using a repo type PURL.""" + match, outcome = commit_finder.find_commit( + mocked_repo_expanded, PackageURL.from_string(f"pkg:github/apache/maven@{mocked_repo_commit.hexsha}") ) - assert digest == empty_commit.hexsha + assert match == mocked_repo_commit.hexsha + assert outcome == CommitFinderOutcome.MATCHED + - # Valid artifact PURL that matches a version with an alphanumeric suffix, to a tag with the same suffix part in a - # multi-suffix. - digest = commit_finder.find_commit(git_obj, PackageURL.from_string(f"pkg:maven/apache/maven@{tag_version_2}_RC1")) - assert digest == empty_commit.hexsha +def test_commit_finder_tag_no_commit(mocked_repo: Git) -> None: + """Test the Commit Finder on a mocked repository that has a tag with no commit.""" + mocked_repo.repo.create_tag("TEST", ref=mocked_repo.repo.heads.master.commit.tree) + match, outcome = commit_finder.find_commit(mocked_repo, PackageURL.from_string("pkg:maven/apache/maven@TEST")) + assert not match + assert outcome == CommitFinderOutcome.NO_TAGS_WITH_COMMITS @given(text()) @@ -226,7 +290,7 @@ def test_version_to_tag_matching(_data: DataObject) -> None: # noqa: PT019 if not purl.version: return # Build the pattern from the version. - pattern, parts = commit_finder._build_version_pattern(purl.name, purl.version) + pattern, parts, _ = commit_finder._build_version_pattern(purl.name, purl.version) if not pattern: return # Generate the tag from a pattern that is very similar to how version patterns are made. diff --git a/tests/repo_finder/test_repo_finder.py b/tests/repo_finder/test_repo_finder.py index ba0bc2b20..de73c8a03 100644 --- a/tests/repo_finder/test_repo_finder.py +++ b/tests/repo_finder/test_repo_finder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module tests the repo finder.""" @@ -6,13 +6,32 @@ from pathlib import Path import pytest +from packageurl import PackageURL +from pytest_httpserver import HTTPServer from macaron.config.defaults import load_defaults -from macaron.repo_finder.repo_finder_java import JavaRepoFinder +from macaron.repo_finder import repo_finder +from macaron.repo_finder.repo_finder_enums import RepoFinderOutcome + + +@pytest.fixture(name="httpserver_java") +def httpserver_java_(tmp_path: Path, httpserver: HTTPServer) -> HTTPServer: + """Set up the mock HTTP Server for the Repo Finder.""" + url = httpserver.url_for("") + test_config = f""" + [repofinder.java] + artifact_repositories = {url} + """ + test_config_path = os.path.join(tmp_path, "config.ini") + with open(test_config_path, "w", encoding="utf-8") as test_config_file: + test_config_file.write(test_config) + load_defaults(test_config_path) + + return httpserver @pytest.mark.parametrize( - ("user_config_input", "expected"), + ("test_config", "expected"), [ ( """ @@ -21,7 +40,7 @@ scm.connection scm.url """, - ["scm:git:git@github.com:oracle-samples/macaron.git", "https://github.com/oracle/macaron"], + "https://github.com/oracle-samples/macaron", ), ( """ @@ -30,12 +49,19 @@ scm.url scm.connection """, - ["https://github.com/oracle/macaron", "scm:git:git@github.com:oracle-samples/macaron.git"], + "https://github.com/oracle/macaron", ), ], ) -def test_pom_extraction_ordering(tmp_path: Path, user_config_input: str, expected: list[str]) -> None: +def test_pom_extraction_ordering(tmp_path: Path, test_config: str, expected: str, httpserver: HTTPServer) -> None: """Test the ordering of elements extracted from the POM is correct and maintained.""" + url = httpserver.url_for("") + test_config = test_config + f"\nartifact_repositories = {url}" + test_config_path = os.path.join(tmp_path, "config.ini") + with open(test_config_path, "w", encoding="utf-8") as test_config_file: + test_config_file.write(test_config) + load_defaults(test_config_path) + pom_text = """ https://example.org @@ -48,12 +74,165 @@ def test_pom_extraction_ordering(tmp_path: Path, user_config_input: str, expecte """ - user_config_path = os.path.join(tmp_path, "config.ini") - with open(user_config_path, "w", encoding="utf-8") as user_config_file: - user_config_file.write(user_config_input) - load_defaults(user_config_path) - repo_finder = JavaRepoFinder() + group = "com.oracle.tools" + artifact = "oracle-tools-macaron" + version = "0.4" + target_url = "/" + "/".join(["/".join(group.split(".")), artifact, version, f"{artifact}-{version}.pom"]) + httpserver.expect_request(target_url).respond_with_data(pom_text) + + found_repo, outcome = repo_finder.find_repo(PackageURL.from_string(f"pkg:maven/{group}/{artifact}@{version}")) + assert found_repo + assert found_repo == expected + assert outcome == RepoFinderOutcome.FOUND + + +@pytest.mark.parametrize( + ("test_config", "expected"), + [ + ( + """ + [repofinder.java] + artifact_repositories = +   + """, + RepoFinderOutcome.NO_MAVEN_HOST_PROVIDED, + ), + ( + """ + [repofinder.java] + repo_pom_paths = +   + """, + RepoFinderOutcome.NO_POM_TAGS_PROVIDED, + ), + ], +) +def test_repo_finder_java_invalid_config(tmp_path: Path, test_config: str, expected: RepoFinderOutcome) -> None: + """Test the Repo Finder when inputs are invalid: a non-breaking space.""" + test_config_path = os.path.join(tmp_path, "config.ini") + with open(test_config_path, "w", encoding="utf-8") as test_config_file: + test_config_file.write(test_config) + load_defaults(test_config_path) + + found_repo, outcome = repo_finder.find_repo(PackageURL.from_string("pkg:maven/test/test@1"), False) + assert not found_repo + assert outcome == expected + + +@pytest.mark.parametrize( + ("purl_string", "expected"), + [ + ("pkg:maven/test/test", RepoFinderOutcome.NO_VERSION_PROVIDED), + ("pkg:test/test@test", RepoFinderOutcome.UNSUPPORTED_PACKAGE_TYPE), + ], +) +def test_repo_finder_java_invalid_input(purl_string: str, expected: RepoFinderOutcome) -> None: + """Test the Repo Finder when invalid input is provided.""" + found_repo, outcome = repo_finder.find_repo(PackageURL.from_string(purl_string), False) + assert not found_repo + assert outcome == expected + + +@pytest.mark.parametrize( + ("test_pom", "expected"), + [ + ( + """ + ##### + + + + + """, + RepoFinderOutcome.SCM_NO_URLS, + ), + ( + """ + + + TEST + + + """, + RepoFinderOutcome.SCM_NO_VALID_URLS, + ), + ], +) +def test_repo_finder_java_invalid_pom_or_scm( + httpserver_java: HTTPServer, test_pom: str, expected: RepoFinderOutcome +) -> None: + """Test the Repo Finder when the POM or SCM metadata is invalid.""" + group = "oracle" + artifact = "macaron" + version = "0.3" + target_url = "/" + "/".join([group, artifact, version, f"{artifact}-{version}.pom"]) + httpserver_java.expect_request(target_url).respond_with_data(test_pom) + + found_repo, outcome = repo_finder.find_repo( + PackageURL.from_string(f"pkg:maven/{group}/{artifact}@{version}"), False + ) + assert not found_repo + assert outcome == expected + + +def test_repo_finder_java_success(httpserver_java: HTTPServer) -> None: + """Test the Repo Finder on a repository with a valid POM.""" + pom = """ + + + https://github.com/oracle/macaron + + + """ + + group = "oracle" + artifact = "macaron" + version = "0.3" + target_url = "/" + "/".join([group, artifact, version, f"{artifact}-{version}.pom"]) + httpserver_java.expect_request(target_url).respond_with_data(pom) + + found_repo, outcome = repo_finder.find_repo(PackageURL.from_string(f"pkg:maven/{group}/{artifact}@{version}")) + assert found_repo + assert outcome == RepoFinderOutcome.FOUND + + +def test_repo_finder_java_success_via_parent(httpserver_java: HTTPServer) -> None: + """Test the Repo Finder on a repository with a valid parent POM.""" + pom = """ + + + oracle + macaron + 0.4 + + + """ + + parent_pom = """ + + + https://github.com/oracle/macaron + + + """ + + group = "oracle" + artifact = "macaron" + version = "0.3" + target_url = "/" + "/".join([group, artifact, version, f"{artifact}-{version}.pom"]) + httpserver_java.expect_request(target_url).respond_with_data(pom) + + parent_version = "0.4" + parent_url = "/" + "/".join([group, artifact, parent_version, f"{artifact}-{parent_version}.pom"]) + httpserver_java.expect_request(parent_url).respond_with_data(parent_pom) - # Retrieve SCM from POM. - assert expected == repo_finder._read_pom(pom_text) # pylint: disable=W0212 + found_repo, outcome = repo_finder.find_repo(PackageURL.from_string(f"pkg:maven/{group}/{artifact}@{version}")) + assert found_repo + assert outcome == RepoFinderOutcome.FOUND_FROM_PARENT diff --git a/tests/slsa_analyzer/checks/test_registry_e2e.py b/tests/slsa_analyzer/checks/test_registry_e2e.py index 81be21580..63f54e07c 100644 --- a/tests/slsa_analyzer/checks/test_registry_e2e.py +++ b/tests/slsa_analyzer/checks/test_registry_e2e.py @@ -3,7 +3,7 @@ """This module contains an end-to-end test for the check registry.""" -from macaron.database.table_definitions import Analysis, Component, Repository +from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata, Repository from macaron.slsa_analyzer.analyze_context import AnalyzeContext from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType @@ -118,6 +118,7 @@ def test_registry_e2e(self) -> None: purl="pkg:github.com/package-url/purl-spec@244fd47e07d1004f0aed9c", analysis=Analysis(), repository=Repository(complete_name="github.com/package-url/purl-spec", fs_path=""), + repo_finder_metadata=RepoFinderMetadata(), ) target = AnalyzeContext(component=component) results = registry.scan(target) diff --git a/tests/slsa_analyzer/mock_git_utils.py b/tests/slsa_analyzer/mock_git_utils.py index 680515983..d5a0f918f 100644 --- a/tests/slsa_analyzer/mock_git_utils.py +++ b/tests/slsa_analyzer/mock_git_utils.py @@ -11,7 +11,7 @@ from git.exc import GitError from pydriller.git import Git -from macaron.database.table_definitions import Analysis, Component, Repository +from macaron.database.table_definitions import Analysis, Component, RepoFinderMetadata, Repository from macaron.slsa_analyzer.analyze_context import AnalyzeContext @@ -111,6 +111,7 @@ def prepare_repo_for_testing( files=git_repo.files(), fs_path=str(repo_path), ), + repo_finder_metadata=RepoFinderMetadata(), ) analyze_ctx = AnalyzeContext(component=component, macaron_path=str(macaron_path), output_dir=str(output_dir)) diff --git a/tests/slsa_analyzer/test_analyzer.py b/tests/slsa_analyzer/test_analyzer.py index d2b754cba..19c971af9 100644 --- a/tests/slsa_analyzer/test_analyzer.py +++ b/tests/slsa_analyzer/test_analyzer.py @@ -11,6 +11,7 @@ from macaron.config.target_config import Configuration from macaron.errors import InvalidAnalysisTargetError, InvalidPURLError +from macaron.repo_finder.repo_finder_enums import RepoFinderOutcome from macaron.slsa_analyzer.analyzer import Analyzer @@ -25,13 +26,18 @@ repo_path="https://github.com/apache/maven", branch="", digest="", + repo_finder_outcome=RepoFinderOutcome.NOT_USED, ), ), ( Configuration({"purl": "", "path": "https://github.com/apache/maven"}), ["github.com", "gitlab.com", "bitbucket.org"], Analyzer.AnalysisTarget( - parsed_purl=None, repo_path="https://github.com/apache/maven", branch="", digest="" + parsed_purl=None, + repo_path="https://github.com/apache/maven", + branch="", + digest="", + repo_finder_outcome=RepoFinderOutcome.NOT_USED, ), ), ( @@ -42,6 +48,7 @@ repo_path="https://github.com/apache/maven", branch="", digest="", + repo_finder_outcome=RepoFinderOutcome.NOT_USED, ), ), ( @@ -59,6 +66,7 @@ repo_path="https://github.com/apache/maven", branch="master", digest="abcxyz", + repo_finder_outcome=RepoFinderOutcome.NOT_USED, ), ), ],