Skip to content

Commit

Permalink
feat: find repo from latest artifact when provided artifact has none (#931)
Browse files Browse the repository at this point in the history

This PR adds new functionality to the Repo Finder: if the repository of the provided artifact cannot be found for the specified version, the latest version of the artifact is checked instead. The latest version is also checked when a repository was found but the sought version (tag) does not exist within it.

Signed-off-by: Ben Selwyn-Smith <[email protected]>
  • Loading branch information
benmss authored Dec 19, 2024
1 parent 12d8593 commit 1ea1bd5
Show file tree
Hide file tree
Showing 8 changed files with 414 additions and 196 deletions.
316 changes: 279 additions & 37 deletions src/macaron/repo_finder/repo_finder.py

Large diffs are not rendered by default.

66 changes: 43 additions & 23 deletions src/macaron/repo_finder/repo_finder_deps_dev.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ class DepsDevType(StrEnum):
class DepsDevRepoFinder(BaseRepoFinder):
"""This class is used to find repositories using Google's Open Source Insights A.K.A. deps.dev."""

# See https://docs.deps.dev/api/v3alpha/
BASE_URL = "https://api.deps.dev/v3alpha/purl/"

def find_repo(self, purl: PackageURL) -> str:
"""
Attempt to retrieve a repository URL that matches the passed artifact.
Expand Down Expand Up @@ -108,53 +111,70 @@ def get_project_info(project_url: str) -> dict[str, Any] | None:

return response_json

@staticmethod
def get_latest_version(purl: PackageURL) -> PackageURL | None:
    """Return a PURL representing the latest version of the passed artifact.

    The passed PURL is first rebuilt from its type, namespace and name only, so any
    version it carries is ignored. The version reported is the last entry of the
    version list found in the deps.dev response.

    Parameters
    ----------
    purl : PackageURL
        The current PURL.

    Returns
    -------
    PackageURL | None
        The latest version of the PURL, or None if it could not be found.
    """
    # Query the package rather than one specific version of it.
    namespace = purl.namespace + "/" if purl.namespace else ""
    package_purl = PackageURL.from_string(f"pkg:{purl.type}/{namespace}{purl.name}")

    url = f"{DepsDevRepoFinder.BASE_URL}{encode(str(package_purl), safe='')}"
    response = send_get_http_raw(url)
    if not response:
        return None

    try:
        metadata: dict = json.loads(response.text)
    except ValueError as error:
        logger.debug("Failed to parse response from deps.dev: %s", error)
        return None

    # The version list lives under a different key depending on the shape of the response.
    versions_keys = ["package", "versions"] if "package" in metadata else ["version"]
    versions = json_extract(metadata, versions_keys, list)
    if not versions:
        return None

    latest_version = json_extract(versions[-1], ["versionKey", "version"], str)
    if not latest_version:
        return None

    logger.debug("Found latest version: %s", latest_version)
    namespace = package_purl.namespace + "/" if package_purl.namespace else ""
    return PackageURL.from_string(f"pkg:{package_purl.type}/{namespace}{package_purl.name}@{latest_version}")


def _create_urls(self, purl: PackageURL) -> list[str]:
    """Create the urls to search for the metadata relating to the passed artifact.

    If a version is not specified, remote API calls will be used to try and find one.

    Parameters
    ----------
    purl : PackageURL
        The PURL of an artifact.

    Returns
    -------
    list[str]
        The list of created URLs. The list is empty when the PURL has no version
        and the latest version could not be determined.
    """
    if not purl.version:
        latest_purl = DepsDevRepoFinder.get_latest_version(purl)
        if not latest_purl:
            return []
        purl = latest_purl

    # The whole PURL is percent-encoded (no safe characters) before being appended to the base URL.
    return [f"{DepsDevRepoFinder.BASE_URL}{encode(str(purl), safe='')}"]

def _retrieve_json(self, url: str) -> str:
"""
Expand Down
10 changes: 8 additions & 2 deletions src/macaron/repo_finder/repo_finder_java.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from macaron.config.defaults import defaults
from macaron.parsers.pomparser import parse_pom_string
from macaron.repo_finder.repo_finder_base import BaseRepoFinder
from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder
from macaron.repo_finder.repo_validator import find_valid_repository_url
from macaron.util import send_get_http_raw

Expand Down Expand Up @@ -51,8 +52,13 @@ def find_repo(self, purl: PackageURL) -> str:

if not version:
logger.info("Version missing for maven artifact: %s:%s", group, artifact)
# TODO add support for Java artifacts without a version
return ""
latest_purl = DepsDevRepoFinder().get_latest_version(purl)
if not latest_purl or not latest_purl.version:
logger.debug("Could not find version for artifact: %s:%s", purl.namespace, purl.name)
return ""
group = latest_purl.namespace or ""
artifact = latest_purl.name
version = latest_purl.version

while group and artifact and version and limit > 0:
# Create the URLs for retrieving the artifact's POM
Expand Down
157 changes: 24 additions & 133 deletions src/macaron/repo_finder/repo_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,12 @@
import string
from urllib.parse import urlparse

from git import InvalidGitRepositoryError
from packageurl import PackageURL
from pydriller import Git

from macaron.config.global_config import global_config
from macaron.errors import CloneError, RepoCheckOutError
from macaron.repo_finder.commit_finder import find_commit
from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService
from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService
from macaron.slsa_analyzer.git_url import (
GIT_REPOS_DIR,
check_out_repo_target,
get_remote_origin_of_local_repo,
get_remote_vcs_url,
get_repo_dir_name,
is_empty_repo,
is_remote_repo,
resolve_local_path,
)
from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR

logger: logging.Logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -125,125 +112,6 @@ def create_report(purl: str, commit: str, repo: str) -> str:
return json.dumps(data, indent=4)


def prepare_repo(
    target_dir: str,
    repo_path: str,
    branch_name: str = "",
    digest: str = "",
    purl: PackageURL | None = None,
) -> Git | None:
    """Prepare the target repository for analysis.

    If ``repo_path`` is a remote path, the target repo is cloned to ``{target_dir}/{unique_path}``.
    The ``unique_path`` of a repository will depend on its remote url.
    For example, if given the ``repo_path`` https://github.com/org/name.git, it will
    be cloned to ``{target_dir}/github_com/org/name``.

    If ``repo_path`` is a local path, this method will check if ``repo_path`` resolves to a directory inside
    ``local_repos_path`` and to a valid git repository.

    When no ``digest`` is given but ``purl`` carries a version, ``find_commit`` is used to map the PURL
    to a commit, which is then checked out.

    Parameters
    ----------
    target_dir : str
        The directory where all remote repository will be cloned.
    repo_path : str
        The path to the repository, can be either local or remote.
    branch_name : str
        The name of the branch we want to checkout.
    digest : str
        The hash of the commit that we want to checkout in the branch.
    purl : PackageURL | None
        The PURL of the analysis target.

    Returns
    -------
    Git | None
        The pydriller.Git object of the repository or None if error.
    """
    # TODO: separate the logic for handling remote and local repos instead of putting them into this method.
    logger.info(
        "Preparing the repository for the analysis (path=%s, branch=%s, digest=%s)",
        repo_path,
        branch_name,
        digest,
    )

    resolved_local_path = ""
    is_remote = is_remote_repo(repo_path)

    if is_remote:
        logger.info("The path to repo %s is a remote path.", repo_path)
        resolved_remote_path = get_remote_vcs_url(repo_path)
        if not resolved_remote_path:
            logger.error("The provided path to repo %s is not a valid remote path.", repo_path)
            return None

        # The clone destination is derived from the remote URL so that each remote gets its own directory.
        git_service = get_git_service(resolved_remote_path)
        repo_unique_path = get_repo_dir_name(resolved_remote_path)
        resolved_local_path = os.path.join(target_dir, repo_unique_path)
        logger.info("Cloning the repository.")
        try:
            git_service.clone_repo(resolved_local_path, resolved_remote_path)
        except CloneError as error:
            logger.error("Cannot clone %s: %s", resolved_remote_path, str(error))
            return None
    else:
        logger.info("Checking if the path to repo %s is a local path.", repo_path)
        resolved_local_path = resolve_local_path(get_local_repos_path(), repo_path)

    # An empty path here means the local path could not be resolved.
    if resolved_local_path:
        try:
            git_obj = Git(resolved_local_path)
        except InvalidGitRepositoryError:
            logger.error("No git repo exists at %s.", resolved_local_path)
            return None
    else:
        logger.error("Error happened while preparing the repo.")
        return None

    if is_empty_repo(git_obj):
        logger.error("The target repository does not have any commit.")
        return None

    # Find the digest and branch if a version has been specified
    if not digest and purl and purl.version:
        found_digest = find_commit(git_obj, purl)
        if not found_digest:
            logger.error("Could not map the input purl string to a specific commit in the corresponding repository.")
            return None
        digest = found_digest

    # Checking out the specific branch or commit. This operation varies depends on the git service that the
    # repository uses.
    if not is_remote:
        # If the repo path provided by the user is a local path, we need to get the actual origin remote URL of
        # the repo to decide on the suitable git service.
        origin_remote_url = get_remote_origin_of_local_repo(git_obj)
        if is_remote_repo(origin_remote_url):
            # The local repo's origin remote url is a remote URL (e.g https://host.com/a/b): In this case, we obtain
            # the corresponding git service using ``get_git_service``.
            git_service = get_git_service(origin_remote_url)
        else:
            # The local repo's origin remote url is a local path (e.g /path/to/local/...). This happens when the
            # target repository is a clone from another local repo or is a clone from a git archive -
            # https://git-scm.com/docs/git-archive: In this case, we fall-back to the generic function
            # ``git_url.check_out_repo_target``.
            if not check_out_repo_target(git_obj, branch_name, digest, not is_remote):
                logger.error("Cannot checkout the specific branch or commit of the target repo.")
                return None

            # No git service is involved for this case, so the checkout below is skipped.
            return git_obj

    try:
        git_service.check_out_repo(git_obj, branch_name, digest, not is_remote)
    except RepoCheckOutError as error:
        logger.error("Failed to check out repository at %s", resolved_local_path)
        logger.error(error)
        return None

    return git_obj


def get_local_repos_path() -> str:
"""Get the local repos path from global config or use default.
Expand Down Expand Up @@ -278,3 +146,26 @@ def get_git_service(remote_path: str | None) -> BaseGitService:
return git_service

return NoneGitService()


def check_repo_urls_are_equivalent(repo_1: str, repo_2: str) -> bool:
    """Check if the two passed repo URLs are equivalent.

    The scheme, port, query and fragment of each URL are ignored. Hostnames are compared
    as returned by ``urlparse`` and paths are compared after removing any trailing slash
    and a trailing ``.git`` suffix, so ``https://host/org/name``, ``https://host/org/name/``
    and ``https://host/org/name.git`` are all treated as the same repository.

    Parameters
    ----------
    repo_1: str
        The first repository URL as a string.
    repo_2: str
        The second repository URL as a string.

    Returns
    -------
    bool
        True if the repository URLs have equal hostnames and equal normalised paths, otherwise False.
    """

    def _host_and_path(repo: str) -> tuple:
        """Return the (hostname, normalised path) pair used for the comparison."""
        parsed = urlparse(repo)
        # Strip the slash first so that "name.git/" also loses its ".git" suffix.
        return parsed.hostname, parsed.path.rstrip("/").removesuffix(".git")

    return _host_and_path(repo_1) == _host_and_path(repo_2)
3 changes: 2 additions & 1 deletion src/macaron/slsa_analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@
extract_repo_and_commit_from_provenance,
)
from macaron.repo_finder.provenance_finder import ProvenanceFinder, find_provenance_from_ci
from macaron.repo_finder.repo_utils import get_git_service, prepare_repo
from macaron.repo_finder.repo_finder import prepare_repo
from macaron.repo_finder.repo_utils import get_git_service
from macaron.repo_verifier.repo_verifier import verify_repo
from macaron.slsa_analyzer import git_url
from macaron.slsa_analyzer.analyze_context import AnalyzeContext
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

# Succeed only if the find-source report records the expected commit and repository.
report="output/reports/maven/io_avaje/avaje-prisms/avaje-prisms.source.json"
expected_commit="1f6f953df0b58f0c35b5e136f62f63ba7a22bc03"
expected_repo="https://github.com/avaje/avaje-prisms"

# Stop at the first mismatch; the repo field is only read when the commit matches.
actual_commit="$(jq -r '.commit' "$report")"
[[ "$actual_commit" = "$expected_commit" ]] || exit 1

actual_repo="$(jq -r '.repo' "$report")"
[[ "$actual_repo" = "$expected_repo" ]]
36 changes: 36 additions & 0 deletions tests/integration/cases/latest_repo_comparison/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

description: |
Check that the find-source and analyze commands behave the same for a given artifact.
tags:
- macaron-python-package
- macaron-docker-image

steps:
- name: Run macaron find source
kind: find-source
options:
command_args:
- -purl
- pkg:maven/io.avaje/[email protected]
- name: Check that the repository was not cloned
kind: shell
options:
cmd: ls output/git_repos/github_com/avaje/avaje-prisms/
expect_fail: true
- name: Check the report contents
kind: shell
options:
cmd: ./check_output.sh
- name: Run macaron analyze
kind: analyze
options:
command_args:
- -purl
- pkg:maven/io.avaje/[email protected]
- name: Check that correct repository was cloned
kind: shell
options:
cmd: ls output/git_repos/github_com/avaje/avaje-prisms/
16 changes: 16 additions & 0 deletions tests/integration/cases/repo_finder_remote_calls/repo_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from macaron.config.defaults import defaults
from macaron.repo_finder import repo_validator
from macaron.repo_finder.repo_finder import find_repo
from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder
from macaron.slsa_analyzer.git_url import clean_url

logger: logging.Logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -70,6 +71,21 @@ def test_repo_finder() -> int:
if not parsed_url or not repo_validator.resolve_redirects(parsed_url):
return os.EX_UNAVAILABLE

# Test Java package whose SCM metadata only points to the repo in later versions than is provided here.
purl = PackageURL.from_string("pkg:maven/io.vertx/[email protected]")
repo = find_repo(purl)
if repo == "https://github.com/eclipse-vertx/vertx-auth":
return os.EX_UNAVAILABLE
latest_purl = DepsDevRepoFinder().get_latest_version(purl)
assert latest_purl
repo = find_repo(latest_purl)
if repo != "https://github.com/eclipse-vertx/vertx-auth":
return os.EX_UNAVAILABLE

# Test Java package that has no version.
if not find_repo(PackageURL.from_string("pkg:maven/io.vertx/vertx-auth-common")):
return os.EX_UNAVAILABLE

return os.EX_OK


Expand Down

0 comments on commit 1ea1bd5

Please sign in to comment.