Skip to content

Commit

Permalink
feat: add repo finder and commit finder outcomes to database
Browse files Browse the repository at this point in the history
Signed-off-by: Ben Selwyn-Smith <[email protected]>
  • Loading branch information
benmss committed Nov 18, 2024
1 parent ab0dd3e commit bad6718
Show file tree
Hide file tree
Showing 23 changed files with 728 additions and 252 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ macaron.repo\_finder.repo\_finder\_deps\_dev module
:undoc-members:
:show-inheritance:

macaron.repo\_finder.repo\_finder\_enums module
-----------------------------------------------

.. automodule:: macaron.repo_finder.repo_finder_enums
:members:
:undoc-members:
:show-inheritance:

macaron.repo\_finder.repo\_finder\_java module
----------------------------------------------

Expand Down
49 changes: 47 additions & 2 deletions src/macaron/database/table_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from macaron.database.database_manager import ORMBase
from macaron.database.db_custom_types import RFC3339DateTime
from macaron.errors import InvalidPURLError
from macaron.repo_finder.repo_finder_enums import CommitFinderOutcome, RepoFinderOutcome
from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, ProvenanceSubjectPURLMatcher
from macaron.slsa_analyzer.slsa_req import ReqName

Expand Down Expand Up @@ -177,7 +178,14 @@ class Component(PackageURLMixin, ORMBase):
lazy="immediate",
)

def __init__(self, purl: str, analysis: Analysis, repository: "Repository | None"):
#: The one-to-one relationship with Repo Finder metadata.
repo_finder_metadata: Mapped["RepoFinderMetadata"] = relationship(
uselist=False, back_populates="component", lazy="immediate"
)

def __init__(
self, purl: str, analysis: Analysis, repository: "Repository | None", repo_finder_metadata: "RepoFinderMetadata"
):
"""
Instantiate the software component using PURL identifier.
Expand All @@ -204,7 +212,13 @@ def __init__(self, purl: str, analysis: Analysis, repository: "Repository | None
# TODO: Explore the ``dbm`` or ``shelve`` packages to support dict type, which are part of the Python standard library.
purl_kwargs = purl_parts.to_dict(encode=True)

super().__init__(purl=purl, analysis=analysis, repository=repository, **purl_kwargs)
super().__init__(
purl=purl,
analysis=analysis,
repository=repository,
repo_finder_metadata=repo_finder_metadata,
**purl_kwargs,
)

@property
def report_file_name(self) -> str:
Expand Down Expand Up @@ -605,3 +619,34 @@ def from_purl_and_provenance(
return cls(sha256=sha256)

return None


class RepoFinderMetadata(ORMBase):
    """Metadata from the Repo Finder and Commit Finder runs for an associated Component.

    Each row stores the outcome enums reported by the Repo Finder and the Commit Finder,
    together with the repository URL and commit that were found, and is linked to a single
    ``Component`` through ``component_id``.
    """

    # Rows are stored in the ``_repo_finder_metadata`` table.
    __tablename__ = "_repo_finder_metadata"

    #: The primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)  # noqa: A003

    #: The foreign key to the software component.
    #: Declared non-nullable, so a metadata row must always reference a component.
    component_id: Mapped[int] = mapped_column(Integer, ForeignKey(Component.id), nullable=False)

    #: A one-to-one relationship with software components.
    #: This is the reverse side of ``Component.repo_finder_metadata`` (paired via ``back_populates``).
    component: Mapped["Component"] = relationship(back_populates="repo_finder_metadata")

    #: The outcome of the Repo Finder.
    #: Stored as an enum column over ``RepoFinderOutcome``; a value is always required.
    repo_finder_outcome: Mapped[Enum] = mapped_column(
        Enum(RepoFinderOutcome), nullable=False  # pylint: disable=protected-access,no-member
    )

    #: The outcome of the Commit Finder.
    #: Stored as an enum column over ``CommitFinderOutcome``; a value is always required.
    commit_finder_outcome: Mapped[Enum] = mapped_column(
        Enum(CommitFinderOutcome), nullable=False  # pylint: disable=protected-access,no-member
    )

    #: The URL found by the Repo Finder (if applicable).
    # NOTE(review): no explicit ``nullable`` is given and the annotation is not Optional, so this
    # column is presumably NOT NULL; callers would then need to store an empty string when no URL
    # was found -- confirm against the code that creates these rows.
    found_url: Mapped[str] = mapped_column(String)

    #: The commit of the tag matched by the Commit Finder.
    # NOTE(review): same nullability assumption as ``found_url`` -- confirm what is stored when
    # the Commit Finder does not match a commit.
    found_commit: Mapped[str] = mapped_column(String)
7 changes: 4 additions & 3 deletions src/macaron/dependency_analyzer/cyclonedx.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from macaron.errors import CycloneDXParserError, DependencyAnalyzerError
from macaron.output_reporter.scm import SCMStatus
from macaron.repo_finder.repo_finder import find_repo
from macaron.repo_finder.repo_finder_enums import RepoFinderOutcome
from macaron.repo_finder.repo_validator import find_valid_repository_url

logger: logging.Logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -465,12 +466,12 @@ def _resolve_more_dependencies(dependencies: dict[str, DependencyInfo]) -> None:
for item in dependencies.values():
if item["available"] != SCMStatus.MISSING_SCM:
continue

item["url"] = find_repo(item["purl"])
if item["url"] == "":
url, outcome = find_repo(item["purl"])
if outcome not in {RepoFinderOutcome.FOUND, RepoFinderOutcome.FOUND_FROM_PARENT}:
logger.debug("Failed to find url for purl: %s", item["purl"])
else:
# TODO decide how to handle possible duplicates here
item["url"] = url
item["available"] = SCMStatus.AVAILABLE
item["note"] = ""

Expand Down
82 changes: 46 additions & 36 deletions src/macaron/repo_finder/commit_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from pydriller import Commit, Git

from macaron.repo_finder import repo_finder_deps_dev, to_domain_from_known_purl_types
from macaron.repo_finder.repo_finder_enums import CommitFinderOutcome
from macaron.slsa_analyzer.git_service import GIT_SERVICES

logger: logging.Logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -121,7 +122,7 @@ class AbstractPurlType(Enum):
UNSUPPORTED = (2,)


def find_commit(git_obj: Git, purl: PackageURL) -> str | None:
def find_commit(git_obj: Git, purl: PackageURL) -> tuple[str | None, CommitFinderOutcome]:
"""Try to find the commit matching the passed PURL.
The PURL may be a repository type, e.g. GitHub, in which case the commit might be in its version part.
Expand All @@ -137,21 +138,21 @@ def find_commit(git_obj: Git, purl: PackageURL) -> str | None:
Returns
-------
str | None
The digest, or None if the commit cannot be correctly retrieved.
tuple[str | None, CommitFinderOutcome]
The digest, or None if the commit cannot be correctly retrieved, and the outcome to report.
"""
version = purl.version
if not version:
logger.debug("Missing version for analysis target: %s", purl.name)
return None
return None, CommitFinderOutcome.NO_VERSION_PROVIDED

repo_type = determine_abstract_purl_type(purl)
if repo_type == AbstractPurlType.REPOSITORY:
return extract_commit_from_version(git_obj, version)
if repo_type == AbstractPurlType.ARTIFACT:
return find_commit_from_version_and_name(git_obj, purl.name, version)
logger.debug("Type of PURL is not supported for commit finding: %s", purl.type)
return None
return None, CommitFinderOutcome.UNSUPPORTED_PURL_TYPE


def determine_abstract_purl_type(purl: PackageURL) -> AbstractPurlType:
Expand Down Expand Up @@ -181,7 +182,7 @@ def determine_abstract_purl_type(purl: PackageURL) -> AbstractPurlType:
return AbstractPurlType.UNSUPPORTED


def extract_commit_from_version(git_obj: Git, version: str) -> str | None:
def extract_commit_from_version(git_obj: Git, version: str) -> tuple[str | None, CommitFinderOutcome]:
"""Try to extract the commit from the PURL's version parameter.
E.g.
Expand All @@ -197,8 +198,8 @@ def extract_commit_from_version(git_obj: Git, version: str) -> str | None:
Returns
-------
str | None
The digest, or None if the commit cannot be correctly retrieved.
tuple[str | None, CommitFinderOutcome]
The digest, or None if the commit cannot be correctly retrieved, and the outcome to report.
"""
# A commit hash is 40 characters in length, but commits are often referenced using only some of those.
commit: Commit | None = None
Expand All @@ -218,12 +219,12 @@ def extract_commit_from_version(git_obj: Git, version: str) -> str | None:
logger.debug("Failed to retrieve commit: %s", error)

if not commit:
return None
return None, CommitFinderOutcome.REPO_PURL_FAILURE

return commit.hash if commit else None
return commit.hash if commit else None, CommitFinderOutcome.MATCHED


def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> str | None:
def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) -> tuple[str | None, CommitFinderOutcome]:
"""Try to find the matching commit in a repository of a given version (and name) via tags.
The passed version is used to match with the tags in the target repository. The passed name is used in cases where
Expand All @@ -240,14 +241,19 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) ->
Returns
-------
str | None
The digest, or None if the commit cannot be correctly retrieved.
tuple[str | None, CommitFinderOutcome]
The digest, or None if the commit cannot be correctly retrieved, and the outcome to report.
"""
logger.debug("Searching for commit of artifact version using tags: %s@%s", name, version)

# Only consider tags that have a commit.
repo_tags = git_obj.repo.tags
if not repo_tags:
logger.debug("No tags found for %s", name)
return None, CommitFinderOutcome.NO_TAGS

valid_tags = {}
for tag in git_obj.repo.tags:
for tag in repo_tags:
commit = _get_tag_commit(tag)
if not commit:
logger.debug("No commit found for tag: %s", tag)
Expand All @@ -258,14 +264,14 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) ->

if not valid_tags:
logger.debug("No tags with commits found for %s", name)
return None
return None, CommitFinderOutcome.NO_TAGS_WITH_COMMITS

# Match tags.
matched_tags = match_tags(list(valid_tags.keys()), name, version)
matched_tags, outcome = match_tags(list(valid_tags.keys()), name, version)

if not matched_tags:
logger.debug("No tags matched for %s", name)
return None
return None, outcome

if len(matched_tags) > 1:
logger.debug("Tags found for %s: %s", name, len(matched_tags))
Expand All @@ -282,7 +288,7 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) ->
hexsha = tag.commit.hexsha
except ValueError:
logger.debug("Error trying to retrieve digest of commit: %s", tag.commit)
return None
return None, CommitFinderOutcome.NO_TAG_COMMIT

logger.debug(
"Found tag %s with commit %s for artifact version %s@%s",
Expand All @@ -291,7 +297,7 @@ def find_commit_from_version_and_name(git_obj: Git, name: str, version: str) ->
name,
version,
)
return hexsha if hexsha else None
return hexsha if hexsha else None, CommitFinderOutcome.MATCHED


def _split_name(name: str) -> list[str]:
Expand Down Expand Up @@ -349,7 +355,7 @@ def _split_separators(version: str) -> list[str]:
return [item for item in split if item]


def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, list[str]]:
def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, list[str], CommitFinderOutcome]:
"""Build a version pattern to match the passed version string.
Parameters
Expand All @@ -362,12 +368,12 @@ def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, lis
Returns
-------
tuple[Pattern | None, list[str], CommitFinderOutcome]
The tuple of the regex pattern that will match the version, and the list of version parts that were extracted.
If an exception occurs from any regex operation, the pattern will be returned as None.
The tuple of the regex pattern that will match the version, the list of version parts that were extracted, and
the outcome to report. If an exception occurs from any regex operation, the pattern will be returned as None.
"""
if not version:
return None, []
return None, [], CommitFinderOutcome.NO_VERSION_PROVIDED

# Escape input to prevent it being treated as regex.
name = re.escape(name)
Expand All @@ -376,7 +382,7 @@ def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, lis

if not parts:
logger.debug("Version contained no valid parts: %s", version)
return None, []
return None, [], CommitFinderOutcome.INVALID_PURL

logger.debug("Final version parts: %s", parts)

Expand Down Expand Up @@ -470,14 +476,14 @@ def _build_version_pattern(name: str, version: str) -> tuple[Pattern | None, lis

# Compile the pattern.
try:
return re.compile(this_version_pattern, flags=re.IGNORECASE), parts
return re.compile(this_version_pattern, flags=re.IGNORECASE), parts, CommitFinderOutcome.MATCHED
except Exception as error: # pylint: disable=broad-exception-caught
# The regex library uses an internal error that cannot be used here to satisfy pylint.
logger.debug("Error while compiling version regex: %s", error)
return None, []
return None, [], CommitFinderOutcome.REGEX_COMPILE_FAILURE


def match_tags(tag_list: list[str], name: str, version: str) -> list[str]:
def match_tags(tag_list: list[str], name: str, version: str) -> tuple[list[str], CommitFinderOutcome]:
"""Return items of the passed tag list that match the passed artifact name and version.
Parameters
Expand All @@ -491,8 +497,8 @@ def match_tags(tag_list: list[str], name: str, version: str) -> list[str]:
Returns
-------
list[str]
The list of tags that matched the pattern.
tuple[list[str], CommitFinderOutcome]
The list of tags that matched the pattern, if any, and the outcome to report.
"""
logger.debug("Tag Sample: %s", tag_list[:5])

Expand All @@ -518,14 +524,14 @@ def match_tags(tag_list: list[str], name: str, version: str) -> list[str]:
if match.group(1):
prefix_match = tag
if prefix_match:
return [prefix_match]
return [prefix_match], CommitFinderOutcome.MATCHED
if last_match:
return [last_match]
return [last_match], CommitFinderOutcome.MATCHED

# Create the more complicated pattern for the passed version.
pattern, parts = _build_version_pattern(name, version)
pattern, parts, outcome = _build_version_pattern(name, version)
if not pattern:
return []
return [], outcome

# Match the tags.
matched_tags = []
Expand All @@ -546,8 +552,12 @@ def match_tags(tag_list: list[str], name: str, version: str) -> list[str]:

matched_tags = _fix_misaligned_tag_matches(matched_tags, version)

if len(matched_tags) <= 1:
return [_["tag"] for _ in matched_tags]
if not matched_tags:
logger.debug("Failed to match any tags.")
return [], CommitFinderOutcome.NO_TAGS_MATCHED

if len(matched_tags) == 1:
return [_["tag"] for _ in matched_tags], CommitFinderOutcome.MATCHED

# In the case of multiple matches, further work must be done.

Expand Down Expand Up @@ -588,7 +598,7 @@ def match_tags(tag_list: list[str], name: str, version: str) -> list[str]:
)
)

return [_["tag"] for _ in matched_tags]
return [_["tag"] for _ in matched_tags], CommitFinderOutcome.MATCHED


def _fix_misaligned_tag_matches(matched_tags: list[dict[str, str]], version: str) -> list[dict[str, str]]:
Expand Down
2 changes: 1 addition & 1 deletion src/macaron/repo_finder/provenance_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def check_if_input_purl_provenance_conflict(

# Check the PURL commit against the provenance.
if not digest_input and provenance_commit_digest and purl.version:
purl_commit = extract_commit_from_version(git_obj, purl.version)
purl_commit, _ = extract_commit_from_version(git_obj, purl.version)
if purl_commit and purl_commit != provenance_commit_digest:
logger.debug(
"The commit digest passed via purl input does not match what exists in the "
Expand Down
Loading

0 comments on commit bad6718

Please sign in to comment.