From 6f9d7f4e43fc40dcff853e8b2f104b52ce70c131 Mon Sep 17 00:00:00 2001 From: behnazh-w Date: Sun, 5 Jan 2025 22:26:06 +1000 Subject: [PATCH] fix: report known malware even when not labeled Signed-off-by: behnazh-w --- src/macaron/config/defaults.ini | 7 +- .../checks/detect_malicious_metadata_check.py | 49 ++++++----- .../package_registry/deps_dev.py | 83 +++++++++++++++++++ .../package_registry/package_registry.py | 47 +++-------- tests/integration/cases/ultralytics/policy.dl | 10 +++ tests/integration/cases/ultralytics/test.yaml | 21 +++++ .../cases/ultralytics_8.3.46/policy.dl | 10 +++ .../cases/ultralytics_8.3.46/test.yaml | 21 +++++ .../test_detect_malicious_metadata_check.py | 6 +- .../package_registry/test_npm_registry.py | 39 ++++++++- 10 files changed, 229 insertions(+), 64 deletions(-) create mode 100644 src/macaron/slsa_analyzer/package_registry/deps_dev.py create mode 100644 tests/integration/cases/ultralytics/policy.dl create mode 100644 tests/integration/cases/ultralytics/test.yaml create mode 100644 tests/integration/cases/ultralytics_8.3.46/policy.dl create mode 100644 tests/integration/cases/ultralytics_8.3.46/test.yaml diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index ae0b72cb8..07692d201 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -1,4 +1,4 @@ -# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. [requests] @@ -538,6 +538,11 @@ registry_url_scheme = https fileserver_url_netloc = files.pythonhosted.org fileserver_url_scheme = https +[deps_dev] +url_netloc = api.deps.dev +url_scheme = https +v3alpha_purl_endpoint = v3alpha/purl + # Configuration options for selecting the checks to run. 
# Both the exclude and include are defined as list of strings: # - The exclude list is used to specify the checks that will not run. diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 15daf8d65..094e6212d 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This check examines the metadata of pypi packages with seven heuristics.""" @@ -11,7 +11,7 @@ from macaron.database.db_custom_types import DBJsonDict from macaron.database.table_definitions import CheckFacts -from macaron.errors import HeuristicAnalyzerValueError +from macaron.errors import HeuristicAnalyzerValueError, InvalidHTTPResponseError from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics @@ -28,6 +28,7 @@ from macaron.slsa_analyzer.build_tool.poetry import Poetry from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType +from macaron.slsa_analyzer.package_registry.deps_dev import DepsDevService from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry from macaron.slsa_analyzer.registry import registry from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo @@ -177,7 +178,7 @@ def __init__(self) -> None: """Initialize a check instance.""" check_id = 
"mcn_detect_malicious_metadata_1" description = """This check analyzes the metadata of a package based on reports malicious behavior. - Supported ecosystem: PyPI. + Supported ecosystem for unknown malware: PyPI. """ super().__init__(check_id=check_id, description=description, eval_reqs=[]) @@ -259,21 +260,28 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: The result of the check. """ result_tables: list[CheckFacts] = [] - # First check if this package is a known malware + package_registry_info_entries = ctx.dynamic_data["package_registries"] + # First check if this package is a known malware url = "https://api.osv.dev/v1/query" data = {"package": {"purl": ctx.component.purl}} - response = send_post_http_raw(url, json_data=data, headers=None) - res_obj = None - if response: - try: - res_obj = response.json() - except requests.exceptions.JSONDecodeError as error: - logger.debug("Unable to get a valid response from %s: %s", url, error) - if res_obj: - for vuln in res_obj.get("vulns", {}): - v_id = json_extract(vuln, ["id"], str) - if v_id and v_id.startswith("MAL-"): + package_exists = False  # Assume absent if the deps.dev lookup fails. + try: + package_exists = bool(DepsDevService.get_package_info(ctx.component.purl)) + except InvalidHTTPResponseError as error: + logger.debug(error) + + if not package_exists: + response = send_post_http_raw(url, json_data=data, headers=None) + res_obj = None + if response: + try: + res_obj = response.json() + except requests.exceptions.JSONDecodeError as error: + logger.debug("Unable to get a valid response from %s: %s", url, error) + if res_obj: + for vuln in res_obj.get("vulns", {}): + v_id = json_extract(vuln, ["id"], str) result_tables.append( MaliciousMetadataFacts( known_malware=f"https://osv.dev/vulnerability/{v_id}", @@ -282,13 +290,12 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: confidence=Confidence.HIGH, ) ) - if result_tables: - return CheckResultData( - result_tables=result_tables, - result_type=CheckResultType.FAILED, - ) + if result_tables: + return
CheckResultData( + result_tables=result_tables, + result_type=CheckResultType.FAILED, + ) - package_registry_info_entries = ctx.dynamic_data["package_registries"] for package_registry_info_entry in package_registry_info_entries: match package_registry_info_entry: case PackageRegistryInfo( diff --git a/src/macaron/slsa_analyzer/package_registry/deps_dev.py b/src/macaron/slsa_analyzer/package_registry/deps_dev.py new file mode 100644 index 000000000..769ff99e3 --- /dev/null +++ b/src/macaron/slsa_analyzer/package_registry/deps_dev.py @@ -0,0 +1,83 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This module contains implementation of deps.dev service.""" + +import json +import logging +import urllib.parse +from urllib.parse import quote as encode + +import requests + +from macaron.config.defaults import defaults +from macaron.errors import ConfigurationError, InvalidHTTPResponseError +from macaron.util import send_get_http_raw + +logger: logging.Logger = logging.getLogger(__name__) + + +class DepsDevService: + """The deps.dev service class.""" + + @staticmethod + def get_package_info(purl: str) -> dict | None: + """Check if the package identified by the PackageURL (PURL) exists and return its information. + + Parameters + ---------- + purl: str + The PackageURL (PURL). + + Returns + ------- + dict | None + The package metadata or None if it doesn't exist. + + Raises + ------ + InvalidHTTPResponseError + If a network error happens or unexpected response is returned by the API. + """ + section_name = "deps_dev" + if not defaults.has_section(section_name): + return None + section = defaults[section_name] + + url_netloc = section.get("url_netloc") + if not url_netloc: + raise ConfigurationError( + f'The "url_netloc" key is missing in section [{section_name}] of the .ini configuration file.' 
+ ) + url_scheme = section.get("url_scheme", "https") + v3alpha_purl_endpoint = section.get("v3alpha_purl_endpoint") + if not v3alpha_purl_endpoint: + raise ConfigurationError( + f'The "v3alpha_purl_endpoint" key is missing in section [{section_name}] of the .ini configuration file.' + ) + + path_params = "/".join([v3alpha_purl_endpoint, encode(purl, safe="")]) + try: + url = urllib.parse.urlunsplit( + urllib.parse.SplitResult( + scheme=url_scheme, + netloc=url_netloc, + path=path_params, + query="", + fragment="", + ) + ) + except ValueError as error: + raise InvalidHTTPResponseError("Failed to construct the API URL.") from error + + response = send_get_http_raw(url) + if response and response.text: + try: + metadata: dict = json.loads(response.text) + except json.JSONDecodeError as error:  # json.loads raises the stdlib exception. + raise InvalidHTTPResponseError(f"Failed to process response from deps.dev for {url}.") from error + if not metadata: + raise InvalidHTTPResponseError(f"Empty response returned by {url} .") + return metadata + + return None diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py index 55ae778b7..624a67b7c 100644 --- a/src/macaron/slsa_analyzer/package_registry/package_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py @@ -1,21 +1,16 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
"""This module defines package registries.""" -import json import logging -import urllib.parse from abc import ABC, abstractmethod from datetime import datetime -from urllib.parse import quote as encode - -import requests from macaron.errors import InvalidHTTPResponseError from macaron.json_tools import json_extract from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool -from macaron.util import send_get_http_raw +from macaron.slsa_analyzer.package_registry.deps_dev import DepsDevService logger: logging.Logger = logging.getLogger(__name__) @@ -50,7 +45,7 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool: based on the given build tool. """ - def find_publish_timestamp(self, purl: str, registry_url: str | None = None) -> datetime: + def find_publish_timestamp(self, purl: str) -> datetime: """Retrieve the publication timestamp for a package specified by its purl from the deps.dev repository by default. This method constructs a request URL based on the provided purl, sends an HTTP GET @@ -65,8 +60,6 @@ def find_publish_timestamp(self, purl: str, registry_url: str | None = None) -> purl: str The Package URL (purl) of the package whose publication timestamp is to be retrieved. This should conform to the PURL specification. - registry_url: str | None - The registry URL that can be set for testing. Returns ------- @@ -86,40 +79,20 @@ def find_publish_timestamp(self, purl: str, registry_url: str | None = None) -> # in the AnalyzeContext object retrieved by the Repo Finder. This step should be # implemented at the beginning of the analyze command to ensure that the data # is available for subsequent processing. 
- - base_url_parsed = urllib.parse.urlparse(registry_url or "https://api.deps.dev") - path_params = "/".join(["v3alpha", "purl", encode(purl, safe="")]) try: - url = urllib.parse.urlunsplit( - urllib.parse.SplitResult( - scheme=base_url_parsed.scheme, - netloc=base_url_parsed.netloc, - path=path_params, - query="", - fragment="", - ) - ) - except ValueError as error: - raise InvalidHTTPResponseError("Failed to construct the API URL.") from error - - response = send_get_http_raw(url) - if response and response.text: - try: - metadata: dict = json.loads(response.text) - except requests.exceptions.JSONDecodeError as error: - raise InvalidHTTPResponseError(f"Failed to process response from deps.dev for {url}.") from error - if not metadata: - raise InvalidHTTPResponseError(f"Empty response returned by {url} .") - + metadata = DepsDevService.get_package_info(purl) + except InvalidHTTPResponseError as error: + raise InvalidHTTPResponseError(f"Invalid response from deps.dev for {purl}.") from error + if metadata: timestamp = json_extract(metadata, ["version", "publishedAt"], str) if not timestamp: - raise InvalidHTTPResponseError(f"The timestamp is missing in the response returned by {url}.") + raise InvalidHTTPResponseError(f"The timestamp is missing in the response returned for {purl}.") logger.debug("Found timestamp: %s.", timestamp) try: return datetime.fromisoformat(timestamp) except ValueError as error: - raise InvalidHTTPResponseError(f"The timestamp returned by {url} is invalid") from error + raise InvalidHTTPResponseError(f"The timestamp returned for {purl} is invalid") from error - raise InvalidHTTPResponseError(f"Invalid response from deps.dev for {url}.") + raise InvalidHTTPResponseError(f"Invalid response from deps.dev for {purl}.") diff --git a/tests/integration/cases/ultralytics/policy.dl b/tests/integration/cases/ultralytics/policy.dl new file mode 100644 index 000000000..22dbc90f1 --- /dev/null +++ b/tests/integration/cases/ultralytics/policy.dl @@ -0,0 
+1,10 @@ +/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("check-malicious-package", component_id, "Check the malicious package.") :- + check_passed(component_id, "mcn_detect_malicious_metadata_1"). + +apply_policy_to("check-malicious-package", component_id) :- + is_component(component_id, "pkg:pypi/ultralytics"). diff --git a/tests/integration/cases/ultralytics/test.yaml b/tests/integration/cases/ultralytics/test.yaml new file mode 100644 index 000000000..a10ac556b --- /dev/null +++ b/tests/integration/cases/ultralytics/test.yaml @@ -0,0 +1,21 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Analyzing a popular package that some of its versions are compromised. + +tags: +- macaron-python-package +- macaron-docker-image + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/ultralytics +- name: Run macaron verify-policy to verify that the malicious metadata check passes. + kind: verify + options: + policy: policy.dl diff --git a/tests/integration/cases/ultralytics_8.3.46/policy.dl b/tests/integration/cases/ultralytics_8.3.46/policy.dl new file mode 100644 index 000000000..a7202543e --- /dev/null +++ b/tests/integration/cases/ultralytics_8.3.46/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("check-malicious-package", component_id, "Check the malicious package.") :- + check_failed(component_id, "mcn_detect_malicious_metadata_1"). 
+ +apply_policy_to("check-malicious-package", component_id) :- + is_component(component_id, "pkg:pypi/ultralytics@8.3.46"). diff --git a/tests/integration/cases/ultralytics_8.3.46/test.yaml b/tests/integration/cases/ultralytics_8.3.46/test.yaml new file mode 100644 index 000000000..2ff3e934c --- /dev/null +++ b/tests/integration/cases/ultralytics_8.3.46/test.yaml @@ -0,0 +1,21 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Analyzing a known malicious package. + +tags: +- macaron-python-package +- macaron-docker-image + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/ultralytics@8.3.46 +- name: Run macaron verify-policy to verify that the malicious metadata check fails. + kind: verify + options: + policy: policy.dl diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index 45786aa78..bbf28b8c4 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""Module to test the malicious metadata detection check.""" @@ -62,6 +62,10 @@ def test_detect_malicious_metadata( registry_url_scheme = {base_url_parsed.scheme} fileserver_url_netloc = {base_url_parsed.netloc} fileserver_url_scheme = {base_url_parsed.scheme} + + [deps_dev] + url_netloc = {base_url_parsed.netloc} + url_scheme = {base_url_parsed.scheme} """ user_config_path = os.path.join(tmp_path, "config.ini") with open(user_config_path, "w", encoding="utf-8") as user_config_file: diff --git a/tests/slsa_analyzer/package_registry/test_npm_registry.py b/tests/slsa_analyzer/package_registry/test_npm_registry.py index ef4ed893e..a6cadb4ba 100644 --- a/tests/slsa_analyzer/package_registry/test_npm_registry.py +++ b/tests/slsa_analyzer/package_registry/test_npm_registry.py @@ -1,9 +1,10 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""Tests for the npm registry.""" import os +import urllib from datetime import datetime from pathlib import Path @@ -146,6 +147,7 @@ def test_npm_attestation_asset_url( def test_find_publish_timestamp( resources_path: Path, httpserver: HTTPServer, + tmp_path: Path, purl: str, npm_json_path: str, expected_timestamp: str, @@ -153,6 +155,20 @@ def test_find_publish_timestamp( """Test that the function finds the timestamp correctly.""" registry = NPMRegistry() + base_url_parsed = urllib.parse.urlparse(httpserver.url_for("")) + user_config_input = f""" + [deps_dev] + url_netloc = {base_url_parsed.netloc} + url_scheme = {base_url_parsed.scheme} + """ + user_config_path = os.path.join(tmp_path, "config.ini") + with open(user_config_path, "w", encoding="utf-8") as user_config_file: + user_config_file.write(user_config_input) + # We don't have to worry about modifying the ``defaults`` object causing test + # pollution here, since we reload the ``defaults`` object before every test with the + # ``setup_test`` fixture. 
+ load_defaults(user_config_path) + with open(os.path.join(resources_path, "npm_registry_files", npm_json_path), encoding="utf8") as page: response = page.read() @@ -160,7 +176,7 @@ def test_find_publish_timestamp( "/".join(["/v3alpha", "purl", purl]), ).respond_with_data(response) - publish_time_obj = registry.find_publish_timestamp(purl=purl, registry_url=httpserver.url_for("")) + publish_time_obj = registry.find_publish_timestamp(purl=purl) expected_time_obj = datetime.strptime(expected_timestamp, "%Y-%m-%dT%H:%M:%S%z") assert publish_time_obj == expected_time_obj @@ -176,13 +192,14 @@ def test_find_publish_timestamp( ( "pkg:npm/@sigstore/mock@0.7.5", "invalid_sigstore.mock@0.7.5.json", - "The timestamp is missing in the response returned by", + "The timestamp is missing in the response returned for", ), ], ) def test_find_publish_timestamp_errors( resources_path: Path, httpserver: HTTPServer, + tmp_path: Path, purl: str, npm_json_path: str, expected_msg: str, @@ -190,6 +207,20 @@ def test_find_publish_timestamp_errors( """Test that the function handles errors correctly.""" registry = NPMRegistry() + base_url_parsed = urllib.parse.urlparse(httpserver.url_for("")) + user_config_input = f""" + [deps_dev] + url_netloc = {base_url_parsed.netloc} + url_scheme = {base_url_parsed.scheme} + """ + user_config_path = os.path.join(tmp_path, "config.ini") + with open(user_config_path, "w", encoding="utf-8") as user_config_file: + user_config_file.write(user_config_input) + # We don't have to worry about modifying the ``defaults`` object causing test + # pollution here, since we reload the ``defaults`` object before every test with the + # ``setup_test`` fixture. 
+ load_defaults(user_config_path) + with open(os.path.join(resources_path, "npm_registry_files", npm_json_path), encoding="utf8") as page: response = page.read() @@ -199,4 +230,4 @@ def test_find_publish_timestamp_errors( pat = f"^{expected_msg}" with pytest.raises(InvalidHTTPResponseError, match=pat): - registry.find_publish_timestamp(purl=purl, registry_url=httpserver.url_for("")) + registry.find_publish_timestamp(purl=purl)