From 9cca82098ccb2f0c36b0037253b8d93fd9d61975 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Fri, 27 Oct 2023 17:06:37 +0530 Subject: [PATCH] Enable unknown license detection by default Signed-off-by: Ayan Sinha Mahapatra --- etc/scripts/licenses/synclic.py | 3 +- src/licensedcode/detection.py | 81 ++++++++++++++++++++++++------ src/licensedcode/match.py | 12 ++++- src/licensedcode/match_aho.py | 6 +-- src/licensedcode/match_hash.py | 3 +- src/licensedcode/match_seq.py | 3 +- src/licensedcode/match_spdx_lid.py | 3 +- src/licensedcode/match_unknown.py | 2 +- src/licensedcode/plugin_license.py | 9 ---- src/licensedcode/tracing.py | 4 +- src/scancode/api.py | 4 -- tests/licensedcode/test_detect.py | 7 +-- tests/licensedcode/test_match.py | 5 +- tests/licensedcode/test_query.py | 3 +- 14 files changed, 98 insertions(+), 47 deletions(-) diff --git a/etc/scripts/licenses/synclic.py b/etc/scripts/licenses/synclic.py index e3208430f81..69994b99742 100644 --- a/etc/scripts/licenses/synclic.py +++ b/etc/scripts/licenses/synclic.py @@ -28,6 +28,7 @@ import licensedcode from licensedcode.cache import get_licenses_by_spdx_key from licensedcode import models +from licensedcode.match import MATCH_HASH from licensedcode.models import load_licenses from licensedcode.models import License @@ -253,7 +254,7 @@ def get_match(text): len(matches) == 1 and rule.is_from_license and len(rule_licenses) == 1 - and match.matcher == "1-hash" + and match.matcher == MATCH_HASH and match.score() == 100 and match.len() == query_len ) diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py index a617735ae53..c9b31c8c5ad 100644 --- a/src/licensedcode/detection.py +++ b/src/licensedcode/detection.py @@ -28,6 +28,11 @@ from licensedcode.cache import build_spdx_license_expression from licensedcode.match import LicenseMatch from licensedcode.match import set_matched_lines +from licensedcode.match import MATCH_UNKNOWN +from licensedcode.match import MATCH_UNDETECTED +from 
licensedcode.match import MATCH_HASH +from licensedcode.match import MATCH_AHO_EXACT +from licensedcode.match import MATCH_SPDX_ID from licensedcode.models import UnDetectedRule from licensedcode.models import compute_relevance from licensedcode.models import Rule @@ -69,7 +74,6 @@ def logger_debug(*args): def logger_debug(*args): return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) -MATCHER_UNDETECTED = '5-undetected' # All values of match_coverage less than this value then they are not considered # as perfect detections @@ -105,6 +109,7 @@ class DetectionCategory(Enum): PACKAGE_ADD_FROM_FILE = 'from-package-file' EXTRA_WORDS = 'extra-words' UNKNOWN_MATCH = 'unknown-match' + UNKNOWN_NGRAMS_MATCH = 'unknown-ngrams-match' LICENSE_CLUES = 'license-clues' LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches' IMPERFECT_COVERAGE = 'imperfect-match-coverage' @@ -133,6 +138,7 @@ class DetectionRule(Enum): CONTAINED_SAME_LICENSE = 'contained-with-same-license' UNVERSIONED_FOLLOWED_BY_VERSIONED = 'un-versioned-followed-by-versioned' UNDETECTED_LICENSE = 'undetected-license' + UNKNOWN_NGRAMS_MATCH = 'unknown-ngrams-match' PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'package-unknown-reference-to-local-file' PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file' PACKAGE_ADD_FROM_FILE = 'from-package-file' @@ -961,10 +967,17 @@ def is_undetected_license_matches(license_matches): if len(license_matches) != 1: return False - if license_matches[0].matcher == MATCHER_UNDETECTED: + if license_matches[0].matcher == MATCH_UNDETECTED: return True +def is_ngrams_unknown_license_matches(license_matches): + return all([ + license_match.matcher == MATCH_UNKNOWN + for license_match in license_matches + ]) + + def is_correct_detection_non_unknown(license_matches): """ Return True if all the matches in ``license_matches`` List of LicenseMatch @@ -988,7 +1001,7 @@ def is_correct_detection(license_matches): ] return ( - all(matcher in ("1-hash", "1-spdx-id", 
"2-aho") for matcher in matchers) + all(matcher in (MATCH_HASH, MATCH_SPDX_ID, MATCH_AHO_EXACT) for matcher in matchers) and all(is_match_coverage_perfect) ) @@ -1309,14 +1322,19 @@ def get_detected_license_expression( ) matches_for_expression = None - combined_expression = None detection_log = [] if analysis == DetectionCategory.FALSE_POSITVE.value: if TRACE_ANALYSIS: logger_debug(f'analysis {DetectionRule.FALSE_POSITIVE.value}') detection_log.append(DetectionRule.FALSE_POSITIVE.value) - return detection_log, combined_expression + return detection_log, None + + elif analysis == DetectionCategory.UNKNOWN_NGRAMS_MATCH.value: + if TRACE_ANALYSIS: + logger_debug(f'analysis {DetectionCategory.UNKNOWN_NGRAMS_MATCH.value}') + matches_for_expression = license_matches + detection_log.append(DetectionRule.UNKNOWN_NGRAMS_MATCH.value) elif analysis == DetectionCategory.UNDETECTED_LICENSE.value: if TRACE_ANALYSIS: @@ -1377,7 +1395,7 @@ def get_detected_license_expression( if TRACE_ANALYSIS: logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}') detection_log.append(DetectionRule.LICENSE_CLUES.value) - return detection_log, combined_expression + return detection_log, None elif analysis == DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value: if TRACE_ANALYSIS: @@ -1385,7 +1403,7 @@ def get_detected_license_expression( # TODO: we are temporarily returning these as license clues, and not # in detections but ideally we should return synthetic unknowns for these detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value) - return detection_log, combined_expression + return detection_log, None else: if TRACE_ANALYSIS: @@ -1454,7 +1472,7 @@ def get_undetected_matches(query_string): ispan=ispan, hispan=hispan, query_run_start=match_start, - matcher=MATCHER_UNDETECTED, + matcher=MATCH_UNDETECTED, query=query_run.query, ) @@ -1510,7 +1528,10 @@ def get_ambiguous_license_detections_by_type(unique_license_detections): elif 
is_undetected_license_matches(license_matches=detection.matches): ambi_license_detections[DetectionCategory.UNDETECTED_LICENSE.value] = detection - + + elif is_ngrams_unknown_license_matches(license_matches=detection.matches): + ambi_license_detections[DetectionCategory.UNKNOWN_NGRAMS_MATCH.value] = detection + elif has_correct_license_clue_matches(license_matches=detection.matches): ambi_license_detections[DetectionCategory.LICENSE_CLUES.value] = detection @@ -1542,7 +1563,10 @@ def analyze_detection(license_matches, package_license=False): if TRACE: logger_debug(f'license_matches {license_matches}', f'package_license {package_license}') - if is_undetected_license_matches(license_matches=license_matches): + if is_ngrams_unknown_license_matches(license_matches=license_matches): + return DetectionCategory.UNKNOWN_NGRAMS_MATCH.value + + elif is_undetected_license_matches(license_matches=license_matches): return DetectionCategory.UNDETECTED_LICENSE.value elif has_unknown_intro_before_detection(license_matches=license_matches): @@ -1593,6 +1617,20 @@ def analyze_detection(license_matches, package_license=False): return DetectionCategory.PERFECT_DETECTION.value +def has_low_quality_matches(license_matches): + """ + Given a list of ``license_matches`` LicenseMatch objects, return True if + any of the LicenseMatch objects is a low quality match, otherwise return + False. 
+ """ + for group_of_matches in group_matches(license_matches=license_matches): + analysis = analyze_detection(license_matches=group_of_matches,) + if analysis == DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value: + return True + + return False + + def group_matches(license_matches, lines_threshold=LINES_THRESHOLD): """ Given a list of ``license_matches`` LicenseMatch objects, yield lists of @@ -1746,7 +1784,6 @@ def detect_licenses( analysis=None, post_scan=False, package_license=False, - unknown_licenses=False, min_score=0, deadline=sys.maxsize, as_expression=False, @@ -1781,12 +1818,28 @@ def detect_licenses( min_score=min_score, deadline=deadline, as_expression=as_expression, - unknown_licenses=unknown_licenses, + unknown_licenses=False, **kwargs, ) - if not license_matches: - return + # TODO: Instead of analysing all matches once more, and then matching the + # whole query with unknown license detection on, we should get query runs + # for only the matches with low quality matches and then run the specific + # unknown license matching on those parts (outcome would be same, but with + # better performance) + if has_low_quality_matches(license_matches) or not license_matches: + unknown_license_matches = index.match( + location=location, + query_string=query_string, + min_score=min_score, + deadline=deadline, + unknown_licenses=True, + **kwargs, + ) + if not unknown_license_matches: + return + + license_matches = unknown_license_matches if TRACE: logger_debug(f"detection: detect_licenses: location: {location}: query_string: {query_string}") diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index 7d7880606e3..f4e7fc1ec82 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -72,6 +72,16 @@ TRACE_REPR_ALL_MATCHED_TEXTS = False +# All matchers +MATCH_SPDX_ID = '1-spdx-id' +MATCH_HASH = '1-hash' +MATCH_AHO_EXACT = '2-aho' +MATCH_SEQ = '3-seq' +MATCH_UNDETECTED = '5-undetected' +MATCH_AHO_FRAG = '5-aho-frag' +MATCH_UNKNOWN = 
'6-unknown' + + def logger_debug(*args): pass @@ -2606,7 +2616,7 @@ def is_candidate_false_positive( # only tags or refs, (match.rule.is_license_reference or match.rule.is_license_tag or match.rule.is_license_intro) # but not tags that are SPDX license identifiers - and not match.matcher == '1-spdx-id' + and not match.matcher == MATCH_SPDX_ID # exact matches only and match.coverage() == 100 diff --git a/src/licensedcode/match_aho.py b/src/licensedcode/match_aho.py index 7a46821a31e..1129294d42c 100644 --- a/src/licensedcode/match_aho.py +++ b/src/licensedcode/match_aho.py @@ -13,6 +13,8 @@ from licensedcode import SMALL_RULE from licensedcode.match import LicenseMatch +from licensedcode.match import MATCH_AHO_EXACT +from licensedcode.match import MATCH_AHO_FRAG from licensedcode.spans import Span """ @@ -75,10 +77,6 @@ def add_sequence(automaton, tids, rid, start=0, with_duplicates=False): automaton.add_word(tokens, [value]) -MATCH_AHO_EXACT = '2-aho' -MATCH_AHO_FRAG = '5-aho-frag' - - def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs): """ Return a list of exact LicenseMatch by matching the `query_run` against diff --git a/src/licensedcode/match_hash.py b/src/licensedcode/match_hash.py index e0ed6c8b719..2bcb317ce14 100644 --- a/src/licensedcode/match_hash.py +++ b/src/licensedcode/match_hash.py @@ -12,6 +12,7 @@ from licensedcode.match import LicenseMatch +from licensedcode.match import MATCH_HASH from licensedcode.spans import Span """ @@ -38,8 +39,6 @@ def logger_debug(*args): def logger_debug(*args): pass -MATCH_HASH = '1-hash' - def tokens_hash(tokens): """ diff --git a/src/licensedcode/match_seq.py b/src/licensedcode/match_seq.py index 26e59e37f02..96dc1c65493 100644 --- a/src/licensedcode/match_seq.py +++ b/src/licensedcode/match_seq.py @@ -12,6 +12,7 @@ from licensedcode.match import LicenseMatch +from licensedcode.match import MATCH_SEQ from licensedcode.spans import Span @@ -44,8 +45,6 @@ def logger_debug(*args): like 
approaches. """ -MATCH_SEQ = '3-seq' - def match_sequence(idx, rule, query_run, high_postings, start_offset=0, match_blocks=None, deadline=sys.maxsize): diff --git a/src/licensedcode/match_spdx_lid.py b/src/licensedcode/match_spdx_lid.py index d5937fb1466..4c3346ebc4c 100644 --- a/src/licensedcode/match_spdx_lid.py +++ b/src/licensedcode/match_spdx_lid.py @@ -18,6 +18,7 @@ from license_expression import Licensing from licensedcode.match import LicenseMatch +from licensedcode.match import MATCH_SPDX_ID from licensedcode.models import SpdxRule from licensedcode.spans import Span from textcode.markup import is_markup_text @@ -58,8 +59,6 @@ def logger_debug(*args): def logger_debug(*args): return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) -MATCH_SPDX_ID = '1-spdx-id' - def spdx_id_match(idx, query_run, text, expression_symbols=None): """ diff --git a/src/licensedcode/match_unknown.py b/src/licensedcode/match_unknown.py index b83a62a01e0..0df000111e8 100644 --- a/src/licensedcode/match_unknown.py +++ b/src/licensedcode/match_unknown.py @@ -13,6 +13,7 @@ from licensedcode.models import UnknownRule from licensedcode.match import get_full_qspan_matched_text from licensedcode.match import LicenseMatch +from licensedcode.match import MATCH_UNKNOWN from licensedcode.spans import Span """ @@ -43,7 +44,6 @@ def logger_debug(*args): def logger_debug(*args): pass -MATCH_UNKNOWN = '6-unknown' UNKNOWN_NGRAM_LENGTH = 6 diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py index ecae0a6901b..a5d14f096a7 100644 --- a/src/licensedcode/plugin_license.py +++ b/src/licensedcode/plugin_license.py @@ -128,13 +128,6 @@ class LicenseScanner(ScanPlugin): help_group=SCAN_OPTIONS_GROUP, ), - PluggableCommandLineOption( - ('--unknown-licenses',), - is_flag=True, - required_options=['license'], - help='[EXPERIMENTAL] Detect unknown licenses. 
', - help_group=SCAN_OPTIONS_GROUP, - ) ] def is_enabled(self, license, **kwargs): # NOQA @@ -155,7 +148,6 @@ def get_scanner( license_text_diagnostics=False, license_diagnostics=False, license_url_template=SCANCODE_LICENSEDB_URL, - unknown_licenses=False, **kwargs ): @@ -166,7 +158,6 @@ def get_scanner( license_text_diagnostics=license_text_diagnostics, license_diagnostics=license_diagnostics, license_url_template=license_url_template, - unknown_licenses=unknown_licenses, ) def process_codebase(self, codebase, license_diagnostics, **kwargs): diff --git a/src/licensedcode/tracing.py b/src/licensedcode/tracing.py index bcbf2de4e37..edb5cd1523b 100644 --- a/src/licensedcode/tracing.py +++ b/src/licensedcode/tracing.py @@ -10,6 +10,8 @@ from functools import partial import textwrap +from licensedcode.match import MATCH_UNKNOWN + """ Utility function to trace matched texts used for tracing and testing. """ @@ -27,7 +29,7 @@ def get_texts(match, width=80, margin=0): """ qtokens = match.matched_text(whole_lines=False).split() mqt = format_text(tokens=qtokens, width=width, margin=margin) - if match.matcher == '6-unknown': + if match.matcher == MATCH_UNKNOWN: itokens = match.rule.text.split() else: itokens = matched_rule_tokens_str(match) diff --git a/src/scancode/api.py b/src/scancode/api.py index 7d3edbf1516..4f7c6575777 100644 --- a/src/scancode/api.py +++ b/src/scancode/api.py @@ -154,7 +154,6 @@ def get_licenses( license_text_diagnostics=False, license_diagnostics=False, deadline=sys.maxsize, - unknown_licenses=False, **kwargs, ): """ @@ -173,8 +172,6 @@ def get_licenses( `licenses` data as well as a file-level `percentage_of_license_text` as the percentage of file words detected as license text or notice. This is used to determine if a file contains mostly licensing. - - If ``unknown_licenses`` is True, also detect unknown licenses. 
""" from licensedcode.cache import build_spdx_license_expression from licensedcode.cache import get_cache @@ -191,7 +188,6 @@ def get_licenses( location=location, min_score=min_score, deadline=deadline, - unknown_licenses=unknown_licenses, **kwargs, ) diff --git a/tests/licensedcode/test_detect.py b/tests/licensedcode/test_detect.py index c44995fbd43..4e8b452f078 100644 --- a/tests/licensedcode/test_detect.py +++ b/tests/licensedcode/test_detect.py @@ -18,6 +18,7 @@ from licensedcode import match_seq from licensedcode.legalese import build_dictionary_from_iterable from licensedcode.match import LicenseMatch +from licensedcode.match import MATCH_AHO_EXACT from licensedcode.models import load_rules from licensedcode.spans import Span from licensedcode.tracing import get_texts @@ -930,9 +931,9 @@ def test_match_has_correct_positions_basic(self): matches = idx.match(query_string=querys) rule = [r for r in idx.rules_by_rid if r.identifier == 'gpl_69.RULE'][0] - m1 = LicenseMatch(rule=rule, matcher='2-aho', qspan=Span(0, 7), ispan=Span(0, 7), start_line=1, end_line=1) - m2 = LicenseMatch(rule=rule, matcher='2-aho', qspan=Span(8, 15), ispan=Span(0, 7), start_line=2, end_line=2) - m3 = LicenseMatch(rule=rule, matcher='2-aho', qspan=Span(16, 23), ispan=Span(0, 7), start_line=3, end_line=3) + m1 = LicenseMatch(rule=rule, matcher=MATCH_AHO_EXACT, qspan=Span(0, 7), ispan=Span(0, 7), start_line=1, end_line=1) + m2 = LicenseMatch(rule=rule, matcher=MATCH_AHO_EXACT, qspan=Span(8, 15), ispan=Span(0, 7), start_line=2, end_line=2) + m3 = LicenseMatch(rule=rule, matcher=MATCH_AHO_EXACT, qspan=Span(16, 23), ispan=Span(0, 7), start_line=3, end_line=3) assert matches == [m1, m2, m3] def test_match_has_correct_line_positions_for_query_with_repeats(self): diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py index 4baf1e9433e..c0e2d841a06 100644 --- a/tests/licensedcode/test_match.py +++ b/tests/licensedcode/test_match.py @@ -21,6 +21,7 @@ from 
licensedcode.match import get_full_matched_text from licensedcode.match import get_matching_regions from licensedcode.match import LicenseMatch +from licensedcode.match import MATCH_AHO_EXACT from licensedcode.match import merge_matches from licensedcode.match import reportable_tokens from licensedcode.match import restore_non_overlapping @@ -959,8 +960,8 @@ def test_filter_matches_handles_interlaced_matches_with_overlap_and_same_license query_loc = self.get_test_loc('match_filter/query') matches = idx.match(location=query_loc) expected = [ - # filtered: LicenseMatch(matcher='3-seq', rule=rules['rule1.RULE'], qspan=Span(4, 47) | Span(50, 59), ispan=Span(1, 53)), - LicenseMatch(matcher='2-aho', rule=rules['rule2.RULE'], qspan=Span(24, 85), ispan=Span(0, 61)), + # filtered: LicenseMatch(matcher=MATCH_SEQ, rule=rules['rule1.RULE'], qspan=Span(4, 47) | Span(50, 59), ispan=Span(1, 53)), + LicenseMatch(matcher=MATCH_AHO_EXACT, rule=rules['rule2.RULE'], qspan=Span(24, 85), ispan=Span(0, 61)), ] assert matches == expected diff --git a/tests/licensedcode/test_query.py b/tests/licensedcode/test_query.py index 8dd9f8ca43e..46cc9d6ce98 100644 --- a/tests/licensedcode/test_query.py +++ b/tests/licensedcode/test_query.py @@ -16,6 +16,7 @@ from licensedcode import cache from licensedcode import index from licensedcode import models +from licensedcode.match import MATCH_AHO_EXACT from licensedcode.legalese import build_dictionary_from_iterable from licensedcode.query import Query @@ -863,7 +864,7 @@ def test_match_does_not_change_query_unknown_positions(self): ][0] expected = LicenseMatch( - matcher='2-aho', + matcher=MATCH_AHO_EXACT, rule=rule, qspan=Span(0, 48), ispan=Span(0, 48),