From 9cca82098ccb2f0c36b0037253b8d93fd9d61975 Mon Sep 17 00:00:00 2001
From: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
Date: Fri, 27 Oct 2023 17:06:37 +0530
Subject: [PATCH] Enable unknown license detection by default

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
---
 etc/scripts/licenses/synclic.py    |  3 +-
 src/licensedcode/detection.py      | 81 ++++++++++++++++++++++++------
 src/licensedcode/match.py          | 12 ++++-
 src/licensedcode/match_aho.py      |  6 +--
 src/licensedcode/match_hash.py     |  3 +-
 src/licensedcode/match_seq.py      |  3 +-
 src/licensedcode/match_spdx_lid.py |  3 +-
 src/licensedcode/match_unknown.py  |  2 +-
 src/licensedcode/plugin_license.py |  9 ----
 src/licensedcode/tracing.py        |  4 +-
 src/scancode/api.py                |  4 --
 tests/licensedcode/test_detect.py  |  7 +--
 tests/licensedcode/test_match.py   |  5 +-
 tests/licensedcode/test_query.py   |  3 +-
 14 files changed, 98 insertions(+), 47 deletions(-)

diff --git a/etc/scripts/licenses/synclic.py b/etc/scripts/licenses/synclic.py
index e3208430f81..69994b99742 100644
--- a/etc/scripts/licenses/synclic.py
+++ b/etc/scripts/licenses/synclic.py
@@ -28,6 +28,7 @@
 import licensedcode
 from licensedcode.cache import get_licenses_by_spdx_key
 from licensedcode import models
+from licensedcode.match import MATCH_HASH
 from licensedcode.models import load_licenses
 from licensedcode.models import License
 
@@ -253,7 +254,7 @@ def get_match(text):
         len(matches) == 1
         and rule.is_from_license
         and len(rule_licenses) == 1
-        and match.matcher == "1-hash"
+        and match.matcher == MATCH_HASH
         and match.score() == 100
         and match.len() == query_len
     )
diff --git a/src/licensedcode/detection.py b/src/licensedcode/detection.py
index a617735ae53..c9b31c8c5ad 100644
--- a/src/licensedcode/detection.py
+++ b/src/licensedcode/detection.py
@@ -28,6 +28,11 @@
 from licensedcode.cache import build_spdx_license_expression
 from licensedcode.match import LicenseMatch
 from licensedcode.match import set_matched_lines
+from licensedcode.match import MATCH_UNKNOWN
+from licensedcode.match import MATCH_UNDETECTED
+from licensedcode.match import MATCH_HASH
+from licensedcode.match import MATCH_AHO_EXACT
+from licensedcode.match import MATCH_SPDX_ID
 from licensedcode.models import UnDetectedRule
 from licensedcode.models import compute_relevance
 from licensedcode.models import Rule
@@ -69,7 +74,6 @@ def logger_debug(*args):
         def logger_debug(*args):
             return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
 
-MATCHER_UNDETECTED = '5-undetected'
 
 # All values of match_coverage less than this value then they are not considered
 # as perfect detections
@@ -105,6 +109,7 @@ class DetectionCategory(Enum):
     PACKAGE_ADD_FROM_FILE = 'from-package-file'
     EXTRA_WORDS = 'extra-words'
     UNKNOWN_MATCH = 'unknown-match'
+    UNKNOWN_NGRAMS_MATCH = 'unknown-ngrams-match'
     LICENSE_CLUES = 'license-clues'
     LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
     IMPERFECT_COVERAGE = 'imperfect-match-coverage'
@@ -133,6 +138,7 @@ class DetectionRule(Enum):
     CONTAINED_SAME_LICENSE = 'contained-with-same-license'
     UNVERSIONED_FOLLOWED_BY_VERSIONED = 'un-versioned-followed-by-versioned'
     UNDETECTED_LICENSE = 'undetected-license'
+    UNKNOWN_NGRAMS_MATCH = 'unknown-ngrams-match'
     PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'package-unknown-reference-to-local-file'
     PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
     PACKAGE_ADD_FROM_FILE = 'from-package-file'
@@ -961,10 +967,17 @@ def is_undetected_license_matches(license_matches):
     if len(license_matches) != 1:
         return False
 
-    if license_matches[0].matcher == MATCHER_UNDETECTED:
+    if license_matches[0].matcher == MATCH_UNDETECTED:
         return True
 
 
+def is_ngrams_unknown_license_matches(license_matches):
+    """Return True if there are matches and all are from the unknown matcher."""
+    # all() is True for an empty iterable: require at least one match too.
+    return bool(license_matches) and all(
+        license_match.matcher == MATCH_UNKNOWN for license_match in license_matches)
+
+
 def is_correct_detection_non_unknown(license_matches):
     """
     Return True if all the matches in ``license_matches`` List of LicenseMatch
@@ -988,7 +1001,7 @@ def is_correct_detection(license_matches):
     ]
 
     return (
-        all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers)
+        all(matcher in (MATCH_HASH, MATCH_SPDX_ID, MATCH_AHO_EXACT) for matcher in matchers)
         and all(is_match_coverage_perfect)
     )
 
@@ -1309,14 +1322,19 @@ def get_detected_license_expression(
         )
 
     matches_for_expression = None
-    combined_expression = None
     detection_log = []
 
     if analysis == DetectionCategory.FALSE_POSITVE.value:
         if TRACE_ANALYSIS:
             logger_debug(f'analysis {DetectionRule.FALSE_POSITIVE.value}')
         detection_log.append(DetectionRule.FALSE_POSITIVE.value)
-        return detection_log, combined_expression
+        return detection_log, None
+
+    elif analysis == DetectionCategory.UNKNOWN_NGRAMS_MATCH.value:
+        if TRACE_ANALYSIS:
+            logger_debug(f'analysis {DetectionCategory.UNKNOWN_NGRAMS_MATCH.value}')
+        matches_for_expression = license_matches
+        detection_log.append(DetectionRule.UNKNOWN_NGRAMS_MATCH.value)
 
     elif analysis == DetectionCategory.UNDETECTED_LICENSE.value:
         if TRACE_ANALYSIS:
@@ -1377,7 +1395,7 @@ def get_detected_license_expression(
         if TRACE_ANALYSIS:
             logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
         detection_log.append(DetectionRule.LICENSE_CLUES.value)
-        return detection_log, combined_expression
+        return detection_log, None
 
     elif analysis == DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value:
         if TRACE_ANALYSIS:
@@ -1385,7 +1403,7 @@ def get_detected_license_expression(
         # TODO: we are temporarily returning these as license clues, and not
         # in detections but ideally we should return synthetic unknowns for these
         detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
-        return detection_log, combined_expression
+        return detection_log, None
 
     else:
         if TRACE_ANALYSIS:
@@ -1454,7 +1472,7 @@ def get_undetected_matches(query_string):
         ispan=ispan,
         hispan=hispan,
         query_run_start=match_start,
-        matcher=MATCHER_UNDETECTED,
+        matcher=MATCH_UNDETECTED,
         query=query_run.query,
     )
 
@@ -1510,7 +1528,10 @@ def get_ambiguous_license_detections_by_type(unique_license_detections):
 
         elif is_undetected_license_matches(license_matches=detection.matches):
             ambi_license_detections[DetectionCategory.UNDETECTED_LICENSE.value] = detection
-        
+
+        elif is_ngrams_unknown_license_matches(license_matches=detection.matches):
+            ambi_license_detections[DetectionCategory.UNKNOWN_NGRAMS_MATCH.value] = detection
+
         elif has_correct_license_clue_matches(license_matches=detection.matches):
             ambi_license_detections[DetectionCategory.LICENSE_CLUES.value] = detection
 
@@ -1542,7 +1563,10 @@ def analyze_detection(license_matches, package_license=False):
     if TRACE:
         logger_debug(f'license_matches {license_matches}', f'package_license {package_license}')
 
-    if is_undetected_license_matches(license_matches=license_matches):
+    if is_ngrams_unknown_license_matches(license_matches=license_matches):
+        return DetectionCategory.UNKNOWN_NGRAMS_MATCH.value
+
+    elif is_undetected_license_matches(license_matches=license_matches):
         return DetectionCategory.UNDETECTED_LICENSE.value
 
     elif has_unknown_intro_before_detection(license_matches=license_matches):
@@ -1593,6 +1617,20 @@ def analyze_detection(license_matches, package_license=False):
         return DetectionCategory.PERFECT_DETECTION.value
 
 
+def has_low_quality_matches(license_matches):
+    """
+    Return True if any group of matches built from the ``license_matches``
+    list of LicenseMatch objects is analyzed as low quality match fragments.
+    Return False otherwise.
+    """
+    low_quality = DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value
+    # any() stops at the first low quality group found
+    return any(
+        analyze_detection(license_matches=matches_group) == low_quality
+        for matches_group in group_matches(license_matches=license_matches)
+    )
+
+
 def group_matches(license_matches, lines_threshold=LINES_THRESHOLD):
     """
     Given a list of ``license_matches`` LicenseMatch objects, yield lists of
@@ -1746,7 +1784,6 @@ def detect_licenses(
     analysis=None,
     post_scan=False,
     package_license=False,
-    unknown_licenses=False,
     min_score=0,
     deadline=sys.maxsize,
     as_expression=False,
@@ -1781,12 +1818,28 @@ def detect_licenses(
         min_score=min_score,
         deadline=deadline,
         as_expression=as_expression,
-        unknown_licenses=unknown_licenses,
+        unknown_licenses=False,
         **kwargs,
     )
 
-    if not license_matches:
-        return
+    # TODO: Instead of analyzing all the matches once more and then matching
+    # the whole query again with unknown license detection enabled, we should
+    # get query runs only for the low quality matches and run the unknown
+    # license matching only on these parts: the outcome would be the same,
+    # with better performance.
+    if has_low_quality_matches(license_matches) or not license_matches:
+        unknown_license_matches = index.match(
+            location=location,
+            query_string=query_string,
+            min_score=min_score,
+            deadline=deadline,
+            unknown_licenses=True,
+            **kwargs,
+        )
+        if not unknown_license_matches:
+            return
+
+        license_matches = unknown_license_matches
 
     if TRACE:
         logger_debug(f"detection: detect_licenses: location: {location}: query_string: {query_string}")
diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
index 7d7880606e3..f4e7fc1ec82 100644
--- a/src/licensedcode/match.py
+++ b/src/licensedcode/match.py
@@ -72,6 +72,16 @@
 TRACE_REPR_ALL_MATCHED_TEXTS = False
 
 
+# Identifiers of all matchers, as stored in the LicenseMatch.matcher attribute
+MATCH_SPDX_ID = '1-spdx-id'
+MATCH_HASH = '1-hash'
+MATCH_AHO_EXACT = '2-aho'
+MATCH_SEQ = '3-seq'
+MATCH_UNDETECTED = '5-undetected'
+MATCH_AHO_FRAG = '5-aho-frag'
+MATCH_UNKNOWN = '6-unknown'
+
+
 def logger_debug(*args): pass
 
 
@@ -2606,7 +2616,7 @@ def is_candidate_false_positive(
         # only tags or refs,
         (match.rule.is_license_reference or match.rule.is_license_tag or match.rule.is_license_intro)
         # but not tags that are SPDX license identifiers
-        and not match.matcher == '1-spdx-id'
+        and not match.matcher == MATCH_SPDX_ID
         # exact matches only
         and match.coverage() == 100
 
diff --git a/src/licensedcode/match_aho.py b/src/licensedcode/match_aho.py
index 7a46821a31e..1129294d42c 100644
--- a/src/licensedcode/match_aho.py
+++ b/src/licensedcode/match_aho.py
@@ -13,6 +13,8 @@
 
 from licensedcode import SMALL_RULE
 from licensedcode.match import LicenseMatch
+from licensedcode.match import MATCH_AHO_EXACT
+from licensedcode.match import MATCH_AHO_FRAG
 from licensedcode.spans import Span
 
 """
@@ -75,10 +77,6 @@ def add_sequence(automaton, tids, rid, start=0, with_duplicates=False):
         automaton.add_word(tokens, [value])
 
 
-MATCH_AHO_EXACT = '2-aho'
-MATCH_AHO_FRAG = '5-aho-frag'
-
-
 def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
     """
     Return a list of exact LicenseMatch by matching the `query_run` against
diff --git a/src/licensedcode/match_hash.py b/src/licensedcode/match_hash.py
index e0ed6c8b719..2bcb317ce14 100644
--- a/src/licensedcode/match_hash.py
+++ b/src/licensedcode/match_hash.py
@@ -12,6 +12,7 @@
 
 
 from licensedcode.match import LicenseMatch
+from licensedcode.match import MATCH_HASH
 from licensedcode.spans import Span
 
 """
@@ -38,8 +39,6 @@ def logger_debug(*args):
     def logger_debug(*args):
         pass
 
-MATCH_HASH = '1-hash'
-
 
 def tokens_hash(tokens):
     """
diff --git a/src/licensedcode/match_seq.py b/src/licensedcode/match_seq.py
index 26e59e37f02..96dc1c65493 100644
--- a/src/licensedcode/match_seq.py
+++ b/src/licensedcode/match_seq.py
@@ -12,6 +12,7 @@
 
 
 from licensedcode.match import LicenseMatch
+from licensedcode.match import MATCH_SEQ
 from licensedcode.spans import Span
 
 
@@ -44,8 +45,6 @@ def logger_debug(*args):
 like approaches.
 """
 
-MATCH_SEQ = '3-seq'
-
 
 def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
                    match_blocks=None, deadline=sys.maxsize):
diff --git a/src/licensedcode/match_spdx_lid.py b/src/licensedcode/match_spdx_lid.py
index d5937fb1466..4c3346ebc4c 100644
--- a/src/licensedcode/match_spdx_lid.py
+++ b/src/licensedcode/match_spdx_lid.py
@@ -18,6 +18,7 @@
 from license_expression import Licensing
 
 from licensedcode.match import LicenseMatch
+from licensedcode.match import MATCH_SPDX_ID
 from licensedcode.models import SpdxRule
 from licensedcode.spans import Span
 from textcode.markup import is_markup_text
@@ -58,8 +59,6 @@ def logger_debug(*args):
     def logger_debug(*args):
         return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
 
-MATCH_SPDX_ID = '1-spdx-id'
-
 
 def spdx_id_match(idx, query_run, text, expression_symbols=None):
     """
diff --git a/src/licensedcode/match_unknown.py b/src/licensedcode/match_unknown.py
index b83a62a01e0..0df000111e8 100644
--- a/src/licensedcode/match_unknown.py
+++ b/src/licensedcode/match_unknown.py
@@ -13,6 +13,7 @@
 from licensedcode.models import UnknownRule
 from licensedcode.match import get_full_qspan_matched_text
 from licensedcode.match import LicenseMatch
+from licensedcode.match import MATCH_UNKNOWN
 from licensedcode.spans import Span
 
 """
@@ -43,7 +44,6 @@ def logger_debug(*args):
     def logger_debug(*args):
         pass
 
-MATCH_UNKNOWN = '6-unknown'
 
 UNKNOWN_NGRAM_LENGTH = 6
 
diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py
index ecae0a6901b..a5d14f096a7 100644
--- a/src/licensedcode/plugin_license.py
+++ b/src/licensedcode/plugin_license.py
@@ -128,13 +128,6 @@ class LicenseScanner(ScanPlugin):
             help_group=SCAN_OPTIONS_GROUP,
         ),
 
-        PluggableCommandLineOption(
-            ('--unknown-licenses',),
-            is_flag=True,
-            required_options=['license'],
-            help='[EXPERIMENTAL] Detect unknown licenses. ',
-            help_group=SCAN_OPTIONS_GROUP,
-        )
     ]
 
     def is_enabled(self, license, **kwargs):  # NOQA
@@ -155,7 +148,6 @@ def get_scanner(
         license_text_diagnostics=False,
         license_diagnostics=False,
         license_url_template=SCANCODE_LICENSEDB_URL,
-        unknown_licenses=False,
         **kwargs
     ):
 
@@ -166,7 +158,6 @@ def get_scanner(
             license_text_diagnostics=license_text_diagnostics,
             license_diagnostics=license_diagnostics,
             license_url_template=license_url_template,
-            unknown_licenses=unknown_licenses,
         )
 
     def process_codebase(self, codebase, license_diagnostics, **kwargs):
diff --git a/src/licensedcode/tracing.py b/src/licensedcode/tracing.py
index bcbf2de4e37..edb5cd1523b 100644
--- a/src/licensedcode/tracing.py
+++ b/src/licensedcode/tracing.py
@@ -10,6 +10,8 @@
 from functools import partial
 import textwrap
 
+from licensedcode.match import MATCH_UNKNOWN
+
 """
 Utility function to trace matched texts used for tracing and testing.
 """
@@ -27,7 +29,7 @@ def get_texts(match, width=80, margin=0):
     """
     qtokens = match.matched_text(whole_lines=False).split()
     mqt = format_text(tokens=qtokens, width=width, margin=margin)
-    if match.matcher == '6-unknown':
+    if match.matcher == MATCH_UNKNOWN:
         itokens = match.rule.text.split()
     else:
         itokens = matched_rule_tokens_str(match)
diff --git a/src/scancode/api.py b/src/scancode/api.py
index 7d3edbf1516..4f7c6575777 100644
--- a/src/scancode/api.py
+++ b/src/scancode/api.py
@@ -154,7 +154,6 @@ def get_licenses(
     license_text_diagnostics=False,
     license_diagnostics=False,
     deadline=sys.maxsize,
-    unknown_licenses=False,
     **kwargs,
 ):
     """
@@ -173,8 +172,6 @@ def get_licenses(
     `licenses` data as well as a file-level `percentage_of_license_text` 
     as the percentage of file words detected as license text or notice.
     This is used to determine if a file contains mostly licensing.
-
-    If ``unknown_licenses`` is True, also detect unknown licenses.
     """
     from licensedcode.cache import build_spdx_license_expression
     from licensedcode.cache import get_cache
@@ -191,7 +188,6 @@ def get_licenses(
         location=location,
         min_score=min_score,
         deadline=deadline,
-        unknown_licenses=unknown_licenses,
         **kwargs,
     )
 
diff --git a/tests/licensedcode/test_detect.py b/tests/licensedcode/test_detect.py
index c44995fbd43..4e8b452f078 100644
--- a/tests/licensedcode/test_detect.py
+++ b/tests/licensedcode/test_detect.py
@@ -18,6 +18,7 @@
 from licensedcode import match_seq
 from licensedcode.legalese import build_dictionary_from_iterable
 from licensedcode.match import LicenseMatch
+from licensedcode.match import MATCH_AHO_EXACT
 from licensedcode.models import load_rules
 from licensedcode.spans import Span
 from licensedcode.tracing import get_texts
@@ -930,9 +931,9 @@ def test_match_has_correct_positions_basic(self):
         matches = idx.match(query_string=querys)
 
         rule = [r for r in idx.rules_by_rid if r.identifier == 'gpl_69.RULE'][0]
-        m1 = LicenseMatch(rule=rule, matcher='2-aho', qspan=Span(0, 7), ispan=Span(0, 7), start_line=1, end_line=1)
-        m2 = LicenseMatch(rule=rule, matcher='2-aho', qspan=Span(8, 15), ispan=Span(0, 7), start_line=2, end_line=2)
-        m3 = LicenseMatch(rule=rule, matcher='2-aho', qspan=Span(16, 23), ispan=Span(0, 7), start_line=3, end_line=3)
+        m1 = LicenseMatch(rule=rule, matcher=MATCH_AHO_EXACT, qspan=Span(0, 7), ispan=Span(0, 7), start_line=1, end_line=1)
+        m2 = LicenseMatch(rule=rule, matcher=MATCH_AHO_EXACT, qspan=Span(8, 15), ispan=Span(0, 7), start_line=2, end_line=2)
+        m3 = LicenseMatch(rule=rule, matcher=MATCH_AHO_EXACT, qspan=Span(16, 23), ispan=Span(0, 7), start_line=3, end_line=3)
         assert matches == [m1, m2, m3]
 
     def test_match_has_correct_line_positions_for_query_with_repeats(self):
diff --git a/tests/licensedcode/test_match.py b/tests/licensedcode/test_match.py
index 4baf1e9433e..c0e2d841a06 100644
--- a/tests/licensedcode/test_match.py
+++ b/tests/licensedcode/test_match.py
@@ -21,6 +21,7 @@
 from licensedcode.match import get_full_matched_text
 from licensedcode.match import get_matching_regions
 from licensedcode.match import LicenseMatch
+from licensedcode.match import MATCH_AHO_EXACT
 from licensedcode.match import merge_matches
 from licensedcode.match import reportable_tokens
 from licensedcode.match import restore_non_overlapping
@@ -959,8 +960,8 @@ def test_filter_matches_handles_interlaced_matches_with_overlap_and_same_license
         query_loc = self.get_test_loc('match_filter/query')
         matches = idx.match(location=query_loc)
         expected = [
-            # filtered: LicenseMatch(matcher='3-seq', rule=rules['rule1.RULE'], qspan=Span(4, 47) | Span(50, 59), ispan=Span(1, 53)),
-            LicenseMatch(matcher='2-aho', rule=rules['rule2.RULE'], qspan=Span(24, 85), ispan=Span(0, 61)),
+            # filtered: LicenseMatch(matcher=MATCH_SEQ, rule=rules['rule1.RULE'], qspan=Span(4, 47) | Span(50, 59), ispan=Span(1, 53)),
+            LicenseMatch(matcher=MATCH_AHO_EXACT, rule=rules['rule2.RULE'], qspan=Span(24, 85), ispan=Span(0, 61)),
         ]
 
         assert matches == expected
diff --git a/tests/licensedcode/test_query.py b/tests/licensedcode/test_query.py
index 8dd9f8ca43e..46cc9d6ce98 100644
--- a/tests/licensedcode/test_query.py
+++ b/tests/licensedcode/test_query.py
@@ -16,6 +16,7 @@
 from licensedcode import cache
 from licensedcode import index
 from licensedcode import models
+from licensedcode.match import MATCH_AHO_EXACT
 from licensedcode.legalese import build_dictionary_from_iterable
 from licensedcode.query import Query
 
@@ -863,7 +864,7 @@ def test_match_does_not_change_query_unknown_positions(self):
         ][0]
 
         expected = LicenseMatch(
-            matcher='2-aho',
+            matcher=MATCH_AHO_EXACT,
             rule=rule,
             qspan=Span(0, 48),
             ispan=Span(0, 48),