Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable unknown license detection by default #3558

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion etc/scripts/licenses/synclic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import licensedcode
from licensedcode.cache import get_licenses_by_spdx_key
from licensedcode import models
from licensedcode.match import MATCH_HASH
from licensedcode.models import load_licenses
from licensedcode.models import License

Expand Down Expand Up @@ -253,7 +254,7 @@ def get_match(text):
len(matches) == 1
and rule.is_from_license
and len(rule_licenses) == 1
and match.matcher == "1-hash"
and match.matcher == MATCH_HASH
and match.score() == 100
and match.len() == query_len
)
Expand Down
81 changes: 67 additions & 14 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
from licensedcode.cache import build_spdx_license_expression
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.match import MATCH_UNKNOWN
from licensedcode.match import MATCH_UNDETECTED
from licensedcode.match import MATCH_HASH
from licensedcode.match import MATCH_AHO_EXACT
from licensedcode.match import MATCH_SPDX_ID
from licensedcode.models import UnDetectedRule
from licensedcode.models import compute_relevance
from licensedcode.models import Rule
Expand Down Expand Up @@ -69,7 +74,6 @@ def logger_debug(*args):
def logger_debug(*args):
    """
    Log ``args`` at DEBUG level as a single message joined with spaces.

    String arguments are emitted as-is; any other object is rendered with
    ``repr()``. Return whatever ``logger.debug`` returns.
    """
    # Use a real conditional expression: the ``cond and a or b`` form sent
    # empty strings through repr() because '' is falsy.
    return logger.debug(' '.join(a if isinstance(a, str) else repr(a) for a in args))

MATCHER_UNDETECTED = '5-undetected'

# Matches with a match_coverage below this value are not considered perfect
# detections
Expand Down Expand Up @@ -105,6 +109,7 @@ class DetectionCategory(Enum):
PACKAGE_ADD_FROM_FILE = 'from-package-file'
EXTRA_WORDS = 'extra-words'
UNKNOWN_MATCH = 'unknown-match'
UNKNOWN_NGRAMS_MATCH = 'unknown-ngrams-match'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
Expand Down Expand Up @@ -133,6 +138,7 @@ class DetectionRule(Enum):
CONTAINED_SAME_LICENSE = 'contained-with-same-license'
UNVERSIONED_FOLLOWED_BY_VERSIONED = 'un-versioned-followed-by-versioned'
UNDETECTED_LICENSE = 'undetected-license'
UNKNOWN_NGRAMS_MATCH = 'unknown-ngrams-match'
PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'package-unknown-reference-to-local-file'
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
PACKAGE_ADD_FROM_FILE = 'from-package-file'
Expand Down Expand Up @@ -961,10 +967,17 @@ def is_undetected_license_matches(license_matches):
if len(license_matches) != 1:
return False

if license_matches[0].matcher == MATCHER_UNDETECTED:
if license_matches[0].matcher == MATCH_UNDETECTED:
return True


def is_ngrams_unknown_license_matches(license_matches):
    """
    Return True if ``license_matches`` is a non-empty list of LicenseMatch
    objects whose ``matcher`` is ``MATCH_UNKNOWN`` for every match, and False
    otherwise.
    """
    # all() is vacuously True for an empty iterable: guard against it so that
    # "no matches at all" is never classified as an unknown ngrams match.
    if not license_matches:
        return False

    return all(
        license_match.matcher == MATCH_UNKNOWN
        for license_match in license_matches
    )


def is_correct_detection_non_unknown(license_matches):
"""
Return True if all the matches in ``license_matches`` List of LicenseMatch
Expand All @@ -988,7 +1001,7 @@ def is_correct_detection(license_matches):
]

return (
all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers)
all(matcher in (MATCH_HASH, MATCH_SPDX_ID, MATCH_AHO_EXACT) for matcher in matchers)
and all(is_match_coverage_perfect)
)

Expand Down Expand Up @@ -1309,14 +1322,19 @@ def get_detected_license_expression(
)

matches_for_expression = None
combined_expression = None
detection_log = []

if analysis == DetectionCategory.FALSE_POSITVE.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionRule.FALSE_POSITIVE.value}')
detection_log.append(DetectionRule.FALSE_POSITIVE.value)
return detection_log, combined_expression
return detection_log, None

elif analysis == DetectionCategory.UNKNOWN_NGRAMS_MATCH.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.UNKNOWN_NGRAMS_MATCH.value}')
matches_for_expression = license_matches
detection_log.append(DetectionRule.UNKNOWN_NGRAMS_MATCH.value)

elif analysis == DetectionCategory.UNDETECTED_LICENSE.value:
if TRACE_ANALYSIS:
Expand Down Expand Up @@ -1377,15 +1395,15 @@ def get_detected_license_expression(
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
detection_log.append(DetectionRule.LICENSE_CLUES.value)
return detection_log, combined_expression
return detection_log, None

elif analysis == DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
# TODO: we are temporarily returning these as license clues, and not
# in detections but ideally we should return synthetic unknowns for these
detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
return detection_log, combined_expression
return detection_log, None

else:
if TRACE_ANALYSIS:
Expand Down Expand Up @@ -1454,7 +1472,7 @@ def get_undetected_matches(query_string):
ispan=ispan,
hispan=hispan,
query_run_start=match_start,
matcher=MATCHER_UNDETECTED,
matcher=MATCH_UNDETECTED,
query=query_run.query,
)

Expand Down Expand Up @@ -1510,7 +1528,10 @@ def get_ambiguous_license_detections_by_type(unique_license_detections):

elif is_undetected_license_matches(license_matches=detection.matches):
ambi_license_detections[DetectionCategory.UNDETECTED_LICENSE.value] = detection


elif is_ngrams_unknown_license_matches(license_matches=detection.matches):
ambi_license_detections[DetectionCategory.UNKNOWN_NGRAMS_MATCH.value] = detection

elif has_correct_license_clue_matches(license_matches=detection.matches):
ambi_license_detections[DetectionCategory.LICENSE_CLUES.value] = detection

Expand Down Expand Up @@ -1542,7 +1563,10 @@ def analyze_detection(license_matches, package_license=False):
if TRACE:
logger_debug(f'license_matches {license_matches}', f'package_license {package_license}')

if is_undetected_license_matches(license_matches=license_matches):
if is_ngrams_unknown_license_matches(license_matches=license_matches):
return DetectionCategory.UNKNOWN_NGRAMS_MATCH.value

elif is_undetected_license_matches(license_matches=license_matches):
return DetectionCategory.UNDETECTED_LICENSE.value

elif has_unknown_intro_before_detection(license_matches=license_matches):
Expand Down Expand Up @@ -1593,6 +1617,20 @@ def analyze_detection(license_matches, package_license=False):
return DetectionCategory.PERFECT_DETECTION.value


def has_low_quality_matches(license_matches):
    """
    Return True if any group of matches built from the ``license_matches``
    list of LicenseMatch objects is analyzed as low quality match fragments,
    otherwise return False.

    Return False without running any analysis when ``license_matches`` is
    empty or None.
    """
    # detect_licenses() calls this before checking that there are matches at
    # all, so do not depend on how grouping and analysis treat empty input.
    if not license_matches:
        return False

    low_quality = DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value
    return any(
        analyze_detection(license_matches=group_of_matches) == low_quality
        for group_of_matches in group_matches(license_matches=license_matches)
    )


def group_matches(license_matches, lines_threshold=LINES_THRESHOLD):
"""
Given a list of ``license_matches`` LicenseMatch objects, yield lists of
Expand Down Expand Up @@ -1746,7 +1784,6 @@ def detect_licenses(
analysis=None,
post_scan=False,
package_license=False,
unknown_licenses=False,
min_score=0,
deadline=sys.maxsize,
as_expression=False,
Expand Down Expand Up @@ -1781,12 +1818,28 @@ def detect_licenses(
min_score=min_score,
deadline=deadline,
as_expression=as_expression,
unknown_licenses=unknown_licenses,
unknown_licenses=False,
**kwargs,
)

if not license_matches:
return
# TODO: Instead of analysing all matches once more and then re-matching the
# whole query with unknown license detection enabled, we should get query
# runs only for the low quality matches and run the unknown license matching
# on just those parts (the outcome would be the same, but with better
# performance)
if has_low_quality_matches(license_matches) or not license_matches:
unknown_license_matches = index.match(
location=location,
query_string=query_string,
min_score=min_score,
deadline=deadline,
unknown_licenses=True,
**kwargs,
)
if not unknown_license_matches:
return

license_matches = unknown_license_matches

if TRACE:
logger_debug(f"detection: detect_licenses: location: {location}: query_string: {query_string}")
Expand Down
12 changes: 11 additions & 1 deletion src/licensedcode/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@
TRACE_REPR_ALL_MATCHED_TEXTS = False


# All matchers
# Each constant is the string identifier of one matching strategy. These are
# the values compared against a match's ``matcher`` attribute. They are
# defined once here and imported by the matcher modules and the detection code.
# NOTE(review): the leading digit presumably reflects matcher ordering -- confirm.
MATCH_SPDX_ID = '1-spdx-id'  # imported by match_spdx_lid
MATCH_HASH = '1-hash'  # imported by match_hash
MATCH_AHO_EXACT = '2-aho'  # default ``matcher`` of match_aho.exact_match
MATCH_SEQ = '3-seq'  # imported by match_seq
MATCH_UNDETECTED = '5-undetected'  # set on matches built by detection.get_undetected_matches
MATCH_AHO_FRAG = '5-aho-frag'  # imported by match_aho
MATCH_UNKNOWN = '6-unknown'  # imported by match_unknown


def logger_debug(*args):
    """
    Accept any positional arguments and do nothing.
    """
    # NOTE(review): presumably rebound to a real logger when tracing is
    # enabled -- confirm against the rest of the module.
    return None


Expand Down Expand Up @@ -2606,7 +2616,7 @@ def is_candidate_false_positive(
# only tags or refs,
(match.rule.is_license_reference or match.rule.is_license_tag or match.rule.is_license_intro)
# but not tags that are SPDX license identifiers
and not match.matcher == '1-spdx-id'
and not match.matcher == MATCH_SPDX_ID
# exact matches only
and match.coverage() == 100

Expand Down
6 changes: 2 additions & 4 deletions src/licensedcode/match_aho.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

from licensedcode import SMALL_RULE
from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_AHO_EXACT
from licensedcode.match import MATCH_AHO_FRAG
from licensedcode.spans import Span

"""
Expand Down Expand Up @@ -75,10 +77,6 @@ def add_sequence(automaton, tids, rid, start=0, with_duplicates=False):
automaton.add_word(tokens, [value])


MATCH_AHO_EXACT = '2-aho'
MATCH_AHO_FRAG = '5-aho-frag'


def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
"""
Return a list of exact LicenseMatch by matching the `query_run` against
Expand Down
3 changes: 1 addition & 2 deletions src/licensedcode/match_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@


from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_HASH
from licensedcode.spans import Span

"""
Expand All @@ -38,8 +39,6 @@ def logger_debug(*args):
def logger_debug(*args):
    """
    Accept any positional arguments and do nothing.
    """
    # NOTE(review): presumably a placeholder used when tracing is off --
    # confirm against the rest of the module.
    return None

MATCH_HASH = '1-hash'


def tokens_hash(tokens):
"""
Expand Down
3 changes: 1 addition & 2 deletions src/licensedcode/match_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@


from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_SEQ
from licensedcode.spans import Span


Expand Down Expand Up @@ -44,8 +45,6 @@ def logger_debug(*args):
like approaches.
"""

MATCH_SEQ = '3-seq'


def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
match_blocks=None, deadline=sys.maxsize):
Expand Down
3 changes: 1 addition & 2 deletions src/licensedcode/match_spdx_lid.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from license_expression import Licensing

from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_SPDX_ID
from licensedcode.models import SpdxRule
from licensedcode.spans import Span
from textcode.markup import is_markup_text
Expand Down Expand Up @@ -58,8 +59,6 @@ def logger_debug(*args):
def logger_debug(*args):
    """
    Log ``args`` at DEBUG level as a single message joined with spaces.

    String arguments are emitted as-is; any other object is rendered with
    ``repr()``. Return whatever ``logger.debug`` returns.
    """
    # Use a real conditional expression: the ``cond and a or b`` form sent
    # empty strings through repr() because '' is falsy.
    return logger.debug(' '.join(a if isinstance(a, str) else repr(a) for a in args))

MATCH_SPDX_ID = '1-spdx-id'


def spdx_id_match(idx, query_run, text, expression_symbols=None):
"""
Expand Down
2 changes: 1 addition & 1 deletion src/licensedcode/match_unknown.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from licensedcode.models import UnknownRule
from licensedcode.match import get_full_qspan_matched_text
from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_UNKNOWN
from licensedcode.spans import Span

"""
Expand Down Expand Up @@ -43,7 +44,6 @@ def logger_debug(*args):
def logger_debug(*args):
    """
    Accept any positional arguments and do nothing.
    """
    # NOTE(review): presumably a placeholder used when tracing is off --
    # confirm against the rest of the module.
    return None

MATCH_UNKNOWN = '6-unknown'

UNKNOWN_NGRAM_LENGTH = 6

Expand Down
9 changes: 0 additions & 9 deletions src/licensedcode/plugin_license.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,6 @@ class LicenseScanner(ScanPlugin):
help_group=SCAN_OPTIONS_GROUP,
),

PluggableCommandLineOption(
('--unknown-licenses',),
is_flag=True,
required_options=['license'],
help='[EXPERIMENTAL] Detect unknown licenses. ',
help_group=SCAN_OPTIONS_GROUP,
)
]

def is_enabled(self, license, **kwargs): # NOQA
Expand All @@ -155,7 +148,6 @@ def get_scanner(
license_text_diagnostics=False,
license_diagnostics=False,
license_url_template=SCANCODE_LICENSEDB_URL,
unknown_licenses=False,
**kwargs
):

Expand All @@ -166,7 +158,6 @@ def get_scanner(
license_text_diagnostics=license_text_diagnostics,
license_diagnostics=license_diagnostics,
license_url_template=license_url_template,
unknown_licenses=unknown_licenses,
)

def process_codebase(self, codebase, license_diagnostics, **kwargs):
Expand Down
4 changes: 3 additions & 1 deletion src/licensedcode/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from functools import partial
import textwrap

from licensedcode.match import MATCH_UNKNOWN

"""
Utility function to trace matched texts used for tracing and testing.
"""
Expand All @@ -27,7 +29,7 @@ def get_texts(match, width=80, margin=0):
"""
qtokens = match.matched_text(whole_lines=False).split()
mqt = format_text(tokens=qtokens, width=width, margin=margin)
if match.matcher == '6-unknown':
if match.matcher == MATCH_UNKNOWN:
itokens = match.rule.text.split()
else:
itokens = matched_rule_tokens_str(match)
Expand Down
Loading
Loading