From 4504fdf43ced317ead7c0ce9316a8ebbd8f889fd Mon Sep 17 00:00:00 2001 From: "Ryan Addessi (raddessi)" Date: Thu, 14 Nov 2019 10:29:41 -0700 Subject: [PATCH 1/6] Added EXTENSIONS_NEED_SHEBANG_CHECK and logic --- identify/extensions.py | 15 ++++++++++++++- identify/identify.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/identify/extensions.py b/identify/extensions.py index e7aa969..9a7a6b0 100644 --- a/identify/extensions.py +++ b/identify/extensions.py @@ -123,7 +123,6 @@ 'scala': {'text', 'scala'}, 'scss': {'text', 'scss'}, 'sh': {'text', 'shell'}, - 'sls': {'text', 'salt'}, 'so': {'binary'}, 'sol': {'text', 'solidity'}, 'spec': {'text', 'spec'}, @@ -174,6 +173,20 @@ EXTENSIONS_NEED_BINARY_CHECK = { 'plist': {'plist'}, } +EXTENSIONS_NEED_SHEBANG_CHECK = { + 'sls': { + 'cheetah': {'text', 'salt-cheetah'}, + 'dson': {'text', 'salt-dson'}, + 'genshi': {'text', 'salt-genshi'}, + 'mako': {'text', 'salt-mako'}, + 'py': {'text', 'python', 'salt-py'}, + 'pydsl': {'text', 'python', 'salt-pydsl'}, + 'pyobjects': {'text', 'python', 'salt-pyobjects'}, + 'wempy': {'text', 'salt-wempy'}, + 'yamlex': {'text', 'salt-yamlex'}, + None: {'text', 'salt'}, + }, +} NAMES = { '.babelrc': {'text', 'json', 'babelrc'}, diff --git a/identify/identify.py b/identify/identify.py index a25d579..b87471c 100644 --- a/identify/identify.py +++ b/identify/identify.py @@ -28,6 +28,12 @@ ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY} ALL_TAGS.update(*extensions.EXTENSIONS.values()) ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values()) +ALL_TAGS.update( + *[ + shebang_tags + for shebang_tags in extensions.EXTENSIONS_NEED_SHEBANG_CHECK.values() + ] +) ALL_TAGS.update(*extensions.NAMES.values()) ALL_TAGS.update(*interpreters.INTERPRETERS.values()) ALL_TAGS = frozenset(ALL_TAGS) @@ -60,6 +66,8 @@ def tags_from_path(path): if len(shebang) > 0: tags.update(tags_from_interpreter(shebang[0])) + tags.update(tags_from_extension_specific_shebang(path)) + # some extensions can be both binary and text # see EXTENSIONS_NEED_BINARY_CHECK if not {TEXT, BINARY} & tags: @@ -73,6 +81,28 @@ def tags_from_path(path): return tags +def tags_from_extension_specific_shebang(path): + """Match tags from an extension that we need to read the shabang from.""" + ext = os.path.splitext(path)[1].lstrip('.').lower() + ret = set() + if ext not in extensions.EXTENSIONS_NEED_SHEBANG_CHECK: + return ret + + with open(path, 'rb') as f: + shebang = parse_shebang(f) + + try: + ret.update( + extensions.EXTENSIONS_NEED_SHEBANG_CHECK[ext][ + shebang[0] if shebang else None + ], + ) + except KeyError: + pass + + return ret + + def tags_from_filename(filename): _, filename = os.path.split(filename) _, ext = os.path.splitext(filename) From 7617fa2bf74cc852e0d4b61a9946081987308cf8 Mon Sep 17 00:00:00 2001 From: "Ryan Addessi (raddessi)" Date: Thu, 14 Nov 2019 11:13:19 -0700 Subject: [PATCH 2/6] Added tests for tags_from_extension_specific_shebang --- identify/identify.py | 3 +++ tests/identify_test.py | 47 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/identify/identify.py b/identify/identify.py index b87471c..90c9400 100644 --- a/identify/identify.py +++ b/identify/identify.py @@ -98,6 +98,9 @@ def tags_from_extension_specific_shebang(path): ], ) except KeyError: + # The extension merits inspection but the parsed shebang is not on the + # known list. In this case the actual format is probably unknown so + # don't return a tag pass return ret diff --git a/tests/identify_test.py b/tests/identify_test.py index 767d2d8..fc09219 100644 --- a/tests/identify_test.py +++ b/tests/identify_test.py @@ -109,6 +109,53 @@ def test_tags_from_path_plist_text(tmpdir): } +def test_tags_from_extension_specific_shebang_executable_file(tmpdir): + x = tmpdir.join('test.sls') + x.write('') + make_executable(x.strpath) + assert identify.tags_from_extension_specific_shebang(x.strpath) == { + 'salt', + 'text', + } + + +@pytest.mark.parametrize( + ('interpreter', 'expected'), + ( + ('cheetah', {'text', 'salt-cheetah'}), + ('dson', {'text', 'salt-dson'}), + ('genshi', {'text', 'salt-genshi'}), + ('mako', {'text', 'salt-mako'}), + ('py', {'text', 'python', 'salt-py'}), + ('pydsl', {'text', 'python', 'salt-pydsl'}), + ('pyobjects', {'text', 'python', 'salt-pyobjects'}), + ('wempy', {'text', 'salt-wempy'}), + ('yamlex', {'text', 'salt-yamlex'}), + + + + # Should not be tagged since we don't match the contents + ('/usr/bin/env python', set()), + ), +) +@pytest.mark.parametrize( + ('shebang_prefix',), + ( + ('#!',), + ('#! ',), + ), +) +def test_tags_from_extension_specific_shebang( + tmpdir, + shebang_prefix, + interpreter, + expected, +): + x = tmpdir.join('test.sls') + x.write(shebang_prefix + interpreter) + assert identify.tags_from_extension_specific_shebang(x.strpath) == expected + + @pytest.mark.parametrize( ('filename', 'expected'), ( From e74a495e4066c19604ae98d9721340f3899bfcec Mon Sep 17 00:00:00 2001 From: "Ryan Addessi (raddessi)" Date: Thu, 14 Nov 2019 11:38:10 -0700 Subject: [PATCH 3/6] Added a few more test cases that illustrate a shortcoming --- tests/identify_test.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/identify_test.py b/tests/identify_test.py index fc09219..dfcd4b6 100644 --- a/tests/identify_test.py +++ b/tests/identify_test.py @@ -132,10 +132,18 @@ def test_tags_from_extension_specific_shebang_executable_file(tmpdir): ('wempy', {'text', 'salt-wempy'}), ('yamlex', {'text', 'salt-yamlex'}), - + # Should be tagged as normal salt state files # FIXME: ..I think? + ('gpg', {'text', 'salt'}), + ('jinja', {'text', 'salt'}), + ('jinja|yaml', {'text', 'salt'}), + ('jinja|yaml|gpg', {'text', 'salt'}), + ('yaml', {'text', 'salt'}), + ('yaml|gpg', {'text', 'salt'}), # Should not be tagged since we don't match the contents ('/usr/bin/env python', set()), + ('python3', set()), + ('jinja|py', set()), ), ) @pytest.mark.parametrize( From d5d039fb41af8300a63d122cc6e92bfb63805c71 Mon Sep 17 00:00:00 2001 From: "Ryan Addessi (raddessi)" Date: Sat, 13 Feb 2021 20:01:59 -0700 Subject: [PATCH 4/6] Finally updated, but coverage is not happy --- identify/extensions.py | 20 +++++++++---------- identify/identify.py | 45 +++++++++++++++++++++++------------------- tests/identify_test.py | 40 ++++++++++++++++--------------------- 3 files changed, 51 insertions(+), 54 deletions(-) diff --git a/identify/extensions.py b/identify/extensions.py index 1e4a71f..ce938fb 100644 --- a/identify/extensions.py +++ b/identify/extensions.py @@ -52,9 +52,10 @@ 'gif': {'binary', 'image', 'gif'}, 'go': {'text', 'go'}, 'gotmpl': {'text', 'gotmpl'}, + 'gpg': {'text', 'gnupg'}, 'gpx': {'text', 'gpx', 'xml'}, - 'graphql': {'text', 'graphql'}, 'gradle': {'text', 'groovy'}, + 'graphql': {'text', 'graphql'}, 'groovy': {'text', 'groovy'}, 'gyb': {'text', 'gyb'}, 'gyp': {'text', 'gyp', 'python'}, @@ -100,6 +101,7 @@ 'lr': {'text', 'lektor'}, 'lua': {'text', 'lua'}, 'm': {'text', 'c', 'objective-c'}, + 'mako': {'text', 'mako'}, 'manifest': {'text', 'manifest'}, 'map': {'text', 'map'}, 'markdown': {'text', 'markdown'}, @@ -179,6 +181,7 @@ 'tgz': {'binary', 'gzip'}, 'thrift': {'text', 'thrift'}, 'tiff': {'binary', 'image', 'tiff'}, + 'tmpl': {'text', 'cheetah'}, 'toml': {'text', 'toml'}, 'ts': {'text', 'ts'}, 'tsx': {'text', 'tsx'}, @@ -223,18 +226,13 @@ EXTENSIONS_NEED_BINARY_CHECK = { 'plist': {'plist'}, } +# This should contain a map of file extensions to a map of interpreter names to +# their own file extensions EXTENSIONS_NEED_SHEBANG_CHECK = { 'sls': { - 'cheetah': {'text', 'salt-cheetah'}, - 'dson': {'text', 'salt-dson'}, - 'genshi': {'text', 'salt-genshi'}, - 'mako': {'text', 'salt-mako'}, - 'py': {'text', 'python', 'salt-py'}, - 'pydsl': {'text', 'python', 'salt-pydsl'}, - 'pyobjects': {'text', 'python', 'salt-pyobjects'}, - 'wempy': {'text', 'salt-wempy'}, - 'yamlex': {'text', 'salt-yamlex'}, - None: {'text', 'salt'}, + 'pydsl': 'py', + 'pyobjects': 'py', + 'cheetah': 'tmpl', }, } diff --git a/identify/identify.py b/identify/identify.py index 2be7436..0f17c7f 100644 --- a/identify/identify.py +++ b/identify/identify.py @@ -28,12 +28,6 @@ ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY} ALL_TAGS.update(*extensions.EXTENSIONS.values()) ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values()) -ALL_TAGS.update( - *[ - shebang_tags - for shebang_tags in extensions.EXTENSIONS_NEED_SHEBANG_CHECK.values() - ] -) ALL_TAGS.update(*extensions.NAMES.values()) ALL_TAGS.update(*interpreters.INTERPRETERS.values()) ALL_TAGS = frozenset(ALL_TAGS) @@ -82,26 +76,37 @@ def tags_from_path(path): def tags_from_extension_specific_shebang(path): - """Match tags from an extension that we need to read the shabang from.""" - ext = os.path.splitext(path)[1].lstrip('.').lower() + """Match tags from an extension that we need to read the shebang from.""" + _, filename = os.path.split(path) + _, ext = os.path.splitext(filename) ret = set() - if ext not in extensions.EXTENSIONS_NEED_SHEBANG_CHECK: + if ext.lstrip('.') not in extensions.EXTENSIONS_NEED_SHEBANG_CHECK: return ret + interpreter_to_extension_map = extensions.EXTENSIONS_NEED_SHEBANG_CHECK[ + ext.lstrip('.') + ] + with open(path, 'rb') as f: shebang = parse_shebang(f) - try: - ret.update( - extensions.EXTENSIONS_NEED_SHEBANG_CHECK[ext][ - shebang[0] if shebang else None - ], - ) - except KeyError: - # The extension merits inspection but the parsed shebang is not on the - # known list. In this case the actual format is probably unknown so - # don't return a tag - pass + if ext == '.sls': + if shebang: + # try to match tags for the file extension of the first interpreter + try: + first_interpreter = shebang[0].split('|')[0] + ret.update( + extensions.EXTENSIONS[ + interpreter_to_extension_map.get( + first_interpreter, first_interpreter, + ) + ], + ) + except (IndexError, KeyError): + pass + else: + # the default interpreter is jinja + ret.update(extensions.EXTENSIONS['jinja']) return ret diff --git a/tests/identify_test.py b/tests/identify_test.py index fce3e68..e2de0b4 100644 --- a/tests/identify_test.py +++ b/tests/identify_test.py @@ -123,7 +123,7 @@ def test_tags_from_extension_specific_shebang_executable_file(tmpdir): x.write('') make_executable(x.strpath) assert identify.tags_from_extension_specific_shebang(x.strpath) == { - 'salt', + 'jinja', 'text', } @@ -131,28 +131,22 @@ def test_tags_from_extension_specific_shebang_executable_file(tmpdir): @pytest.mark.parametrize( ('interpreter', 'expected'), ( - ('cheetah', {'text', 'salt-cheetah'}), - ('dson', {'text', 'salt-dson'}), - ('genshi', {'text', 'salt-genshi'}), - ('mako', {'text', 'salt-mako'}), - ('py', {'text', 'python', 'salt-py'}), - ('pydsl', {'text', 'python', 'salt-pydsl'}), - ('pyobjects', {'text', 'python', 'salt-pyobjects'}), - ('wempy', {'text', 'salt-wempy'}), - ('yamlex', {'text', 'salt-yamlex'}), - - # Should be tagged as normal salt state files # FIXME: ..I think? - ('gpg', {'text', 'salt'}), - ('jinja', {'text', 'salt'}), - ('jinja|yaml', {'text', 'salt'}), - ('jinja|yaml|gpg', {'text', 'salt'}), - ('yaml', {'text', 'salt'}), - ('yaml|gpg', {'text', 'salt'}), - - # Should not be tagged since we don't match the contents - ('/usr/bin/env python', set()), - ('python3', set()), - ('jinja|py', set()), + ('cheetah', {'text', 'cheetah'}), + ('dson', set()), + ('genshi', set()), + ('gpg', {'text', 'gnupg'}), + ('jinja', {'text', 'jinja'}), + ('jinja|py', {'text', 'jinja'}), + ('jinja|yaml', {'text', 'jinja'}), + ('jinja|yaml|gpg', {'text', 'jinja'}), + ('mako', {'text', 'mako'}), + ('py', {'text', 'python'}), + ('pydsl', {'text', 'python'}), + ('pyobjects', {'text', 'python'}), + ('wempy', set()), + ('yaml', {'text', 'yaml'}), + ('yamlex', set()), + ('yaml|gpg', {'text', 'yaml'}), ), ) @pytest.mark.parametrize( From 9e7bb37da0c8179b198242a26f74b67ea8096e58 Mon Sep 17 00:00:00 2001 From: "Ryan Addessi (raddessi)" Date: Tue, 9 Mar 2021 12:03:55 -0700 Subject: [PATCH 5/6] Removed unclear extensions and changes gpg type --- identify/extensions.py | 5 +---- identify/identify.py | 4 ++-- tests/identify_test.py | 4 +--- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/identify/extensions.py b/identify/extensions.py index ba8d61a..5bb2017 100644 --- a/identify/extensions.py +++ b/identify/extensions.py @@ -48,7 +48,7 @@ 'gif': {'binary', 'image', 'gif'}, 'go': {'text', 'go'}, 'gotmpl': {'text', 'gotmpl'}, - 'gpg': {'text', 'gnupg'}, + 'gpg': {'binary', 'gnupg'}, 'gpx': {'text', 'gpx', 'xml'}, 'gradle': {'text', 'groovy'}, 'graphql': {'text', 'graphql'}, @@ -97,7 +97,6 @@ 'lr': {'text', 'lektor'}, 'lua': {'text', 'lua'}, 'm': {'text', 'c', 'objective-c'}, - 'mako': {'text', 'mako'}, 'manifest': {'text', 'manifest'}, 'map': {'text', 'map'}, 'markdown': {'text', 'markdown'}, @@ -177,7 +176,6 @@ 'tgz': {'binary', 'gzip'}, 'thrift': {'text', 'thrift'}, 'tiff': {'binary', 'image', 'tiff'}, - 'tmpl': {'text', 'cheetah'}, 'toml': {'text', 'toml'}, 'ts': {'text', 'ts'}, 'tsx': {'text', 'tsx'}, @@ -228,7 +226,6 @@ 'sls': { 'pydsl': 'py', 'pyobjects': 'py', - 'cheetah': 'tmpl', }, } diff --git a/identify/identify.py b/identify/identify.py index 2246d3c..6fdead5 100644 --- a/identify/identify.py +++ b/identify/identify.py @@ -86,11 +86,11 @@ def tags_from_path(path: str) -> Set[str]: return tags -def tags_from_extension_specific_shebang(path): +def tags_from_extension_specific_shebang(path: str) -> Set[str]: """Match tags from an extension that we need to read the shebang from.""" _, filename = os.path.split(path) _, ext = os.path.splitext(filename) - ret = set() + ret = set() # type: Set[str] if ext.lstrip('.') not in extensions.EXTENSIONS_NEED_SHEBANG_CHECK: return ret diff --git a/tests/identify_test.py b/tests/identify_test.py index 994cd7f..e430953 100644 --- a/tests/identify_test.py +++ b/tests/identify_test.py @@ -158,15 +158,13 @@ def test_tags_from_extension_specific_shebang_executable_file(tmpdir): @pytest.mark.parametrize( ('interpreter', 'expected'), ( - ('cheetah', {'text', 'cheetah'}), ('dson', set()), ('genshi', set()), - ('gpg', {'text', 'gnupg'}), + ('gpg', {'binary', 'gnupg'}), ('jinja', {'text', 'jinja'}), ('jinja|py', {'text', 'jinja'}), ('jinja|yaml', {'text', 'jinja'}), ('jinja|yaml|gpg', {'text', 'jinja'}), - ('mako', {'text', 'mako'}), ('py', {'text', 'python'}), ('pydsl', {'text', 'python'}), ('pyobjects', {'text', 'python'}), From 96a56b9eaff85e1c9e20bca470d8a16ef2137e4e Mon Sep 17 00:00:00 2001 From: "Ryan Addessi (raddessi)" Date: Tue, 9 Mar 2021 13:25:57 -0700 Subject: [PATCH 6/6] Next attempt at doing this better --- identify/extensions.py | 10 +---- identify/identify.py | 80 ++++++++++++++++------------------------ identify/interpreters.py | 6 +++ tests/identify_test.py | 52 ++++++++++++-------------- 4 files changed, 62 insertions(+), 86 deletions(-) diff --git a/identify/extensions.py b/identify/extensions.py index 5bb2017..1a9b5d3 100644 --- a/identify/extensions.py +++ b/identify/extensions.py @@ -155,7 +155,6 @@ 'scss': {'text', 'scss'}, 'sh': {'text', 'shell'}, 'sln': {'text', 'sln'}, - 'sls': {'text', 'salt'}, 'so': {'binary'}, 'sol': {'text', 'solidity'}, 'spec': {'text', 'spec'}, @@ -220,13 +219,8 @@ EXTENSIONS_NEED_BINARY_CHECK = { 'plist': {'plist'}, } -# This should contain a map of file extensions to a map of interpreter names to -# their own file extensions -EXTENSIONS_NEED_SHEBANG_CHECK = { - 'sls': { - 'pydsl': 'py', - 'pyobjects': 'py', - }, +EXTENSIONS_NEED_INTERPRETER_CHECK = { + 'sls': {'text', 'salt'}, } NAMES = { diff --git a/identify/identify.py b/identify/identify.py index 6fdead5..051fdc9 100644 --- a/identify/identify.py +++ b/identify/identify.py @@ -33,6 +33,7 @@ _ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS} _ALL_TAGS.update(*extensions.EXTENSIONS.values()) _ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values()) +_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_INTERPRETER_CHECK.values()) _ALL_TAGS.update(*extensions.NAMES.values()) _ALL_TAGS.update(*interpreters.INTERPRETERS.values()) ALL_TAGS = frozenset(_ALL_TAGS) @@ -61,17 +62,27 @@ def tags_from_path(path: str) -> Set[str]: tags.add(NON_EXECUTABLE) # As an optimization, if we're able to read tags from the filename, then we - # don't peek at the file contents. + # don't peek at the file contents unless the file extension requires it t = tags_from_filename(os.path.basename(path)) if len(t) > 0: tags.update(t) - else: - if executable: - shebang = parse_shebang_from_file(path) - if len(shebang) > 0: - tags.update(tags_from_interpreter(shebang[0])) - tags.update(tags_from_extension_specific_shebang(path)) + ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.') + if ( + not len(t) and executable + ) or ext in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK: + try: + tags.update(extensions.EXTENSIONS_NEED_INTERPRETER_CHECK[ext]) + except KeyError: + pass + + shebang = parse_shebang_from_file(path) + if len(shebang) > 0: + tags.update( + tags_from_interpreter( + shebang[0].split('|')[0].strip(), + ), + ) # some extensions can be both binary and text # see EXTENSIONS_NEED_BINARY_CHECK @@ -86,42 +97,6 @@ def tags_from_path(path: str) -> Set[str]: return tags -def tags_from_extension_specific_shebang(path: str) -> Set[str]: - """Match tags from an extension that we need to read the shebang from.""" - _, filename = os.path.split(path) - _, ext = os.path.splitext(filename) - ret = set() # type: Set[str] - if ext.lstrip('.') not in extensions.EXTENSIONS_NEED_SHEBANG_CHECK: - return ret - - interpreter_to_extension_map = extensions.EXTENSIONS_NEED_SHEBANG_CHECK[ - ext.lstrip('.') - ] - - with open(path, 'rb') as f: - shebang = parse_shebang(f) - - if ext == '.sls': - if shebang: - # try to match tags for the file extension of the first interpreter - try: - first_interpreter = shebang[0].split('|')[0] - ret.update( - extensions.EXTENSIONS[ - interpreter_to_extension_map.get( - first_interpreter, first_interpreter, - ) - ], - ) - except (IndexError, KeyError): - pass - else: - # the default interpreter is jinja - ret.update(extensions.EXTENSIONS['jinja']) - - return ret - - def tags_from_filename(path: str) -> Set[str]: _, filename = os.path.split(path) _, ext = os.path.splitext(filename) @@ -166,7 +141,7 @@ def is_text(bytesio: IO[bytes]) -> bool: text_chars = ( bytearray([7, 8, 9, 10, 11, 12, 13, 27]) + bytearray(range(0x20, 0x7F)) + - bytearray(range(0x80, 0X100)) + bytearray(range(0x80, 0x100)) ) return not bool(bytesio.read(1024).translate(None, text_chars)) @@ -191,8 +166,8 @@ def _shebang_split(line: str) -> List[str]: def _parse_nix_shebang( - bytesio: IO[bytes], - cmd: Tuple[str, ...], + bytesio: IO[bytes], + cmd: Tuple[str, ...], ) -> Tuple[str, ...]: while bytesio.read(2) == b'#!': next_line_b = bytesio.readline() @@ -241,7 +216,14 @@ def parse_shebang_from_file(path: str) -> Tuple[str, ...]: """Parse the shebang given a file path.""" if not os.path.lexists(path): raise ValueError(f'{path} does not exist.') - if not os.access(path, os.X_OK): + ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.') + if ( + ext not in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK and + not os.access( + path, + os.X_OK, + ) + ): return () try: @@ -295,7 +277,7 @@ def license_id(filename: str) -> Optional[str]: return spdx # skip the slow calculation if the lengths are very different - if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05: + if norm and abs(len(norm) - len(norm_license)) / len(norm) > 0.05: continue edit_dist = editdistance.eval(norm, norm_license) @@ -304,7 +286,7 @@ def license_id(filename: str) -> Optional[str]: min_edit_dist_spdx = spdx # if there's less than 5% edited from the license, we found our match - if norm and min_edit_dist / len(norm) < .05: + if norm and min_edit_dist / len(norm) < 0.05: return min_edit_dist_spdx else: # no matches :'( diff --git a/identify/interpreters.py b/identify/interpreters.py index dabf36c..d5a8884 100644 --- a/identify/interpreters.py +++ b/identify/interpreters.py @@ -6,15 +6,21 @@ 'csh': {'shell', 'csh'}, 'dash': {'shell', 'dash'}, 'expect': {'expect'}, + 'gpg': {'gnupg'}, + 'jinja': {'jinja'}, 'ksh': {'shell', 'ksh'}, 'node': {'javascript'}, 'nodejs': {'javascript'}, 'perl': {'perl'}, + 'py': {'python'}, + 'pydsl': {'python'}, + 'pyobjects': {'python'}, 'python': {'python'}, 'python2': {'python', 'python2'}, 'python3': {'python', 'python3'}, 'ruby': {'ruby'}, 'sh': {'shell', 'sh'}, 'tcsh': {'shell', 'tcsh'}, + 'yaml': {'yaml'}, 'zsh': {'shell', 'zsh'}, } diff --git a/tests/identify_test.py b/tests/identify_test.py index e430953..a2669b9 100644 --- a/tests/identify_test.py +++ b/tests/identify_test.py @@ -145,51 +145,45 @@ def test_tags_from_path_plist_text(tmpdir): } -def test_tags_from_extension_specific_shebang_executable_file(tmpdir): - x = tmpdir.join('test.sls') - x.write('') - make_executable(x.strpath) - assert identify.tags_from_extension_specific_shebang(x.strpath) == { - 'jinja', - 'text', - } - - @pytest.mark.parametrize( ('interpreter', 'expected'), ( - ('dson', set()), - ('genshi', set()), - ('gpg', {'binary', 'gnupg'}), - ('jinja', {'text', 'jinja'}), - ('jinja|py', {'text', 'jinja'}), - ('jinja|yaml', {'text', 'jinja'}), - ('jinja|yaml|gpg', {'text', 'jinja'}), - ('py', {'text', 'python'}), - ('pydsl', {'text', 'python'}), - ('pyobjects', {'text', 'python'}), - ('wempy', set()), - ('yaml', {'text', 'yaml'}), - ('yamlex', set()), - ('yaml|gpg', {'text', 'yaml'}), + ('dson', {'salt', 'file', 'non-executable', 'text'}), + ('genshi', {'salt', 'file', 'non-executable', 'text'}), + ('gpg', {'salt', 'file', 'non-executable', 'text', 'gnupg'}), + ('jinja', {'salt', 'file', 'non-executable', 'text', 'jinja'}), + ('jinja|py', {'salt', 'file', 'non-executable', 'text', 'jinja'}), + ('jinja|yaml', {'salt', 'file', 'non-executable', 'text', 'jinja'}), + ( + 'jinja|yaml|gpg', { + 'salt', 'file', 'non-executable', 'text', 'jinja', + }, + ), + ('py', {'salt', 'file', 'non-executable', 'text', 'python'}), + ('pydsl', {'salt', 'file', 'non-executable', 'text', 'python'}), + ('pyobjects', {'salt', 'file', 'non-executable', 'text', 'python'}), + ('wempy', {'salt', 'file', 'non-executable', 'text'}), + ('yaml', {'salt', 'file', 'non-executable', 'text', 'yaml'}), + ('yamlex', {'salt', 'file', 'non-executable', 'text'}), + ('yaml|gpg', {'salt', 'file', 'non-executable', 'text', 'yaml'}), ), ) @pytest.mark.parametrize( - ('shebang_prefix',), + ('interpreter_prefix',), ( ('#!',), ('#! ',), ), ) -def test_tags_from_extension_specific_shebang( +def test_tags_from_path_with_interpreter_check( tmpdir, - shebang_prefix, + interpreter_prefix, interpreter, expected, ): x = tmpdir.join('test.sls') - x.write(shebang_prefix + interpreter) - assert identify.tags_from_extension_specific_shebang(x.strpath) == expected + x.write(interpreter_prefix + interpreter) + assert identify.tags_from_path(x.strpath) == expected @pytest.mark.parametrize(