Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added parsing for saltstack interpreted files #86

Closed
wants to merge 9 commits into from
7 changes: 5 additions & 2 deletions identify/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@
'gif': {'binary', 'image', 'gif'},
'go': {'text', 'go'},
'gotmpl': {'text', 'gotmpl'},
'gpg': {'binary', 'gnupg'},
'gpx': {'text', 'gpx', 'xml'},
'graphql': {'text', 'graphql'},
'gradle': {'text', 'groovy'},
'graphql': {'text', 'graphql'},
'groovy': {'text', 'groovy'},
'gyb': {'text', 'gyb'},
'gyp': {'text', 'gyp', 'python'},
Expand Down Expand Up @@ -154,7 +155,6 @@
'scss': {'text', 'scss'},
'sh': {'text', 'shell'},
'sln': {'text', 'sln'},
'sls': {'text', 'salt'},
'so': {'binary'},
'sol': {'text', 'solidity'},
'spec': {'text', 'spec'},
Expand Down Expand Up @@ -219,6 +219,9 @@
EXTENSIONS_NEED_BINARY_CHECK = {
'plist': {'plist'},
}
EXTENSIONS_NEED_INTERPRETER_CHECK = {
'sls': {'text', 'salt'},
}

NAMES = {
'.babelrc': EXTENSIONS['json'] | {'babelrc'},
Expand Down
44 changes: 32 additions & 12 deletions identify/identify.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
_ALL_TAGS.update(*extensions.EXTENSIONS.values())
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_INTERPRETER_CHECK.values())
_ALL_TAGS.update(*extensions.NAMES.values())
_ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(_ALL_TAGS)
Expand Down Expand Up @@ -61,15 +62,27 @@ def tags_from_path(path: str) -> Set[str]:
tags.add(NON_EXECUTABLE)

# As an optimization, if we're able to read tags from the filename, then we
# don't peek at the file contents.
# don't peek at the file contents unless the file extension requires it
t = tags_from_filename(os.path.basename(path))
if len(t) > 0:
tags.update(t)
else:
if executable:
shebang = parse_shebang_from_file(path)
if len(shebang) > 0:
tags.update(tags_from_interpreter(shebang[0]))

ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.')
Comment on lines +69 to +70
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't love that this introduces a need to parse the extension on all files, but I don't see a way around that when the logic to determine the extra requirement is by extension. Thoughts?

Copy link
Contributor

@CAM-Gerlach CAM-Gerlach Mar 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just

Suggested change
ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.')
ext = os.path.splitext(path)[-1].lstrip('.')

Also, you could move this to the if statement, given the other change I suggest below, though that would only avoid it in a subset of cases.

There isn't really any way I can see around it given this design, but you can at least parse and check it in only one place instead of two, as is currently being done (as my suggestions implement).

if (
not len(t) and executable
) or ext in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK:
try:
tags.update(extensions.EXTENSIONS_NEED_INTERPRETER_CHECK[ext])
except KeyError:
pass

Comment on lines +74 to +78
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
try:
tags.update(extensions.EXTENSIONS_NEED_INTERPRETER_CHECK[ext])
except KeyError:
pass

I would think it would both make more sense and be more efficient to add the extension-based tags in the same function the others are added in, tags_from_filename() around line 116, just like is done with EXTENSIONS_NEED_BINARY_CHECK, instead of a bespoke code block here that fires for every unknown file and follows a different pattern than the others. This also means you can just do os.path.splitext(path)[-1].lstrip('.') right in the if statement for only those files that need it (saving a bit of performance at the cost of readability).

shebang = parse_shebang_from_file(path)
if len(shebang) > 0:
tags.update(
tags_from_interpreter(
shebang[0].split('|')[0].strip(),
),
)
Comment on lines +81 to +85
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The splitting of the | delimited interpreters may be better located in the _shebang_split function. I can move it there if this looks good.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the | convention to separate multiple interpreters is non-standard and only used with one particular file type, I don't know about trying to parse it on all files. I can't find much Googling about other uses, but the one usage I can find has different syntax than salt uses. But that's ultimately up to @asottile .


# some extensions can be both binary and text
# see EXTENSIONS_NEED_BINARY_CHECK
Expand Down Expand Up @@ -128,7 +141,7 @@ def is_text(bytesio: IO[bytes]) -> bool:
text_chars = (
bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
bytearray(range(0x20, 0x7F)) +
bytearray(range(0x80, 0X100))
bytearray(range(0x80, 0x100))
)
return not bool(bytesio.read(1024).translate(None, text_chars))

Expand All @@ -153,8 +166,8 @@ def _shebang_split(line: str) -> List[str]:


def _parse_nix_shebang(
bytesio: IO[bytes],
cmd: Tuple[str, ...],
bytesio: IO[bytes],
cmd: Tuple[str, ...],
) -> Tuple[str, ...]:
while bytesio.read(2) == b'#!':
next_line_b = bytesio.readline()
Expand Down Expand Up @@ -203,7 +216,14 @@ def parse_shebang_from_file(path: str) -> Tuple[str, ...]:
"""Parse the shebang given a file path."""
if not os.path.lexists(path):
raise ValueError(f'{path} does not exist.')
if not os.access(path, os.X_OK):
ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.')
if (
ext not in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK and
not os.access(
path,
os.X_OK,
)
):
Comment on lines +219 to +226
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the other place the extension is needed. Should it be passed to the function instead of re-parsed?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest eliminating this check entirely unless there's something I'm missing (which is entirely possible); see below.

return ()

Comment on lines +219 to 228
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.')
if (
ext not in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK and
not os.access(
path,
os.X_OK,
)
):
return ()

@asottile The only function that calls this already checks this same condition before running parse_shebang_from_file(), so is there a reason it needs to be checked twice? There might be something I'm missing, but if not, eliminating the redundant check would reduce complexity and increase performance without an obvious downside, and it also prevents any other third-party consumers from using it on files not marked executable if that restriction doesn't make sense for their application.

try:
Expand Down Expand Up @@ -257,7 +277,7 @@ def license_id(filename: str) -> Optional[str]:
return spdx

# skip the slow calculation if the lengths are very different
if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
if norm and abs(len(norm) - len(norm_license)) / len(norm) > 0.05:
continue
Comment on lines 279 to 281
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't make a few of these changes, they were done via precommit or black.. Maybe the precommit config changed recently?


edit_dist = editdistance.eval(norm, norm_license)
Expand All @@ -266,7 +286,7 @@ def license_id(filename: str) -> Optional[str]:
min_edit_dist_spdx = spdx

# if there's less than 5% edited from the license, we found our match
if norm and min_edit_dist / len(norm) < .05:
if norm and min_edit_dist / len(norm) < 0.05:
return min_edit_dist_spdx
else:
# no matches :'(
Expand Down
6 changes: 6 additions & 0 deletions identify/interpreters.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,21 @@
'csh': {'shell', 'csh'},
'dash': {'shell', 'dash'},
'expect': {'expect'},
'gpg': {'gnupg'},
'jinja': {'jinja'},
'ksh': {'shell', 'ksh'},
'node': {'javascript'},
'nodejs': {'javascript'},
'perl': {'perl'},
'py': {'python'},
'pydsl': {'python'},
'pyobjects': {'python'},
'python': {'python'},
'python2': {'python', 'python2'},
'python3': {'python', 'python3'},
'ruby': {'ruby'},
'sh': {'shell', 'sh'},
'tcsh': {'shell', 'tcsh'},
'yaml': {'yaml'},
'zsh': {'shell', 'zsh'},
}
41 changes: 41 additions & 0 deletions tests/identify_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,47 @@ def test_tags_from_path_plist_text(tmpdir):
}


@pytest.mark.parametrize(
('interpreter', 'expected'),
(
('dson', {'salt', 'file', 'non-executable', 'text'}),
('genshi', {'salt', 'file', 'non-executable', 'text'}),
('gpg', {'salt', 'file', 'non-executable', 'text', 'gnupg'}),
('jinja', {'salt', 'file', 'non-executable', 'text', 'jinja'}),
('jinja|py', {'salt', 'file', 'non-executable', 'text', 'jinja'}),
('jinja|yaml', {'salt', 'file', 'non-executable', 'text', 'jinja'}),
(
'jinja|yaml|gpg', {
'salt', 'file', 'non-executable', 'text', 'jinja',
},
),
Comment on lines +157 to +161
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does have the benefit of files with the extension .gpg being marked as binary while files with the gpg interpreter still get marked as text.

Copy link
Author

@raddessi raddessi Mar 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I apologize, I quoted the wrong section. The relevant line is 153:

('gpg', {'salt', 'file', 'non-executable', 'text', 'gnupg'}),

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

('py', {'salt', 'file', 'non-executable', 'text', 'python'}),
('pydsl', {'salt', 'file', 'non-executable', 'text', 'python'}),
('pyobjects', {'salt', 'file', 'non-executable', 'text', 'python'}),
('wempy', {'salt', 'file', 'non-executable', 'text'}),
('yaml', {'salt', 'file', 'non-executable', 'text', 'yaml'}),
('yamlex', {'salt', 'file', 'non-executable', 'text'}),
('yaml|gpg', {'salt', 'file', 'non-executable', 'text', 'yaml'}),
),
)
@pytest.mark.parametrize(
('interpreter_prefix',),
(
('#!',),
('#! ',),
),
)
def test_tags_from_path_with_interpreter_check(
tmpdir,
interpreter_prefix,
interpreter,
expected,
):
x = tmpdir.join('test.sls')
x.write(interpreter_prefix + interpreter)
assert identify.tags_from_path(x.strpath) == expected


@pytest.mark.parametrize(
('filename', 'expected'),
(
Expand Down