Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added parsing for saltstack interpreted files #86

Closed
wants to merge 9 commits into from
7 changes: 5 additions & 2 deletions identify/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@
'gif': {'binary', 'image', 'gif'},
'go': {'text', 'go'},
'gotmpl': {'text', 'gotmpl'},
'gpg': {'binary', 'gnupg'},
'gpx': {'text', 'gpx', 'xml'},
'graphql': {'text', 'graphql'},
'gradle': {'text', 'groovy'},
'graphql': {'text', 'graphql'},
'groovy': {'text', 'groovy'},
'gyb': {'text', 'gyb'},
'gyp': {'text', 'gyp', 'python'},
Expand Down Expand Up @@ -154,7 +155,6 @@
'scss': {'text', 'scss'},
'sh': {'text', 'shell'},
'sln': {'text', 'sln'},
'sls': {'text', 'salt'},
'so': {'binary'},
'sol': {'text', 'solidity'},
'spec': {'text', 'spec'},
Expand Down Expand Up @@ -219,6 +219,9 @@
EXTENSIONS_NEED_BINARY_CHECK = {
'plist': {'plist'},
}
EXTENSIONS_NEED_INTERPRETER_CHECK = {
'sls': {'text', 'salt'},
}

NAMES = {
'.babelrc': EXTENSIONS['json'] | {'babelrc'},
Expand Down
44 changes: 32 additions & 12 deletions identify/identify.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
_ALL_TAGS.update(*extensions.EXTENSIONS.values())
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_INTERPRETER_CHECK.values())
_ALL_TAGS.update(*extensions.NAMES.values())
_ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(_ALL_TAGS)
Expand Down Expand Up @@ -61,15 +62,27 @@ def tags_from_path(path: str) -> Set[str]:
tags.add(NON_EXECUTABLE)

# As an optimization, if we're able to read tags from the filename, then we
# don't peek at the file contents.
# don't peek at the file contents unless the file extension requires it
t = tags_from_filename(os.path.basename(path))
if len(t) > 0:
tags.update(t)
else:
if executable:
shebang = parse_shebang_from_file(path)
if len(shebang) > 0:
tags.update(tags_from_interpreter(shebang[0]))

ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.')
Comment on lines +69 to +70
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't love that this introduces a need to parse the extension on all files, but I don't see a way around that when the logic to determine the extra requirement is by extension. Thoughts?

Copy link
Contributor

@CAM-Gerlach CAM-Gerlach Mar 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just

Suggested change
ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.')
ext = os.path.splitext(path)[-1].lstrip('.')

Also, you could move this to the if statement, given the other change I suggest below, though that would only avoid it in a subset of cases.

There isn't really any way I can see around it given this design, but you can at least parse and check it in only one place instead of two, as is currently being done (as my suggestions implement).

if (
not len(t) and executable
) or ext in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK:
try:
tags.update(extensions.EXTENSIONS_NEED_INTERPRETER_CHECK[ext])
except KeyError:
pass

Comment on lines +74 to +78
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
try:
tags.update(extensions.EXTENSIONS_NEED_INTERPRETER_CHECK[ext])
except KeyError:
pass

I would think it would both make more sense and be more efficient to add the extension-based tags in the same function the others are added in, tags_from_filename() around line 116, just like is done with EXTENSIONS_NEED_BINARY_CHECK, instead of a bespoke code block here that fires for every unknown file and follows a different pattern than the others. This also means you can just do os.path.splitext(path)[-1].lstrip('.') right in the if statement for only those files that need it (saving a bit of performance at the cost of readability).

shebang = parse_shebang_from_file(path)
if len(shebang) > 0:
tags.update(
tags_from_interpreter(
shebang[0].split('|')[0].strip(),
),
)
Comment on lines +81 to +85
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The splitting of the | delimited interpreters may be better located in the _shebang_split function. I can move it there if this looks good.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the | convention to separate multiple interpreters is non-standard and only used with one particular file type, I don't know about trying to parse it on all files. I can't find much Googling about other uses, but the one usage I can find has different syntax than salt uses. But that's ultimately up to @asottile .


# some extensions can be both binary and text
# see EXTENSIONS_NEED_BINARY_CHECK
Expand Down Expand Up @@ -128,7 +141,7 @@ def is_text(bytesio: IO[bytes]) -> bool:
text_chars = (
bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
bytearray(range(0x20, 0x7F)) +
bytearray(range(0x80, 0X100))
bytearray(range(0x80, 0x100))
)
return not bool(bytesio.read(1024).translate(None, text_chars))

Expand All @@ -153,8 +166,8 @@ def _shebang_split(line: str) -> List[str]:


def _parse_nix_shebang(
bytesio: IO[bytes],
cmd: Tuple[str, ...],
bytesio: IO[bytes],
cmd: Tuple[str, ...],
) -> Tuple[str, ...]:
while bytesio.read(2) == b'#!':
next_line_b = bytesio.readline()
Expand Down Expand Up @@ -203,7 +216,14 @@ def parse_shebang_from_file(path: str) -> Tuple[str, ...]:
"""Parse the shebang given a file path."""
if not os.path.lexists(path):
raise ValueError(f'{path} does not exist.')
if not os.access(path, os.X_OK):
ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.')
if (
ext not in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK and
not os.access(
path,
os.X_OK,
)
):
Comment on lines +219 to +226
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the other place the extension is needed. Should it be passed to the function instead of re-parsed?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest eliminating this check entirely unless there's something I'm missing (which is entirely possible); see below.

return ()

Comment on lines +219 to 228
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.')
if (
ext not in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK and
not os.access(
path,
os.X_OK,
)
):
return ()

@asottile The only function that calls this already checks this same condition before running parse_shebang_from_file(), so is there a reason it needs to be checked twice? There might be something I'm missing, but if not, eliminating the redundant check would reduce complexity and increase performance without an obvious downside, and it also prevents any other third-party consumers from using it on files not marked executable if that restriction doesn't make sense for their application.

try:
Expand Down Expand Up @@ -257,7 +277,7 @@ def license_id(filename: str) -> Optional[str]:
return spdx

# skip the slow calculation if the lengths are very different
if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
if norm and abs(len(norm) - len(norm_license)) / len(norm) > 0.05:
continue
Comment on lines 279 to 281
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't make a few of these changes, they were done via precommit or black.. Maybe the precommit config changed recently?


edit_dist = editdistance.eval(norm, norm_license)
Expand All @@ -266,7 +286,7 @@ def license_id(filename: str) -> Optional[str]:
min_edit_dist_spdx = spdx

# if there's less than 5% edited from the license, we found our match
if norm and min_edit_dist / len(norm) < .05:
if norm and min_edit_dist / len(norm) < 0.05:
return min_edit_dist_spdx
else:
# no matches :'(
Expand Down
6 changes: 6 additions & 0 deletions identify/interpreters.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,21 @@
'csh': {'shell', 'csh'},
'dash': {'shell', 'dash'},
'expect': {'expect'},
'gpg': {'gnupg'},
'jinja': {'jinja'},
'ksh': {'shell', 'ksh'},
'node': {'javascript'},
'nodejs': {'javascript'},
'perl': {'perl'},
'py': {'python'},
'pydsl': {'python'},
'pyobjects': {'python'},
'python': {'python'},
'python2': {'python', 'python2'},
'python3': {'python', 'python3'},
'ruby': {'ruby'},
'sh': {'shell', 'sh'},
'tcsh': {'shell', 'tcsh'},
'yaml': {'yaml'},
'zsh': {'shell', 'zsh'},
}
41 changes: 41 additions & 0 deletions tests/identify_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,47 @@ def test_tags_from_path_plist_text(tmpdir):
}


@pytest.mark.parametrize(
('interpreter', 'expected'),
(
('dson', {'salt', 'file', 'non-executable', 'text'}),
('genshi', {'salt', 'file', 'non-executable', 'text'}),
('gpg', {'salt', 'file', 'non-executable', 'text', 'gnupg'}),
('jinja', {'salt', 'file', 'non-executable', 'text', 'jinja'}),
('jinja|py', {'salt', 'file', 'non-executable', 'text', 'jinja'}),
('jinja|yaml', {'salt', 'file', 'non-executable', 'text', 'jinja'}),
(
'jinja|yaml|gpg', {
'salt', 'file', 'non-executable', 'text', 'jinja',
},
),
Comment on lines +157 to +161
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does have the benefit of files with the extension .gpg being marked as binary while files with the gpg interpreter still get marked as text.

Copy link
Author

@raddessi raddessi Mar 9, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I apologize, I quoted the wrong section. The relevant line is 153:

('gpg', {'salt', 'file', 'non-executable', 'text', 'gnupg'}),

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

('py', {'salt', 'file', 'non-executable', 'text', 'python'}),
('pydsl', {'salt', 'file', 'non-executable', 'text', 'python'}),
('pyobjects', {'salt', 'file', 'non-executable', 'text', 'python'}),
('wempy', {'salt', 'file', 'non-executable', 'text'}),
('yaml', {'salt', 'file', 'non-executable', 'text', 'yaml'}),
('yamlex', {'salt', 'file', 'non-executable', 'text'}),
('yaml|gpg', {'salt', 'file', 'non-executable', 'text', 'yaml'}),
),
)
@pytest.mark.parametrize(
('interpreter_prefix',),
(
('#!',),
('#! ',),
),
)
def test_tags_from_path_with_interpreter_check(
tmpdir,
interpreter_prefix,
interpreter,
expected,
):
x = tmpdir.join('test.sls')
x.write(interpreter_prefix + interpreter)
assert identify.tags_from_path(x.strpath) == expected


@pytest.mark.parametrize(
('filename', 'expected'),
(
Expand Down