-
-
Notifications
You must be signed in to change notification settings - Fork 149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added parsing for saltstack interpreted files #86
Changes from 8 commits
4504fdf
7617fa2
e74a495
74b5638
d5d039f
a3b69db
9e7bb37
96a56b9
11d2ce5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -33,6 +33,7 @@ | |||||||||||||||||||
_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS} | ||||||||||||||||||||
_ALL_TAGS.update(*extensions.EXTENSIONS.values()) | ||||||||||||||||||||
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values()) | ||||||||||||||||||||
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_INTERPRETER_CHECK.values()) | ||||||||||||||||||||
_ALL_TAGS.update(*extensions.NAMES.values()) | ||||||||||||||||||||
_ALL_TAGS.update(*interpreters.INTERPRETERS.values()) | ||||||||||||||||||||
ALL_TAGS = frozenset(_ALL_TAGS) | ||||||||||||||||||||
|
@@ -61,15 +62,27 @@ def tags_from_path(path: str) -> Set[str]: | |||||||||||||||||||
tags.add(NON_EXECUTABLE) | ||||||||||||||||||||
|
||||||||||||||||||||
# As an optimization, if we're able to read tags from the filename, then we | ||||||||||||||||||||
# don't peek at the file contents. | ||||||||||||||||||||
# don't peek at the file contents unless the file extension requires it | ||||||||||||||||||||
t = tags_from_filename(os.path.basename(path)) | ||||||||||||||||||||
if len(t) > 0: | ||||||||||||||||||||
tags.update(t) | ||||||||||||||||||||
else: | ||||||||||||||||||||
if executable: | ||||||||||||||||||||
shebang = parse_shebang_from_file(path) | ||||||||||||||||||||
if len(shebang) > 0: | ||||||||||||||||||||
tags.update(tags_from_interpreter(shebang[0])) | ||||||||||||||||||||
|
||||||||||||||||||||
ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.') | ||||||||||||||||||||
if ( | ||||||||||||||||||||
not len(t) and executable | ||||||||||||||||||||
) or ext in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK: | ||||||||||||||||||||
try: | ||||||||||||||||||||
tags.update(extensions.EXTENSIONS_NEED_INTERPRETER_CHECK[ext]) | ||||||||||||||||||||
except KeyError: | ||||||||||||||||||||
pass | ||||||||||||||||||||
|
||||||||||||||||||||
Comment on lines
+74
to
+78
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I would think it would both make more sense and be more efficient to add the extension-based tags in the same function the others are added in, |
||||||||||||||||||||
shebang = parse_shebang_from_file(path) | ||||||||||||||||||||
if len(shebang) > 0: | ||||||||||||||||||||
tags.update( | ||||||||||||||||||||
tags_from_interpreter( | ||||||||||||||||||||
shebang[0].split('|')[0].strip(), | ||||||||||||||||||||
), | ||||||||||||||||||||
) | ||||||||||||||||||||
Comment on lines
+81
to
+85
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The splitting of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the |
||||||||||||||||||||
|
||||||||||||||||||||
# some extensions can be both binary and text | ||||||||||||||||||||
# see EXTENSIONS_NEED_BINARY_CHECK | ||||||||||||||||||||
|
@@ -128,7 +141,7 @@ def is_text(bytesio: IO[bytes]) -> bool: | |||||||||||||||||||
text_chars = ( | ||||||||||||||||||||
bytearray([7, 8, 9, 10, 11, 12, 13, 27]) + | ||||||||||||||||||||
bytearray(range(0x20, 0x7F)) + | ||||||||||||||||||||
bytearray(range(0x80, 0X100)) | ||||||||||||||||||||
bytearray(range(0x80, 0x100)) | ||||||||||||||||||||
) | ||||||||||||||||||||
return not bool(bytesio.read(1024).translate(None, text_chars)) | ||||||||||||||||||||
|
||||||||||||||||||||
|
@@ -153,8 +166,8 @@ def _shebang_split(line: str) -> List[str]: | |||||||||||||||||||
|
||||||||||||||||||||
|
||||||||||||||||||||
def _parse_nix_shebang( | ||||||||||||||||||||
bytesio: IO[bytes], | ||||||||||||||||||||
cmd: Tuple[str, ...], | ||||||||||||||||||||
bytesio: IO[bytes], | ||||||||||||||||||||
cmd: Tuple[str, ...], | ||||||||||||||||||||
) -> Tuple[str, ...]: | ||||||||||||||||||||
while bytesio.read(2) == b'#!': | ||||||||||||||||||||
next_line_b = bytesio.readline() | ||||||||||||||||||||
|
@@ -203,7 +216,14 @@ def parse_shebang_from_file(path: str) -> Tuple[str, ...]: | |||||||||||||||||||
"""Parse the shebang given a file path.""" | ||||||||||||||||||||
if not os.path.lexists(path): | ||||||||||||||||||||
raise ValueError(f'{path} does not exist.') | ||||||||||||||||||||
if not os.access(path, os.X_OK): | ||||||||||||||||||||
ext = os.path.splitext(os.path.split(path)[-1])[-1].lstrip('.') | ||||||||||||||||||||
if ( | ||||||||||||||||||||
ext not in extensions.EXTENSIONS_NEED_INTERPRETER_CHECK and | ||||||||||||||||||||
not os.access( | ||||||||||||||||||||
path, | ||||||||||||||||||||
os.X_OK, | ||||||||||||||||||||
) | ||||||||||||||||||||
): | ||||||||||||||||||||
Comment on lines
+219
to
+226
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the other place the extension is needed.. should it be passed to the function instead of re-parsed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would suggest eliminating this check entirely unless there's something I'm missing (which is entirely possible); see below. |
||||||||||||||||||||
return () | ||||||||||||||||||||
|
||||||||||||||||||||
Comment on lines
+219
to
228
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
@asottile The only function that calls this already checks this same condition before running |
||||||||||||||||||||
try: | ||||||||||||||||||||
|
@@ -257,7 +277,7 @@ def license_id(filename: str) -> Optional[str]: | |||||||||||||||||||
return spdx | ||||||||||||||||||||
|
||||||||||||||||||||
# skip the slow calculation if the lengths are very different | ||||||||||||||||||||
if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05: | ||||||||||||||||||||
if norm and abs(len(norm) - len(norm_license)) / len(norm) > 0.05: | ||||||||||||||||||||
continue | ||||||||||||||||||||
Comment on lines
279
to
281
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't make a few of these changes, they were done via precommit or black.. Maybe the precommit config changed recently? |
||||||||||||||||||||
|
||||||||||||||||||||
edit_dist = editdistance.eval(norm, norm_license) | ||||||||||||||||||||
|
@@ -266,7 +286,7 @@ def license_id(filename: str) -> Optional[str]: | |||||||||||||||||||
min_edit_dist_spdx = spdx | ||||||||||||||||||||
|
||||||||||||||||||||
# if there's less than 5% edited from the license, we found our match | ||||||||||||||||||||
if norm and min_edit_dist / len(norm) < .05: | ||||||||||||||||||||
if norm and min_edit_dist / len(norm) < 0.05: | ||||||||||||||||||||
return min_edit_dist_spdx | ||||||||||||||||||||
else: | ||||||||||||||||||||
# no matches :'( | ||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -145,6 +145,47 @@ def test_tags_from_path_plist_text(tmpdir): | |
} | ||
|
||
|
||
@pytest.mark.parametrize( | ||
('interpreter', 'expected'), | ||
( | ||
('dson', {'salt', 'file', 'non-executable', 'text'}), | ||
('genshi', {'salt', 'file', 'non-executable', 'text'}), | ||
('gpg', {'salt', 'file', 'non-executable', 'text', 'gnupg'}), | ||
('jinja', {'salt', 'file', 'non-executable', 'text', 'jinja'}), | ||
('jinja|py', {'salt', 'file', 'non-executable', 'text', 'jinja'}), | ||
('jinja|yaml', {'salt', 'file', 'non-executable', 'text', 'jinja'}), | ||
( | ||
'jinja|yaml|gpg', { | ||
'salt', 'file', 'non-executable', 'text', 'jinja', | ||
}, | ||
), | ||
Comment on lines
+157
to
+161
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This does have the benefit of files with the extension There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I apologize, I quoted the wrong section. The relevant line is 153:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! |
||
('py', {'salt', 'file', 'non-executable', 'text', 'python'}), | ||
('pydsl', {'salt', 'file', 'non-executable', 'text', 'python'}), | ||
('pyobjects', {'salt', 'file', 'non-executable', 'text', 'python'}), | ||
('wempy', {'salt', 'file', 'non-executable', 'text'}), | ||
('yaml', {'salt', 'file', 'non-executable', 'text', 'yaml'}), | ||
('yamlex', {'salt', 'file', 'non-executable', 'text'}), | ||
('yaml|gpg', {'salt', 'file', 'non-executable', 'text', 'yaml'}), | ||
), | ||
) | ||
@pytest.mark.parametrize( | ||
('interpreter_prefix',), | ||
( | ||
('#!',), | ||
('#! ',), | ||
), | ||
) | ||
def test_tags_from_path_with_interpreter_check( | ||
tmpdir, | ||
interpreter_prefix, | ||
interpreter, | ||
expected, | ||
): | ||
x = tmpdir.join('test.sls') | ||
x.write(interpreter_prefix + interpreter) | ||
assert identify.tags_from_path(x.strpath) == expected | ||
|
||
|
||
@pytest.mark.parametrize( | ||
('filename', 'expected'), | ||
( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't love that this introduces a need to parse the extension on all files.. but I don't see a way around that when the logic to determine the extra requirement is by extension. Thoughts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not just
Also, you could move this to the if statement, given the other change I suggest below, though that would only avoid it in a subset of cases.
There isn't really any way I can see around it given this design, but you can at least only parse and check it once place instead of two, as is currently being done (as my suggestions implement).