Skip to content

Commit

Permalink
Added exception handling to code quality transform
Browse files Browse the repository at this point in the history
Signed-off-by: Parameswaran Selvam <[email protected]>
  • Loading branch information
Param-S committed Oct 26, 2024
1 parent 0711226 commit cdfb0a4
Showing 1 changed file with 37 additions and 15 deletions.
52 changes: 37 additions & 15 deletions transforms/code/code_quality/python/src/code_quality_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,25 +223,45 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab

contents = table.column(self.code_quality["contents_column_name"]).to_pylist()
languages = table.column(self.code_quality["language_column_name"]).to_pylist()
document_id = table.column("document_id").to_pylist()

failed_doc_ids = []
# loop over rows and compute filter stats
for i, c in enumerate(contents):
# compute lines statistics
stats = calculate_line_stats(c)
line_mean_values.append(stats["line_mean"])
line_max_values.append(stats["line_max"])
no_lines_values.append(stats["num_lines"])
avg_longest_lines_values.append(stats["avg_longest_lines"])

alphanum_frac_values.append(calculate_alpha_stats(c)["alphanum_frac"])
char_token_ratio_values.append(calculate_char_token_ratio(c, self.tokenizer)["char_token_ratio"])

is_autogenerated_values.append(is_autogenerated(c))
is_config_or_test_values.append(is_config_or_test(c))
has_no_keywords_values.append(has_no_keywords(c, languages[i]))
has_few_assignments_values.append(has_few_assignments(c, languages[i]))
is_xml_values.append(is_xml(c, languages[i]))
is_html_values.append(is_html(c, languages[i]))
try:
stats = calculate_line_stats(c)
line_mean_values.append(stats["line_mean"])
line_max_values.append(stats["line_max"])
no_lines_values.append(stats["num_lines"])
avg_longest_lines_values.append(stats["avg_longest_lines"])

alphanum_frac_values.append(calculate_alpha_stats(c)["alphanum_frac"])
char_token_ratio_values.append(calculate_char_token_ratio(c, self.tokenizer)["char_token_ratio"])

is_autogenerated_values.append(is_autogenerated(c))
is_config_or_test_values.append(is_config_or_test(c))
has_no_keywords_values.append(has_no_keywords(c, languages[i]))
has_few_assignments_values.append(has_few_assignments(c, languages[i]))
is_xml_values.append(is_xml(c, languages[i]))
is_html_values.append(is_html(c, languages[i]))
except Exception as e:
failed_doc_ids.append(document_id[i])
line_mean_values.append(0)
line_max_values.append(0)
no_lines_values.append(0)
avg_longest_lines_values.append(0)

alphanum_frac_values.append(0)
char_token_ratio_values.append(0)

is_autogenerated_values.append(False)
is_config_or_test_values.append(False)
has_no_keywords_values.append(False)
has_few_assignments_values.append(True)
is_xml_values.append(False)
is_html_values.append(False)


table = TransformUtils.add_column(table=table, name="line_mean", content=line_mean_values)
table = TransformUtils.add_column(table=table, name="line_max", content=line_max_values)
Expand All @@ -256,6 +276,8 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
table = TransformUtils.add_column(table=table, name="is_xml", content=is_xml_values)
table = TransformUtils.add_column(table=table, name="is_html", content=is_html_values)

if len(failed_doc_ids) > 0:
print(f"Failed docs: {failed_doc_ids} in {file_name}")
return [table], {}


Expand Down

0 comments on commit cdfb0a4

Please sign in to comment.