From f101f3671a975e1f7bea719697fdcb1dbb8706a2 Mon Sep 17 00:00:00 2001 From: jp3477 Date: Tue, 19 Jan 2021 14:00:38 -0500 Subject: [PATCH] [DC-1307] Add scientific notation check to aou-ehr-file-check script (#9) * changed comment * changed error to show line number instead of error count * added scientific notation value in error msg --- omop_file_validator.py | 43 ++++++++++++++++++- .../examples_erroneous/errors/results.csv | 3 ++ .../examples_erroneous/errors/results.html | 24 +++++++++++ .../examples_erroneous/observation.csv | 6 +-- 4 files changed, 72 insertions(+), 4 deletions(-) diff --git a/omop_file_validator.py b/omop_file_validator.py index 69d1a75..1b3c396 100644 --- a/omop_file_validator.py +++ b/omop_file_validator.py @@ -29,6 +29,8 @@ '^\d{4}-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01]) ([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])$' ] +SCIENTIFIC_NOTATION_REGEX = "^(?:-?\d*)\.?\d+[eE][-\+]?\d+$" + csv.register_dialect('load', quotechar='"', doublequote=True, @@ -86,7 +88,9 @@ def cast_type(cdm_column_type, value): :return: """ if cdm_column_type in ('integer', 'int64'): - return int(value) + # Regex check only relevant if submission dtype is 'object' + if not re.match(SCIENTIFIC_NOTATION_REGEX, str(value)): + return int(value) if cdm_column_type in ('character varying', 'text', 'string'): return str(value) if cdm_column_type == 'numeric': @@ -176,6 +180,32 @@ def find_blank_lines(f): return [i + 1 for i in indices] +def find_scientific_notation_errors(f, int_columns): + df = pd.read_csv(f, dtype=str) + df = df.rename(columns=str.lower) + df = df[[col for col in int_columns if col in df.columns]] + + errors = [] + sci_not_line = collections.defaultdict(int) + + for submission_col_name in df.columns: + submission_column = df[submission_col_name] + for i, value in submission_column.items(): + if pd.notnull(value) and re.match(SCIENTIFIC_NOTATION_REGEX, + value): + sci_not_line[submission_col_name] = (value, i + 1) + break + + for col, (value, line_num) in sci_not_line.items(): + e = dict(message=( + f"Scientific notation value '{value}' was found on line {line_num}. " + "Scientific notation is not allowed for integer fields."), + column_name=col) + errors.append(e) + + return errors + + def check_csv_format(f, column_names): results = [] idx = 1 @@ -287,6 +317,17 @@ def run_checks(file_path, f): if not _check_columns(cdm_column_names, csv_columns, result): return result + #search for scientific notation + int_columns = [ + col['name'] for col in cdm_table_columns + if col['type'] == 'integer' + ] + sci_not_errors = find_scientific_notation_errors(f, int_columns) + for sci_not_error in sci_not_errors: + result['errors'].append(sci_not_error) + + f.seek(0) + # read file to be processed df = pd.read_csv(f, sep=',', diff --git a/tests/resources/examples_erroneous/errors/results.csv b/tests/resources/examples_erroneous/errors/results.csv index c371fd6..24f933b 100644 --- a/tests/resources/examples_erroneous/errors/results.csv +++ b/tests/resources/examples_erroneous/errors/results.csv @@ -16,6 +16,9 @@ "measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,2,3,4,5)","measurement_datetime","","" "person.csv","Person","Please add/fix incorrect headers at the top of the file, enclosed in double quotes","","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'birth_datetime', 'day_of_birth', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'day_of_birth', 'birth_datetime', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']" "person.csv","Person","Column not in expected order","birth_datetime","birth_datetime","day_of_birth" +"observation.csv","Observation","Scientific notation value '7.23e7' was found on line 2. Scientific notation is not allowed for integer fields.","observation_id","","" +"observation.csv","Observation","Scientific notation value '7.47e8' was found on line 2. Scientific notation is not allowed for integer fields.","observation_type_concept_id","","" +"observation.csv","Observation","Type mismatch line number 5","observation_id","23.890+11","integer" "observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"": line numbers (4,5)","observation_date","","" "observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,3,5)","observation_datetime","","" "observation.csv","Observation","Type mismatch line number 3","observation_type_concept_id","unknown","integer" diff --git a/tests/resources/examples_erroneous/errors/results.html b/tests/resources/examples_erroneous/errors/results.html index 043f7d3..f85702f 100644 --- a/tests/resources/examples_erroneous/errors/results.html +++ b/tests/resources/examples_erroneous/errors/results.html @@ -202,6 +202,30 @@

Local File Validation Error Results