Skip to content

Commit

Permalink
[DC-1307] Add scientific notation check to aou-ehr-file-check script (#9)
Browse files Browse the repository at this point in the history

* changed comment

* changed error to show line number instead of error count

* added scientific notation value in error msg
  • Loading branch information
jp3477 authored Jan 19, 2021
1 parent e90056c commit f101f36
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 4 deletions.
43 changes: 42 additions & 1 deletion omop_file_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
'^\d{4}-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01]) ([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])$'
]

# Matches a whole value written in scientific notation, e.g. '7.23e7', '9e7'
# or '-1.5E-3'. Written as a raw string so the regex escapes (\d, \., \+)
# are not parsed as (invalid) Python string escapes.
SCIENTIFIC_NOTATION_REGEX = r"^(?:-?\d*)\.?\d+[eE][-\+]?\d+$"

csv.register_dialect('load',
quotechar='"',
doublequote=True,
Expand Down Expand Up @@ -86,7 +88,9 @@ def cast_type(cdm_column_type, value):
:return:
"""
if cdm_column_type in ('integer', 'int64'):
return int(value)
# Regex check only relevant if submission dtype is 'object'
if not re.match(SCIENTIFIC_NOTATION_REGEX, str(value)):
return int(value)
if cdm_column_type in ('character varying', 'text', 'string'):
return str(value)
if cdm_column_type == 'numeric':
Expand Down Expand Up @@ -176,6 +180,32 @@ def find_blank_lines(f):
return [i + 1 for i in indices]


def find_scientific_notation_errors(f, int_columns):
    """
    Report integer columns of a CSV submission that contain scientific notation.

    The file is read with every column as a string so that values such as
    '7.23e7' are seen exactly as submitted instead of being coerced to a
    number by pandas. Header names are lower-cased before being compared
    with `int_columns`. Only the first offending value of each column is
    reported.

    :param f: path or file object accepted by `pd.read_csv`; a file object
        is consumed by the read and is not rewound here
    :param int_columns: names of the columns to inspect; names that are not
        present in the file are ignored
    :return: list of dicts with `message` and `column_name` keys, one per
        offending column, in the order the columns appear in `int_columns`
    """
    df = pd.read_csv(f, dtype=str)
    df = df.rename(columns=str.lower)
    df = df[[col for col in int_columns if col in df.columns]]

    # Resolve the pattern once instead of once per cell.
    pattern = re.compile(SCIENTIFIC_NOTATION_REGEX)

    errors = []
    for col in df.columns:
        # Empty cells are read as missing values rather than strings, so they
        # are dropped before matching; dropna keeps the original row labels.
        # The reported line is the 1-based position of the data row as parsed
        # by pandas (the header row is not counted).
        first_hit = next(
            ((value, i + 1)
             for i, value in df[col].dropna().items()
             if pattern.match(value)),
            None)
        if first_hit is None:
            continue

        value, line_num = first_hit
        errors.append(
            dict(message=(
                f"Scientific notation value '{value}' was found on line {line_num}. "
                "Scientific notation is not allowed for integer fields."),
                 column_name=col))

    return errors


def check_csv_format(f, column_names):
results = []
idx = 1
Expand Down Expand Up @@ -287,6 +317,17 @@ def run_checks(file_path, f):
if not _check_columns(cdm_column_names, csv_columns, result):
return result

#search for scientific notation
int_columns = [
col['name'] for col in cdm_table_columns
if col['type'] == 'integer'
]
sci_not_errors = find_scientific_notation_errors(f, int_columns)
for sci_not_error in sci_not_errors:
result['errors'].append(sci_not_error)

f.seek(0)

# read file to be processed
df = pd.read_csv(f,
sep=',',
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/examples_erroneous/errors/results.csv
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,2,3,4,5)","measurement_datetime","",""
"person.csv","Person","Please add/fix incorrect headers at the top of the file, enclosed in double quotes","","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'birth_datetime', 'day_of_birth', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'day_of_birth', 'birth_datetime', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']"
"person.csv","Person","Column not in expected order","birth_datetime","birth_datetime","day_of_birth"
"observation.csv","Observation","Scientific notation value '7.23e7' was found on line 2. Scientific notation is not allowed for integer fields.","observation_id","",""
"observation.csv","Observation","Scientific notation value '7.47e8' was found on line 2. Scientific notation is not allowed for integer fields.","observation_type_concept_id","",""
"observation.csv","Observation","Type mismatch line number 5","observation_id","23.890+11","integer"
"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"": line numbers (4,5)","observation_date","",""
"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,3,5)","observation_datetime","",""
"observation.csv","Observation","Type mismatch line number 3","observation_type_concept_id","unknown","integer"
24 changes: 24 additions & 0 deletions tests/resources/examples_erroneous/errors/results.html
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,30 @@ <h1>Local File Validation Error Results</h1><table id="dataframe" style="width:8
<td>birth_datetime</td>
<td>day_of_birth</td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Scientific notation value '7.23e7' was found on line 2. Scientific notation is not allowed for integer fields.</td>
<td>observation_id</td>
<td></td>
<td></td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Scientific notation value '7.47e8' was found on line 2. Scientific notation is not allowed for integer fields.</td>
<td>observation_type_concept_id</td>
<td></td>
<td></td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Type mismatch line number 5</td>
<td>observation_id</td>
<td>23.890+11</td>
<td>integer</td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
Expand Down
6 changes: 3 additions & 3 deletions tests/resources/examples_erroneous/observation.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"observation_id","person_id","observation_concept_id","observation_date","observation_datetime","observation_type_concept_id","value_as_number","value_as_string","value_as_concept_id","qualifier_concept_id","unit_concept_id","provider_id","visit_occurrence_id","observation_source_value","observation_source_concept_id","unit_source_value","qualifier_source_value"
"6","6","6","1996-09-14","1989-10-07 22:13:11.1533266","6","832.66752717675058","GJTXJ2SIOOO92MMRZIG431F8W2AMP8GG8M","6","6","6","6","6","Add. Words","5","Internet","Internet"
"7","7","7","1990-06-03","1960-01-24 18:00:23","7","49.08448646268085","20RU9IJZEM71GNQAYFMCE4K1HR5TG781FEJQ8N7LK20MO1UY","7","","7","7","7","Internet","6","Sales","Add. Words"
"7.23e7","7","7","1990-06-03","1960-01-24 18:00:23","7.47e8","49.08448646268085","20RU9IJZEM71GNQAYFMCE4K1HR5TG781FEJQ8N7LK20MO1UY","7","","7","7","7","Internet","6","Sales","Add. Words"
"8","8","8","1959-09-12","1987-04-15 06:06:30.6260548","unknown","1253.5394752647446","Y","8","8","8","8","8","Sales","7","Sales","Previous Customer"
"9","9","9","1975-9-5","1982-02-26 12:35:59","9","1555.1072552591129","1LASYGPDHCPI7D6M1401B81SC4XCF","9","9","9","9","9","Previous Customer","8","Word of mouth","Sales"
"10","10","10","01-31-1963","1983-05-13 15:31:52.3905384","10","579.16600097863284","7UIP","10","10","10","10","10","Word of mouth","9","Previous Customer","Word of mouth"
"9e7","9","9","1975-9-5","1982-02-26 12:35:59","9","1555.1072552591129","1LASYGPDHCPI7D6M1401B81SC4XCF","9","9","9","9","9","Previous Customer","8","Word of mouth","Sales"
"23.890+11","10","10","01-31-1963","1983-05-13 15:31:52.3905384","10","579.16600097863284","7UIP","10","10","10","10","10","Word of mouth","9","Previous Customer","Word of mouth"

0 comments on commit f101f36

Please sign in to comment.