Skip to content

Commit

Permalink
False positives being shown in results files (#10)
Browse files Browse the repository at this point in the history
* [DC-1374] False positives being shown in results files

* [DC-1374] simplified if statements

* [DC-1374] simplified if statements

* [DC-1374] simplified if statements

* [DC-1374] results files uploaded after running tests successfully

* [DC-1374] updated nullable fields
  • Loading branch information
amcgrenera-vumc authored Mar 23, 2021
1 parent f101f36 commit 4c0adaa
Show file tree
Hide file tree
Showing 10 changed files with 105 additions and 50 deletions.
47 changes: 19 additions & 28 deletions omop_file_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import codecs
import pandas as pd
import numpy as np
import csv
import json
import datetime
Expand Down Expand Up @@ -359,21 +360,19 @@ def run_checks(file_path, f):
submission_column, meta_column_type,
submission_column_type, df)
if error_row_index:
e = dict(message=MSG_INVALID_TYPE +
" line number " +
str(error_row_index + 1),
column_name=submission_column,
actual=df[submission_column]
[error_row_index],
expected=meta_column_type)
result['errors'].append(e)
df = df.replace(np.nan, '')
if not (df[submission_column][error_row_index] == '' and not meta_column_required):
e = dict(message=MSG_INVALID_TYPE +
" line number " +
str(error_row_index + 1),
column_name=submission_column,
actual=df[submission_column]
[error_row_index],
expected=meta_column_type)
result['errors'].append(e)

# Check that date format is in the YYYY-MM-DD or YYYY-MM-DD hh:mm:ss format
if meta_column_type in ('date', 'timestamp'):
invalid_indices = []
invalid_date_strings = []

patterns = []
fmt = ''
err_msg = ''

Expand All @@ -388,27 +387,19 @@ def run_checks(file_path, f):

for idx, value in df[submission_column].iteritems(
):
df = df.replace(np.nan, '')
if not any(
list(
map(
lambda pattern:
date_format_valid(
pattern, str(value), fmt),
patterns))):
invalid_indices.append(idx + 1)
invalid_date_strings.append(str(value))

invalid_indices = [
str(idx) for idx in invalid_indices
]
if invalid_indices:
line_num_str = 'line numbers' if len(
invalid_indices) > 1 else 'line number'
e = dict(
message=
f"{err_msg}: {line_num_str} ({','.join(invalid_indices)})",
column_name=submission_column)
result['errors'].append(e)
pattern, str(value), fmt), patterns))):
if not (value == '' and not meta_column_required):
e = dict(message=err_msg + " line number " + str(idx + 1),
column_name=submission_column,
actual=value,
expected=meta_column_type)
result['errors'].append(e)

# Check if any nulls present in a required field
if meta_column_required and df[submission_column].isnull(
Expand Down
2 changes: 1 addition & 1 deletion resources/omop/death.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
{
"type": "timestamp",
"name": "death_datetime",
"mode": "required",
"mode": "nullable",
"description": "The date and time the person was deceased. If the precise date including day or month is not known or not allowed, December is used as the default month, and the last day of the month the default day."
},
{
Expand Down
2 changes: 1 addition & 1 deletion resources/omop/measurement.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
{
"type": "timestamp",
"name": "measurement_datetime",
"mode": "required",
"mode": "nullable",
"description": "The date and time of the Measurement. Some database systems don't have a datatype of time. To accommodate all temporal analyses, datatype datetime can be used (combining measurement_date and measurement_time [forum discussion](http://forums.ohdsi.org/t/date-time-and-datetime-problem-and-the-world-of-hours-and-1day/314))"
},
{
Expand Down
2 changes: 1 addition & 1 deletion resources/omop/observation.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
{
"type": "timestamp",
"name": "observation_datetime",
"mode": "required",
"mode": "nullable",
"description": "The date and time of the observation."
},
{
Expand Down
2 changes: 1 addition & 1 deletion resources/omop/specimen.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
{
"type": "timestamp",
"name": "specimen_datetime",
"mode": "required",
"mode": "nullable",
"description": "The date and time on the date when the Specimen was obtained from the person."
},
{
Expand Down
4 changes: 2 additions & 2 deletions resources/omop/visit_occurrence.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
{
"type": "timestamp",
"name": "visit_start_datetime",
"mode": "required",
"mode": "nullable",
"description": "The date and time the visit started."
},
{
Expand All @@ -38,7 +38,7 @@
{
"type": "timestamp",
"name": "visit_end_datetime",
"mode": "required",
"mode": "nullable",
"description": "The date and time the visit ended."
},
{
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/examples_erroneous/device_exposure.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"device_exposure_id","person_id","device_concept_id","device_exposure_start_date","device_exposure_start_datetime","device_exposure_end_date","device_exposure_end_datetime","device_type_concept_id","unique_device_id","quantity","provider_id","visit_occurrence_id","device_source_value","device_source_concept_id"
11,2,31,2020-01-01,2020-01-01 01:00:00,,,4,,5,6,7,Supply/PACK BASIC:Supply/PACK BASIC,
12,2,32,2020-02-02,2020-02-02 02:00:00,2020-02-03,2020-02-03 03:00:00,4,, , , 8,"77334:PR 77334 TREATMENT DEVICES, DESIGN AND CONST",10
13 changes: 10 additions & 3 deletions tests/resources/examples_erroneous/errors/results.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,19 @@
"death.csv","Death","Incorrect number of columns on line 3: ['', '', '', '', '', '']","","",""
"death.csv","Death","File contains blank lines on lines 1,2. If there is no data, please only submit the header line.","","",""
"measurement.csv","Measurement","NULL values are not allowed for column","person_id","",""
"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,2,3,4,5)","measurement_datetime","",""
"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 1","measurement_datetime","2009-02-02 05:00:00.000000 UTC","timestamp"
"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 2","measurement_datetime","2009-04-25 05:00:00.000000 UTC","timestamp"
"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 3","measurement_datetime","2009-04-25 05:00:00.000000 UTC","timestamp"
"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 4","measurement_datetime","2008-09-02 05:00:00.000000 UTC","timestamp"
"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 5","measurement_datetime","2008-09-02 05:00:00.000000 UTC","timestamp"
"person.csv","Person","Please add/fix incorrect headers at the top of the file, enclosed in double quotes","","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'birth_datetime', 'day_of_birth', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'day_of_birth', 'birth_datetime', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']"
"person.csv","Person","Column not in expected order","birth_datetime","birth_datetime","day_of_birth"
"observation.csv","Observation","Scientific notation value '7.23e7' was found on line 2. Scientific notation is not allowed for integer fields.","observation_id","",""
"observation.csv","Observation","Scientific notation value '7.47e8' was found on line 2. Scientific notation is not allowed for integer fields.","observation_type_concept_id","",""
"observation.csv","Observation","Type mismatch line number 5","observation_id","23.890+11","integer"
"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"": line numbers (4,5)","observation_date","",""
"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,3,5)","observation_datetime","",""
"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"" line number 4","observation_date","1975-9-5","date"
"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"" line number 5","observation_date","01-31-1963","date"
"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 1","observation_datetime","1989-10-07 22:13:11.1533266","timestamp"
"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 3","observation_datetime","1987-04-15 06:06:30.6260548","timestamp"
"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 5","observation_datetime","1983-05-13 15:31:52.3905384","timestamp"
"observation.csv","Observation","Type mismatch line number 3","observation_type_concept_id","unknown","integer"
74 changes: 65 additions & 9 deletions tests/resources/examples_erroneous/errors/results.html
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,42 @@ <h1>Local File Validation Error Results</h1><table id="dataframe" style="width:8
<tr>
<td>measurement.csv</td>
<td>Measurement</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss": line numbers (1,2,3,4,5)</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 1</td>
<td>measurement_datetime</td>
<td></td>
<td></td>
<td>2009-02-02 05:00:00.000000 UTC</td>
<td>timestamp</td>
</tr>
<tr>
<td>measurement.csv</td>
<td>Measurement</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 2</td>
<td>measurement_datetime</td>
<td>2009-04-25 05:00:00.000000 UTC</td>
<td>timestamp</td>
</tr>
<tr>
<td>measurement.csv</td>
<td>Measurement</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 3</td>
<td>measurement_datetime</td>
<td>2009-04-25 05:00:00.000000 UTC</td>
<td>timestamp</td>
</tr>
<tr>
<td>measurement.csv</td>
<td>Measurement</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 4</td>
<td>measurement_datetime</td>
<td>2008-09-02 05:00:00.000000 UTC</td>
<td>timestamp</td>
</tr>
<tr>
<td>measurement.csv</td>
<td>Measurement</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 5</td>
<td>measurement_datetime</td>
<td>2008-09-02 05:00:00.000000 UTC</td>
<td>timestamp</td>
</tr>
<tr>
<td>person.csv</td>
Expand Down Expand Up @@ -229,18 +261,42 @@ <h1>Local File Validation Error Results</h1><table id="dataframe" style="width:8
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Invalid date format. Expecting "YYYY-MM-DD": line numbers (4,5)</td>
<td>Invalid date format. Expecting "YYYY-MM-DD" line number 4</td>
<td>observation_date</td>
<td></td>
<td></td>
<td>1975-9-5</td>
<td>date</td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Invalid date format. Expecting "YYYY-MM-DD" line number 5</td>
<td>observation_date</td>
<td>01-31-1963</td>
<td>date</td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss": line numbers (1,3,5)</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 1</td>
<td>observation_datetime</td>
<td></td>
<td></td>
<td>1989-10-07 22:13:11.1533266</td>
<td>timestamp</td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 3</td>
<td>observation_datetime</td>
<td>1987-04-15 06:06:30.6260548</td>
<td>timestamp</td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 5</td>
<td>observation_datetime</td>
<td>1983-05-13 15:31:52.3905384</td>
<td>timestamp</td>
</tr>
<tr>
<td>observation.csv</td>
Expand Down
6 changes: 2 additions & 4 deletions tests/test_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,12 @@ def test_error_list(self):
f_name = "observation.csv"
if self.assertIn("observation.csv", error_map):
self.check_invalid_type(f_name, error_map[f_name][2])

self.check_invalid_date(f_name, error_map[f_name][0])
self.check_invalid_timestamp(f_name, error_map[f_name][1])
self.check_invalid_date(f_name, error_map[f_name][0])
self.check_invalid_timestamp(f_name, error_map[f_name][1])

f_name = "measurement.csv"
if self.assertIn("measurement.csv", error_map):
self.check_required_value(f_name, error_map[f_name][0])


if __name__ == '__main__':
unittest.main()

0 comments on commit 4c0adaa

Please sign in to comment.