From 4c0adaae4ac1bfe25086c5a066f17a10b880826a Mon Sep 17 00:00:00 2001 From: amcgrenera-vumc <56938070+amcgrenera-vumc@users.noreply.github.com> Date: Tue, 23 Mar 2021 15:00:22 -0500 Subject: [PATCH] False positives being shown in results files (#10) * [DC-1374] False positives being shown in results files * [DC-1374] simplified if statements * [DC-1374] simplified if statements * [DC-1374] simplified if statements * [DC-1374] results files uploaded after running tests successfully * [DC-1374] updated nullable fields --- omop_file_validator.py | 47 +++++------- resources/omop/death.json | 2 +- resources/omop/measurement.json | 2 +- resources/omop/observation.json | 2 +- resources/omop/specimen.json | 2 +- resources/omop/visit_occurrence.json | 4 +- .../examples_erroneous/device_exposure.csv | 3 + .../examples_erroneous/errors/results.csv | 13 +++- .../examples_erroneous/errors/results.html | 74 ++++++++++++++++--- tests/test_reporter.py | 6 +- 10 files changed, 105 insertions(+), 50 deletions(-) create mode 100755 tests/resources/examples_erroneous/device_exposure.csv diff --git a/omop_file_validator.py b/omop_file_validator.py index 1b3c396..ec53cf6 100644 --- a/omop_file_validator.py +++ b/omop_file_validator.py @@ -5,6 +5,7 @@ import os import codecs import pandas as pd +import numpy as np import csv import json import datetime @@ -359,21 +360,19 @@ def run_checks(file_path, f): submission_column, meta_column_type, submission_column_type, df) if error_row_index: - e = dict(message=MSG_INVALID_TYPE + - " line number " + - str(error_row_index + 1), - column_name=submission_column, - actual=df[submission_column] - [error_row_index], - expected=meta_column_type) - result['errors'].append(e) + df = df.replace(np.nan, '') + if not (df[submission_column][error_row_index] == '' and not meta_column_required): + e = dict(message=MSG_INVALID_TYPE + + " line number " + + str(error_row_index + 1), + column_name=submission_column, + actual=df[submission_column] + 
[error_row_index], + expected=meta_column_type) + result['errors'].append(e) # Check that date format is in the YYYY-MM-DD or YYYY-MM-DD hh:mm:ss format if meta_column_type in ('date', 'timestamp'): - invalid_indices = [] - invalid_date_strings = [] - - patterns = [] fmt = '' err_msg = '' @@ -388,27 +387,19 @@ def run_checks(file_path, f): for idx, value in df[submission_column].iteritems( ): + df = df.replace(np.nan, '') if not any( list( map( lambda pattern: date_format_valid( - pattern, str(value), fmt), - patterns))): - invalid_indices.append(idx + 1) - invalid_date_strings.append(str(value)) - - invalid_indices = [ - str(idx) for idx in invalid_indices - ] - if invalid_indices: - line_num_str = 'line numbers' if len( - invalid_indices) > 1 else 'line number' - e = dict( - message= - f"{err_msg}: {line_num_str} ({','.join(invalid_indices)})", - column_name=submission_column) - result['errors'].append(e) + pattern, str(value), fmt), patterns))): + if not (value == '' and not meta_column_required): + e = dict(message=err_msg + " line number " + str(idx + 1), + column_name=submission_column, + actual=value, + expected=meta_column_type) + result['errors'].append(e) # Check if any nulls present in a required field if meta_column_required and df[submission_column].isnull( diff --git a/resources/omop/death.json b/resources/omop/death.json index 99bddfb..e4658c8 100644 --- a/resources/omop/death.json +++ b/resources/omop/death.json @@ -14,7 +14,7 @@ { "type": "timestamp", "name": "death_datetime", - "mode": "required", + "mode": "nullable", "description": "The date and time the person was deceased. If the precise date including day or month is not known or not allowed, December is used as the default month, and the last day of the month the default day." 
}, { diff --git a/resources/omop/measurement.json b/resources/omop/measurement.json index b5e610c..c66ab47 100644 --- a/resources/omop/measurement.json +++ b/resources/omop/measurement.json @@ -26,7 +26,7 @@ { "type": "timestamp", "name": "measurement_datetime", - "mode": "required", + "mode": "nullable", "description": "The date and time of the Measurement. Some database systems don't have a datatype of time. To accomodate all temporal analyses, datatype datetime can be used (combining measurement_date and measurement_time [forum discussion](http://forums.ohdsi.org/t/date-time-and-datetime-problem-and-the-world-of-hours-and-1day/314))" }, { diff --git a/resources/omop/observation.json b/resources/omop/observation.json index 9cba51a..e7c2ead 100644 --- a/resources/omop/observation.json +++ b/resources/omop/observation.json @@ -26,7 +26,7 @@ { "type": "timestamp", "name": "observation_datetime", - "mode": "required", + "mode": "nullable", "description": "The date and time of the observation." }, { diff --git a/resources/omop/specimen.json b/resources/omop/specimen.json index d60ef73..9c6e6eb 100644 --- a/resources/omop/specimen.json +++ b/resources/omop/specimen.json @@ -32,7 +32,7 @@ { "type": "timestamp", "name": "specimen_datetime", - "mode": "required", + "mode": "nullable", "description": "The date and time on the date when the Specimen was obtained from the person." }, { diff --git a/resources/omop/visit_occurrence.json b/resources/omop/visit_occurrence.json index a6805a4..cbd7dc0 100644 --- a/resources/omop/visit_occurrence.json +++ b/resources/omop/visit_occurrence.json @@ -26,7 +26,7 @@ { "type": "timestamp", "name": "visit_start_datetime", - "mode": "required", + "mode": "nullable", "description": "The date and time of the visit started." }, { @@ -38,7 +38,7 @@ { "type": "timestamp", "name": "visit_end_datetime", - "mode": "required", + "mode": "nullable", "description": "The date and time of the visit end." 
}, { diff --git a/tests/resources/examples_erroneous/device_exposure.csv b/tests/resources/examples_erroneous/device_exposure.csv new file mode 100755 index 0000000..77ec952 --- /dev/null +++ b/tests/resources/examples_erroneous/device_exposure.csv @@ -0,0 +1,3 @@ +"device_exposure_id","person_id","device_concept_id","device_exposure_start_date","device_exposure_start_datetime","device_exposure_end_date","device_exposure_end_datetime","device_type_concept_id","unique_device_id","quantity","provider_id","visit_occurrence_id","device_source_value","device_source_concept_id" +11,2,31,2020-01-01,2020-01-01 01:00:00,,,4,,5,6,7,Supply/PACK BASIC:Supply/PACK BASIC, +12,2,32,2020-02-02,2020-02-02 02:00:00,2020-02-03,2020-02-03 03:00:00,4,, , , 8,"77334:PR 77334 TREATMENT DEVICES, DESIGN AND CONST",10 \ No newline at end of file diff --git a/tests/resources/examples_erroneous/errors/results.csv b/tests/resources/examples_erroneous/errors/results.csv index 24f933b..533dfce 100644 --- a/tests/resources/examples_erroneous/errors/results.csv +++ b/tests/resources/examples_erroneous/errors/results.csv @@ -13,12 +13,19 @@ "death.csv","Death","Incorrect number of columns on line 3: ['', '', '', '', '', '']","","","" "death.csv","Death","File contains blank lines on lines 1,2. If there is no data, please only submit the header line.","","","" "measurement.csv","Measurement","NULL values are not allowed for column","person_id","","" -"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,2,3,4,5)","measurement_datetime","","" +"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 1","measurement_datetime","2009-02-02 05:00:00.000000 UTC","timestamp" +"measurement.csv","Measurement","Invalid timestamp format. 
Expecting ""YYYY-MM-DD hh:mm:ss"" line number 2","measurement_datetime","2009-04-25 05:00:00.000000 UTC","timestamp" +"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 3","measurement_datetime","2009-04-25 05:00:00.000000 UTC","timestamp" +"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 4","measurement_datetime","2008-09-02 05:00:00.000000 UTC","timestamp" +"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 5","measurement_datetime","2008-09-02 05:00:00.000000 UTC","timestamp" "person.csv","Person","Please add/fix incorrect headers at the top of the file, enclosed in double quotes","","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'birth_datetime', 'day_of_birth', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'day_of_birth', 'birth_datetime', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']" "person.csv","Person","Column not in expected order","birth_datetime","birth_datetime","day_of_birth" "observation.csv","Observation","Scientific notation value '7.23e7' was found on line 2. Scientific notation is not allowed for integer fields.","observation_id","","" "observation.csv","Observation","Scientific notation value '7.47e8' was found on line 2. 
Scientific notation is not allowed for integer fields.","observation_type_concept_id","","" "observation.csv","Observation","Type mismatch line number 5","observation_id","23.890+11","integer" -"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"": line numbers (4,5)","observation_date","","" -"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,3,5)","observation_datetime","","" +"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"" line number 4","observation_date","1975-9-5","date" +"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"" line number 5","observation_date","01-31-1963","date" +"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 1","observation_datetime","1989-10-07 22:13:11.1533266","timestamp" +"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 3","observation_datetime","1987-04-15 06:06:30.6260548","timestamp" +"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"" line number 5","observation_datetime","1983-05-13 15:31:52.3905384","timestamp" "observation.csv","Observation","Type mismatch line number 3","observation_type_concept_id","unknown","integer" diff --git a/tests/resources/examples_erroneous/errors/results.html b/tests/resources/examples_erroneous/errors/results.html index f85702f..88da164 100644 --- a/tests/resources/examples_erroneous/errors/results.html +++ b/tests/resources/examples_erroneous/errors/results.html @@ -181,10 +181,42 @@

Local File Validation Error Results

+ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -229,18 +261,42 @@

Local File Validation Error Results

Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 1 | measurement_datetime | 2009-02-02 05:00:00.000000 UTC | timestamp
measurement.csv | Measurement | Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 2 | measurement_datetime | 2009-04-25 05:00:00.000000 UTC | timestamp
measurement.csv | Measurement | Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 3 | measurement_datetime | 2009-04-25 05:00:00.000000 UTC | timestamp
measurement.csv | Measurement | Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 4 | measurement_datetime | 2008-09-02 05:00:00.000000 UTC | timestamp
measurement.csv | Measurement | Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 5 | measurement_datetime | 2008-09-02 05:00:00.000000 UTC | timestamp
person.csv
+ - - + + + + + + + + + + - + - - + + + + + + + + + + + + + + + + + + diff --git a/tests/test_reporter.py b/tests/test_reporter.py index d41b6cb..0009bb3 100644 --- a/tests/test_reporter.py +++ b/tests/test_reporter.py @@ -101,14 +101,12 @@ def test_error_list(self): f_name = "observation.csv" if self.assertIn("observation.csv", error_map): self.check_invalid_type(f_name, error_map[f_name][2]) - - self.check_invalid_date(f_name, error_map[f_name][0]) - self.check_invalid_timestamp(f_name, error_map[f_name][1]) + self.check_invalid_date(f_name, error_map[f_name][0]) + self.check_invalid_timestamp(f_name, error_map[f_name][1]) f_name = "measurement.csv" if self.assertIn("measurement.csv", error_map): self.check_required_value(f_name, error_map[f_name][0]) - if __name__ == '__main__': unittest.main()
Invalid date format. Expecting "YYYY-MM-DD" line number 4 | observation_date | 1975-9-5 | date
observation.csv | Observation | Invalid date format. Expecting "YYYY-MM-DD" line number 5 | observation_date | 01-31-1963 | date
observation.csv ObservationInvalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss": line numbers (1,3,5)Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 1 observation_datetime1989-10-07 22:13:11.1533266timestamp
observation.csv | Observation | Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 3 | observation_datetime | 1987-04-15 06:06:30.6260548 | timestamp
observation.csv | Observation | Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss" line number 5 | observation_datetime | 1983-05-13 15:31:52.3905384 | timestamp
observation.csv