def find_blank_lines(f):
    """Check for rows in a csv file with only empty values

    The content is parsed with ``pd.read_csv`` using its default options, so
    the first line is consumed as the header and is not counted. Any error
    raised by ``pd.read_csv`` while parsing propagates to the caller.

    :param f: A file object positioned at the start of the csv content
    :type f: file-like object
    :return: 1-based positions of the data rows (header excluded) whose
        values are all empty; an empty list when there are none
    :rtype: list
    """
    df = pd.read_csv(f)

    # Vectorized null test instead of a row-wise ``DataFrame.apply``: it
    # yields an empty mask (and therefore an empty result) when the file only
    # contains the header line, rather than a column-indexed result that
    # cannot be used to select rows.
    blank_mask = df.isnull().all(axis=1)

    # Number rows by position rather than by index label so the result does
    # not depend on what pandas picked as the frame's index.
    return [
        row_number
        for row_number, is_blank in enumerate(blank_mask, start=1)
        if is_blank
    ]
+ + result['errors'].append(dict(message=blank_lines_msg)) + return result + + f.seek(0) + # check columns if looks good process file if not _check_columns(cdm_column_names, csv_columns, result): return result diff --git a/tests/resources/examples_erroneous/death.csv b/tests/resources/examples_erroneous/death.csv new file mode 100644 index 0000000..b53712a --- /dev/null +++ b/tests/resources/examples_erroneous/death.csv @@ -0,0 +1,3 @@ +"person_id","death_date","death_datetime","death_type_concept_id","cause_concept_id","cause_source_value","cause_source_concept_id" +,,,,, +,,,,, \ No newline at end of file diff --git a/tests/resources/examples_erroneous/errors/results.csv b/tests/resources/examples_erroneous/errors/results.csv index 2cfbedf..c371fd6 100644 --- a/tests/resources/examples_erroneous/errors/results.csv +++ b/tests/resources/examples_erroneous/errors/results.csv @@ -9,8 +9,13 @@ "drug_exposure.csv","Drug Exposure","Column not in table definition","drug_id","drug_id","" "drug_exposure.csv","Drug Exposure","Column missing in file","person_id","","person_id" "drug_exposure.csv","Drug Exposure","Column not in expected order","drug_concept_id","drug_concept_id","person_id" -"measurement.csv","Measurement","Type mismatch line number 3","person_id","","integer" +"death.csv","Death","Incorrect number of columns on line 2: ['', '', '', '', '', '']","","","" +"death.csv","Death","Incorrect number of columns on line 3: ['', '', '', '', '', '']","","","" +"death.csv","Death","File contains blank lines on lines 1,2. If there is no data, please only submit the header line.","","","" "measurement.csv","Measurement","NULL values are not allowed for column","person_id","","" +"measurement.csv","Measurement","Invalid timestamp format. 
Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,2,3,4,5)","measurement_datetime","","" "person.csv","Person","Please add/fix incorrect headers at the top of the file, enclosed in double quotes","","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'birth_datetime', 'day_of_birth', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'day_of_birth', 'birth_datetime', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']" "person.csv","Person","Column not in expected order","birth_datetime","birth_datetime","day_of_birth" +"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"": line numbers (4,5)","observation_date","","" +"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,3,5)","observation_datetime","","" "observation.csv","Observation","Type mismatch line number 3","observation_type_concept_id","unknown","integer" diff --git a/tests/resources/examples_erroneous/errors/results.html b/tests/resources/examples_erroneous/errors/results.html index 443b9ee..043f7d3 100644 --- a/tests/resources/examples_erroneous/errors/results.html +++ b/tests/resources/examples_erroneous/errors/results.html @@ -146,19 +146,43 @@

Local File Validation Error Results

+ @@ -178,6 +202,22 @@

Local File Validation Error Results

measurement_datetime
+ + + + + + + + + + + +
observation_date
observation.csvObservationInvalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss": line numbers (1,3,5)observation_datetime
observation.csv Observation