Skip to content

Commit

Permalink
[DC-1309] Add Check for Blank Lines (#8)
Browse files Browse the repository at this point in the history
* added check for blank lines

* added function header

* displayed line numbers for empty lines

* ran checks on examples

* removed unnecessary scientific notation check
  • Loading branch information
jp3477 authored Jan 13, 2021
1 parent 2a38e50 commit e90056c
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 5 deletions.
28 changes: 28 additions & 0 deletions omop_file_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,22 @@ def find_error_in_file(column_name, cdm_column_type, submission_column_type,
return index


def find_blank_lines(f):
    """Find data rows in a csv file whose values are all empty

    The file is parsed with ``pd.read_csv`` using its default options and a
    row is reported when every one of its columns is null (e.g. ``,,,,,``).
    Reading starts at the current position of ``f`` and the position is not
    restored afterwards, so callers must ``seek`` before reusing the handle.

    NOTE(review): with pandas' default ``skip_blank_lines=True`` a completely
    empty line (no delimiters at all) is dropped by the parser, so it is
    neither reported nor counted here -- confirm this is intended.

    :param f: A file object positioned at the header line
    :type f: file-like object
    :return: 1-based positions of the data rows with all empty values
        (the header line is not counted, so the first row after the header
        is row 1); empty list when there are none or the file has no data
    :rtype: list
    """
    df = pd.read_csv(f)

    # Vectorized null check: one boolean per data row, True when every column
    # in that row is null. Unlike a row-wise ``apply`` this also yields a
    # well-formed (empty) boolean mask for a header-only file.
    blank_mask = df.isnull().all(axis=1)

    # Number rows by position rather than by index label so the result does
    # not depend on what pandas chose as the frame's index.
    return [
        row_number
        for row_number, is_blank in enumerate(blank_mask.tolist(), start=1)
        if is_blank
    ]


def check_csv_format(f, column_names):
results = []
idx = 1
Expand Down Expand Up @@ -255,6 +271,18 @@ def run_checks(file_path, f):
]
f.seek(0)

blank_lines = find_blank_lines(f)
if blank_lines:
blank_lines_str = ",".join(map(str, blank_lines))
line_str = 'lines' if len(blank_lines) > 1 else 'line'
blank_lines_msg = f'File contains blank {line_str} on {line_str} {blank_lines_str}. ' \
'If there is no data, please only submit the header line.'

result['errors'].append(dict(message=blank_lines_msg))
return result

f.seek(0)

# check columns if looks good process file
if not _check_columns(cdm_column_names, csv_columns, result):
return result
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/examples_erroneous/death.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"person_id","death_date","death_datetime","death_type_concept_id","cause_concept_id","cause_source_value","cause_source_concept_id"
,,,,,
,,,,,
7 changes: 6 additions & 1 deletion tests/resources/examples_erroneous/errors/results.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,13 @@
"drug_exposure.csv","Drug Exposure","Column not in table definition","drug_id","drug_id",""
"drug_exposure.csv","Drug Exposure","Column missing in file","person_id","","person_id"
"drug_exposure.csv","Drug Exposure","Column not in expected order","drug_concept_id","drug_concept_id","person_id"
"measurement.csv","Measurement","Type mismatch line number 3","person_id","","integer"
"death.csv","Death","Incorrect number of columns on line 2: ['', '', '', '', '', '']","","",""
"death.csv","Death","Incorrect number of columns on line 3: ['', '', '', '', '', '']","","",""
"death.csv","Death","File contains blank lines on lines 1,2. If there is no data, please only submit the header line.","","",""
"measurement.csv","Measurement","NULL values are not allowed for column","person_id","",""
"measurement.csv","Measurement","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,2,3,4,5)","measurement_datetime","",""
"person.csv","Person","Please add/fix incorrect headers at the top of the file, enclosed in double quotes","","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'birth_datetime', 'day_of_birth', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']","['person_id', 'gender_concept_id', 'year_of_birth', 'month_of_birth', 'day_of_birth', 'birth_datetime', 'race_concept_id', 'ethnicity_concept_id', 'location_id', 'provider_id', 'care_site_id', 'person_source_value', 'gender_source_value', 'gender_source_concept_id', 'race_source_value', 'race_source_concept_id', 'ethnicity_source_value', 'ethnicity_source_concept_id']"
"person.csv","Person","Column not in expected order","birth_datetime","birth_datetime","day_of_birth"
"observation.csv","Observation","Invalid date format. Expecting ""YYYY-MM-DD"": line numbers (4,5)","observation_date","",""
"observation.csv","Observation","Invalid timestamp format. Expecting ""YYYY-MM-DD hh:mm:ss"": line numbers (1,3,5)","observation_datetime","",""
"observation.csv","Observation","Type mismatch line number 3","observation_type_concept_id","unknown","integer"
48 changes: 44 additions & 4 deletions tests/resources/examples_erroneous/errors/results.html
Original file line number Diff line number Diff line change
Expand Up @@ -146,19 +146,43 @@ <h1>Local File Validation Error Results</h1><table id="dataframe" style="width:8
<td>drug_concept_id</td>
<td>person_id</td>
</tr>
<tr>
<td>death.csv</td>
<td>Death</td>
<td>Incorrect number of columns on line 2: ['', '', '', '', '', '']</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>death.csv</td>
<td>Death</td>
<td>Incorrect number of columns on line 3: ['', '', '', '', '', '']</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>death.csv</td>
<td>Death</td>
<td>File contains blank lines on lines 1,2. If there is no data, please only submit the header line.</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>measurement.csv</td>
<td>Measurement</td>
<td>Type mismatch line number 3</td>
<td>NULL values are not allowed for column</td>
<td>person_id</td>
<td></td>
<td>integer</td>
<td></td>
</tr>
<tr>
<td>measurement.csv</td>
<td>Measurement</td>
<td>NULL values are not allowed for column</td>
<td>person_id</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss": line numbers (1,2,3,4,5)</td>
<td>measurement_datetime</td>
<td></td>
<td></td>
</tr>
Expand All @@ -178,6 +202,22 @@ <h1>Local File Validation Error Results</h1><table id="dataframe" style="width:8
<td>birth_datetime</td>
<td>day_of_birth</td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Invalid date format. Expecting "YYYY-MM-DD": line numbers (4,5)</td>
<td>observation_date</td>
<td></td>
<td></td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
<td>Invalid timestamp format. Expecting "YYYY-MM-DD hh:mm:ss": line numbers (1,3,5)</td>
<td>observation_datetime</td>
<td></td>
<td></td>
</tr>
<tr>
<td>observation.csv</td>
<td>Observation</td>
Expand Down

0 comments on commit e90056c

Please sign in to comment.