Skip to content

Commit

Permalink
DIG-1819: Warn instead of error when reference date missing & add val…
Browse files Browse the repository at this point in the history
…idation to required fields (#86)

* add warning for missing reference date (instead of error)

* blank dodiag and update to latest schema

* update test

* fix specimen validation

* only write warnings json if there are warnings

* add warnings for unconditional required fields

* add warning to validation file

* update tests

* cleanup

---------

Co-authored-by: Javier Castillo-Arnemann <[email protected]>
Co-authored-by: Javi <[email protected]>
  • Loading branch information
3 people authored Nov 8, 2024
1 parent 3ae6c6f commit b57146f
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 53 deletions.
9 changes: 6 additions & 3 deletions src/clinical_etl/CSVConvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,9 +766,12 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver
else:
json.dump(result, f, indent=4)
errors_present = False
with open(f"{input_path}_validation_results.json", 'w') as f:
json.dump(validation_results, f, indent=4)
print(f"Warnings written to {input_path}_validation_results.json.")
if len(validation_results["validation_errors"]) == 0 and len(validation_results["validation_warnings"]) == 0:
print(f"{Bcolors.OKGREEN}Validation passed!{Bcolors.ENDC}")
else:
with open(f"{input_path}_validation_results.json", 'w') as f:
json.dump(validation_results, f, indent=4)
print(f"Warnings written to {input_path}_validation_results.json.")
if len(validation_results["validation_warnings"]) > 0:
if len(validation_results["validation_warnings"]) > 20:
print(f"\n{Bcolors.WARNING}WARNING: There are {len(validation_results['validation_warnings'])} validation "
Expand Down
30 changes: 20 additions & 10 deletions src/clinical_etl/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import datetime
import math
from dateutil import relativedelta
import copy

VERBOSE = False
MODULES = {}
Expand Down Expand Up @@ -70,21 +71,27 @@ def earliest_date(data_values):
"""
fields = list(data_values.keys())
date_resolution = list(data_values[fields[0]].values())[0]
dates = list(data_values[fields[1]].values())[0]
dates = copy.deepcopy(list(data_values[fields[1]].values())[0])
earliest = DEFAULT_DATE_PARSER.get_date_data(str(datetime.date.today()))
# Ensure dates is a list, not a string, to allow non-indexed, single value entries.
if type(dates) is not list:
dates_list = [dates]
else:
dates_list = dates
for date in dates_list:
d = DEFAULT_DATE_PARSER.get_date_data(date)
if d['date_obj'] < earliest['date_obj']:
earliest = d
return {
"offset": earliest['date_obj'].strftime("%Y-%m-%d"),
"period": date_resolution
}
# If there's a None value, ignore it
if None in dates_list:
dates_list = [x for x in dates_list if x is not None]
if len(dates_list) > 0:
for date in dates_list:
d = DEFAULT_DATE_PARSER.get_date_data(date)
if d['date_obj'] < earliest['date_obj']:
earliest = d
return {
"offset": earliest['date_obj'].strftime("%Y-%m-%d"),
"period": date_resolution
}
else:
return None


def date_interval(data_values):
Expand All @@ -100,7 +107,9 @@ def date_interval(data_values):
try:
reference = INDEXED_DATA["data"]["CALCULATED"][IDENTIFIER]["REFERENCE_DATE"][0]
except KeyError:
raise MappingError("No reference date found to calculate date_interval: is there a reference_date specified in the manifest?", field_level=1)
_warn(message="No reference date found to calculate date_interval: check the reference_date is specified in the manifest or if it is missing for this donor",
input_values=data_values)
return None
DEFAULT_DATE_PARSER = dateparser.DateDataParser(
settings={"PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": DATE_FORMAT}
)
Expand Down Expand Up @@ -578,3 +587,4 @@ def _parse_date(date_string):
except Exception as e:
raise MappingError(f"error in date({date_string}): {type(e)} {e}", field_level=2)
return date_string

62 changes: 54 additions & 8 deletions src/clinical_etl/mohschemav3.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


"""
A class for the representation of a DonorWithClinicalData (MoHCCN data model v2) object in Katsu.
A class for the representation of a DonorWithClinicalData (MoHCCN data model v3) object in Katsu.
"""

class MoHSchemaV3(BaseSchema):
Expand Down Expand Up @@ -164,6 +164,10 @@ class MoHSchemaV3(BaseSchema):
}

def validate_donors(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["donors"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
match prop:
case "is_deceased":
Expand Down Expand Up @@ -283,9 +287,15 @@ def validate_donors(self, map_json):
self.warn("test_date is required for biomarkers not associated with nested events")

def validate_primary_diagnoses(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
if "date_of_diagnosis" in missing:
self.warn("date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient")
missing.remove("date_of_diagnosis")
for f in missing:
if f in self.validation_schema["primary_diagnoses"]["required_fields"]:
self.warn(f"{f} is a required field")
if "clinical_tumour_staging_system" not in map_json and "pathological_tumour_staging_system" not in map_json:
self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required")

for prop in map_json:
if prop == "clinical_tumour_staging_system":
self.validate_staging_system(map_json, "clinical")
Expand All @@ -307,8 +317,12 @@ def validate_staging_system(self, map_json, staging_type):
self.warn(f"{staging_type}_stage_group is required for {staging_type}_tumour_staging_system {map_json[f'{staging_type}_tumour_staging_system']}")

def validate_specimens(self, map_json):
if "samples" in map_json:
for sample in map_json["samples"]:
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["specimens"]["required_fields"]:
self.warn(f"{f} is a required field")
if "sample_registrations" in map_json:
for sample in map_json["sample_registrations"]:
if "tumour_normal_designation" in sample and sample["tumour_normal_designation"] == "Tumour":
required_fields = [
"reference_pathology_confirmed_diagnosis",
Expand All @@ -323,10 +337,16 @@ def validate_specimens(self, map_json):
self.warn(f"Tumour specimens require a {f}")

def validate_sample_registrations(self, map_json):
# there aren't any additional validations here
return
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["sample_registrations"]["required_fields"]:
self.warn(f"{f} is a required field")

def validate_treatments(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["treatments"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "treatment_type" and map_json["treatment_type"] is not None:
for t_type in map_json["treatment_type"]:
Expand Down Expand Up @@ -370,6 +390,10 @@ def validate_treatments(self, map_json):
self.fail("Systemic therapy end date cannot be after its treatment end date.")

def validate_systemic_therapies(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["systemic_therapies"]["required_fields"]:
self.warn(f"{f} is a required field")
if "drug_dose_units" not in map_json or map_json["drug_dose_units"] is None:
for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]:
if x in map_json and map_json[x] is not None:
Expand All @@ -391,16 +415,26 @@ def validate_systemic_therapies(self, map_json):
self.fail("Systemic therapy start cannot be after systemic therapy end.")

def validate_radiations(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["radiations"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "radiation_boost" and map_json["radiation_boost"] == "Yes":
if "reference_radiation_treatment_id" not in map_json or map_json["reference_radiation_treatment_id"] is None:
self.warn("reference_radiation_treatment_id required if radiation_boost = Yes")

def validate_surgeries(self, map_json):
# No validations needed (submitter_specimen_id removed in V3)
return
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["surgeries"]["required_fields"]:
self.warn(f"{f} is a required field")

def validate_followups(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["followups"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "disease_status_at_followup":
states = [
Expand All @@ -423,19 +457,31 @@ def validate_followups(self, map_json):
self.warn(f"anatomic_site_progression_or_recurrence is required if disease_status_at_followup is {map_json['disease_status_at_followup']}")

def validate_biomarkers(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["biomarkers"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
match prop:
case "hpv_pcr_status":
if map_json["hpv_pcr_status"] == "Positive" and "hpv_strain" not in map_json:
self.warn("If hpv_pcr_status is positive, hpv_strain is required")

def validate_comorbidities(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["comorbities"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "laterality_of_prior_malignancy":
if "prior_malignancy" not in map_json or map_json["prior_malignancy"] != "Yes":
self.fail("laterality_of_prior_malignancy should not be submitted unless prior_malignancy = Yes")

def validate_exposures(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["exposures"]["required_fields"]:
self.warn(f"{f} is a required field")
is_smoker = False
if "tobacco_smoking_status" not in map_json or map_json["tobacco_smoking_status"] is None:
self.warn("tobacco_smoking_status required for exposure")
Expand Down
10 changes: 5 additions & 5 deletions tests/raw_data/PrimaryDiagnosis.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
submitter_donor_id, primary_site, submitter_primary_diagnosis_id, date_of_diagnosis, cancer_type_code, basis_of_diagnosis, clinical_tumour_staging_system, clinical_t_category, clinical_n_category, clinical_m_category, clinical_stage_group, laterality, pathological_t_category, pathological_n_category, pathological_m_category, pathological_stage_group
DONOR_1,Esophagus,PD_1,1/1/2018,C43.1,Cytology,International Neuroblastoma Staging System,,,,Stage 1,Left,T3e,N1,MX,
DONOR_2,Eye and adnexa,PD_2,1/3/2020,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ
DONOR_2,Eye and adnexa,PD_2,,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ
DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIA
DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB
DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS
DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Cytology,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB
DONOR_4,Brain,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS
DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (R-ISS),T1,N0a,M0,,Left,,,,Stage IIBES
DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2018,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB
DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,,
DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2016,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB
DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,,
59 changes: 32 additions & 27 deletions tests/test_data_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,35 +84,40 @@ def test_donor_2(packets):
def test_validation(packets, schema):
schema.validate_ingest_map({"donors": packets})
print(schema.validation_warnings)
assert len(schema.validation_warnings) == 4
# should be the following 4 warnings:
# "DONOR_5: cause_of_death required if is_deceased = Yes",
# "DONOR_5: date_of_death required if is_deceased = Yes",
# "DONOR_5 > PD_5: clinical_stage_group is required for clinical_tumour_staging_system Revised International staging system (RISS)",
# "DONOR_5 > PD_5 > TR_10: treatment type Systemic therapy should have one or more systemic therapies submitted"

print(schema.validation_errors)

warnings = [
"DONOR_2 > PD_2: date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient",
"DONOR_3 > PD_3: basis_of_diagnosis is a required field",
"DONOR_5: cause_of_death required if is_deceased = Yes",
"DONOR_5: date_of_death required if is_deceased = Yes",
"DONOR_5 > PD_5: basis_of_diagnosis is a required field",
"DONOR_5 > PD_5: clinical_stage_group is required for clinical_tumour_staging_system Revised International staging system (R-ISS)",
"DONOR_5 > PD_5 > TR_5 > Radiation 0: radiation_therapy_dosage is a required field",
"DONOR_5 > PD_5 > TR_10: Treatment type Systemic therapy should have one or more systemic therapies submitted",
]
assert (sorted(schema.validation_warnings) == sorted(warnings))
assert len(schema.validation_warnings) == 8


# temporary: remove 'month_interval' errors:
non_interval_errors = []
for e in schema.validation_errors:
if "month_interval" not in e:
non_interval_errors.append(e)
schema.validation_errors = non_interval_errors

schema.validation_errors = [e for e in schema.validation_errors if "month_interval" not in e]

print(schema.validation_errors)
errors = [
"DONOR_2 > PD_2 > TR_2: Treatment start cannot be after treatment end.",
"DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
"DONOR_2 > PD_2 > TR_2: Systemic therapy start date cannot be earlier than its treatment start date.",
"DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
"DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.",
"DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas",
"DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.",
"DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date ",
"DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death ",
"DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes",
"Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times"
]
assert (sorted(schema.validation_errors) == sorted(errors))
assert len(schema.validation_errors) == 11
# should be the following 11 errors:
# "DONOR_2 > PD_2 > TR_2: Treatment start cannot be after treatment end.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy start date cannot be earlier than its treatment start date.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas",
# "DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.",
# "DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date ",
# "DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death ",
# "DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes",
# "Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times"


# there should be an item named DUPLICATE_ID in both followup and sample_registration
print(json.dumps(schema.identifiers, indent=2))
Expand Down

0 comments on commit b57146f

Please sign in to comment.