Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DIG-1819: Warn instead of error when reference date missing & add validation to required fields #86

Merged
merged 10 commits into from
Nov 8, 2024
9 changes: 6 additions & 3 deletions src/clinical_etl/CSVConvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,9 +766,12 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver
else:
json.dump(result, f, indent=4)
errors_present = False
with open(f"{input_path}_validation_results.json", 'w') as f:
json.dump(validation_results, f, indent=4)
print(f"Warnings written to {input_path}_validation_results.json.")
if len(validation_results["validation_errors"]) == 0 and len(validation_results["validation_warnings"]) == 0:
print(f"{Bcolors.OKGREEN}Validation passed!{Bcolors.ENDC}")
else:
with open(f"{input_path}_validation_results.json", 'w') as f:
json.dump(validation_results, f, indent=4)
print(f"Warnings written to {input_path}_validation_results.json.")
if len(validation_results["validation_warnings"]) > 0:
if len(validation_results["validation_warnings"]) > 20:
print(f"\n{Bcolors.WARNING}WARNING: There are {len(validation_results['validation_warnings'])} validation "
Expand Down
30 changes: 20 additions & 10 deletions src/clinical_etl/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import datetime
import math
from dateutil import relativedelta
import copy

VERBOSE = False
MODULES = {}
Expand Down Expand Up @@ -70,21 +71,27 @@ def earliest_date(data_values):
"""
fields = list(data_values.keys())
date_resolution = list(data_values[fields[0]].values())[0]
dates = list(data_values[fields[1]].values())[0]
dates = copy.deepcopy(list(data_values[fields[1]].values())[0])
earliest = DEFAULT_DATE_PARSER.get_date_data(str(datetime.date.today()))
# Ensure dates is a list, not a string, to allow non-indexed, single value entries.
if type(dates) is not list:
dates_list = [dates]
else:
dates_list = dates
for date in dates_list:
d = DEFAULT_DATE_PARSER.get_date_data(date)
if d['date_obj'] < earliest['date_obj']:
earliest = d
return {
"offset": earliest['date_obj'].strftime("%Y-%m-%d"),
"period": date_resolution
}
# If there's a None value, ignore it
if None in dates_list:
dates_list = [x for x in dates_list if x is not None]
if len(dates_list) > 0:
for date in dates_list:
d = DEFAULT_DATE_PARSER.get_date_data(date)
if d['date_obj'] < earliest['date_obj']:
earliest = d
return {
"offset": earliest['date_obj'].strftime("%Y-%m-%d"),
"period": date_resolution
}
else:
return None


def date_interval(data_values):
Expand All @@ -100,7 +107,9 @@ def date_interval(data_values):
try:
reference = INDEXED_DATA["data"]["CALCULATED"][IDENTIFIER]["REFERENCE_DATE"][0]
except KeyError:
raise MappingError("No reference date found to calculate date_interval: is there a reference_date specified in the manifest?", field_level=1)
_warn(message="No reference date found to calculate date_interval: check the reference_date is specified in the manifest or if it is missing for this donor",
input_values=data_values)
return None
DEFAULT_DATE_PARSER = dateparser.DateDataParser(
settings={"PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": DATE_FORMAT}
)
Expand Down Expand Up @@ -578,3 +587,4 @@ def _parse_date(date_string):
except Exception as e:
raise MappingError(f"error in date({date_string}): {type(e)} {e}", field_level=2)
return date_string

62 changes: 54 additions & 8 deletions src/clinical_etl/mohschemav3.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


"""
A class for the representation of a DonorWithClinicalData (MoHCCN data model v2) object in Katsu.
A class for the representation of a DonorWithClinicalData (MoHCCN data model v3) object in Katsu.
"""

class MoHSchemaV3(BaseSchema):
Expand Down Expand Up @@ -164,6 +164,10 @@ class MoHSchemaV3(BaseSchema):
}

def validate_donors(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["donors"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
match prop:
case "is_deceased":
Expand Down Expand Up @@ -283,9 +287,15 @@ def validate_donors(self, map_json):
self.warn("test_date is required for biomarkers not associated with nested events")

def validate_primary_diagnoses(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
if "date_of_diagnosis" in missing:
self.warn("date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient")
missing.remove("date_of_diagnosis")
for f in missing:
if f in self.validation_schema["primary_diagnoses"]["required_fields"]:
self.warn(f"{f} is a required field")
if "clinical_tumour_staging_system" not in map_json and "pathological_tumour_staging_system" not in map_json:
self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required")

for prop in map_json:
if prop == "clinical_tumour_staging_system":
self.validate_staging_system(map_json, "clinical")
Expand All @@ -307,8 +317,12 @@ def validate_staging_system(self, map_json, staging_type):
self.warn(f"{staging_type}_stage_group is required for {staging_type}_tumour_staging_system {map_json[f'{staging_type}_tumour_staging_system']}")

def validate_specimens(self, map_json):
if "samples" in map_json:
for sample in map_json["samples"]:
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["specimens"]["required_fields"]:
self.warn(f"{f} is a required field")
if "sample_registrations" in map_json:
for sample in map_json["sample_registrations"]:
if "tumour_normal_designation" in sample and sample["tumour_normal_designation"] == "Tumour":
required_fields = [
"reference_pathology_confirmed_diagnosis",
Expand All @@ -323,10 +337,16 @@ def validate_specimens(self, map_json):
self.warn(f"Tumour specimens require a {f}")

def validate_sample_registrations(self, map_json):
# there aren't any additional validations here
return
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["sample_registrations"]["required_fields"]:
self.warn(f"{f} is a required field")

def validate_treatments(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["treatments"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "treatment_type" and map_json["treatment_type"] is not None:
for t_type in map_json["treatment_type"]:
Expand Down Expand Up @@ -370,6 +390,10 @@ def validate_treatments(self, map_json):
self.fail("Systemic therapy end date cannot be after its treatment end date.")

def validate_systemic_therapies(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["systemic_therapies"]["required_fields"]:
self.warn(f"{f} is a required field")
if "drug_dose_units" not in map_json or map_json["drug_dose_units"] is None:
for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]:
if x in map_json and map_json[x] is not None:
Expand All @@ -391,16 +415,26 @@ def validate_systemic_therapies(self, map_json):
self.fail("Systemic therapy start cannot be after systemic therapy end.")

def validate_radiations(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["radiations"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "radiation_boost" and map_json["radiation_boost"] == "Yes":
if "reference_radiation_treatment_id" not in map_json or map_json["reference_radiation_treatment_id"] is None:
self.warn("reference_radiation_treatment_id required if radiation_boost = Yes")

def validate_surgeries(self, map_json):
# No validations needed (submitter_specimen_id removed in V3)
return
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["surgeries"]["required_fields"]:
self.warn(f"{f} is a required field")

def validate_followups(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["followups"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "disease_status_at_followup":
states = [
Expand All @@ -423,19 +457,31 @@ def validate_followups(self, map_json):
self.warn(f"anatomic_site_progression_or_recurrence is required if disease_status_at_followup is {map_json['disease_status_at_followup']}")

def validate_biomarkers(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["biomarkers"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
match prop:
case "hpv_pcr_status":
if map_json["hpv_pcr_status"] == "Positive" and "hpv_strain" not in map_json:
self.warn("If hpv_pcr_status is positive, hpv_strain is required")

def validate_comorbidities(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["comorbities"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "laterality_of_prior_malignancy":
if "prior_malignancy" not in map_json or map_json["prior_malignancy"] != "Yes":
self.fail("laterality_of_prior_malignancy should not be submitted unless prior_malignancy = Yes")

def validate_exposures(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["exposures"]["required_fields"]:
self.warn(f"{f} is a required field")
is_smoker = False
if "tobacco_smoking_status" not in map_json or map_json["tobacco_smoking_status"] is None:
self.warn("tobacco_smoking_status required for exposure")
Expand Down
10 changes: 5 additions & 5 deletions tests/raw_data/PrimaryDiagnosis.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
submitter_donor_id, primary_site, submitter_primary_diagnosis_id, date_of_diagnosis, cancer_type_code, basis_of_diagnosis, clinical_tumour_staging_system, clinical_t_category, clinical_n_category, clinical_m_category, clinical_stage_group, laterality, pathological_t_category, pathological_n_category, pathological_m_category, pathological_stage_group
DONOR_1,Esophagus,PD_1,1/1/2018,C43.1,Cytology,International Neuroblastoma Staging System,,,,Stage 1,Left,T3e,N1,MX,
DONOR_2,Eye and adnexa,PD_2,1/3/2020,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ
DONOR_2,Eye and adnexa,PD_2,,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ
DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIA
DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB
DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS
DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Cytology,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB
DONOR_4,Brain,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS
DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (R-ISS),T1,N0a,M0,,Left,,,,Stage IIBES
DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2018,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB
DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,,
DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2016,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB
DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,,
59 changes: 32 additions & 27 deletions tests/test_data_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,35 +84,40 @@ def test_donor_2(packets):
def test_validation(packets, schema):
schema.validate_ingest_map({"donors": packets})
print(schema.validation_warnings)
assert len(schema.validation_warnings) == 4
# should be the following 4 warnings:
# "DONOR_5: cause_of_death required if is_deceased = Yes",
# "DONOR_5: date_of_death required if is_deceased = Yes",
# "DONOR_5 > PD_5: clinical_stage_group is required for clinical_tumour_staging_system Revised International staging system (RISS)",
# "DONOR_5 > PD_5 > TR_10: treatment type Systemic therapy should have one or more systemic therapies submitted"

print(schema.validation_errors)

warnings = [
"DONOR_2 > PD_2: date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient",
"DONOR_3 > PD_3: basis_of_diagnosis is a required field",
"DONOR_5: cause_of_death required if is_deceased = Yes",
"DONOR_5: date_of_death required if is_deceased = Yes",
"DONOR_5 > PD_5: basis_of_diagnosis is a required field",
"DONOR_5 > PD_5: clinical_stage_group is required for clinical_tumour_staging_system Revised International staging system (R-ISS)",
"DONOR_5 > PD_5 > TR_5 > Radiation 0: radiation_therapy_dosage is a required field",
"DONOR_5 > PD_5 > TR_10: Treatment type Systemic therapy should have one or more systemic therapies submitted",
]
assert (sorted(schema.validation_warnings) == sorted(warnings))
assert len(schema.validation_warnings) == 8


# temporary: remove 'month_interval' errors:
non_interval_errors = []
for e in schema.validation_errors:
if "month_interval" not in e:
non_interval_errors.append(e)
schema.validation_errors = non_interval_errors

schema.validation_errors = [e for e in schema.validation_errors if "month_interval" not in e]

print(schema.validation_errors)
errors = [
"DONOR_2 > PD_2 > TR_2: Treatment start cannot be after treatment end.",
"DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
"DONOR_2 > PD_2 > TR_2: Systemic therapy start date cannot be earlier than its treatment start date.",
"DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
"DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.",
"DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas",
"DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.",
"DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date ",
"DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death ",
"DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes",
"Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times"
]
assert (sorted(schema.validation_errors) == sorted(errors))
assert len(schema.validation_errors) == 11
# should be the following 11 errors:
# "DONOR_2 > PD_2 > TR_2: Treatment start cannot be after treatment end.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy start date cannot be earlier than its treatment start date.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas",
# "DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.",
# "DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date ",
# "DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death ",
# "DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes",
# "Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times"


# there should be an item named DUPLICATE_ID in both followup and sample_registration
print(json.dumps(schema.identifiers, indent=2))
Expand Down
Loading