DIG-1819: Warn instead of error when reference date missing & add val…

…idation to required fields (#86) * add warning for missing reference date (instead of error) * blank dodiag and update to latest schema * update test * fix specimen validation * only write warnings json if there are warnings * add warnings for unconditional required fields * add warning to validation file * update tests * cleanup --------- Co-authored-by: Javier Castillo-Arnemann <[email protected]> Co-authored-by: Javi <[email protected]>
CanDIG · Nov 8, 2024 · b57146f · b57146f
1 parent 3ae6c6f
commit b57146f
Show file tree

Hide file tree

Showing 5 changed files with 117 additions and 53 deletions.
diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py
@@ -766,9 +766,12 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver
         else:
             json.dump(result, f, indent=4)
     errors_present = False
-    with open(f"{input_path}_validation_results.json", 'w') as f:
-        json.dump(validation_results, f, indent=4)
-    print(f"Warnings written to {input_path}_validation_results.json.")
+    if len(validation_results["validation_errors"]) == 0 and len(validation_results["validation_warnings"]) == 0:
+           print(f"{Bcolors.OKGREEN}Validation passed!{Bcolors.ENDC}")
+    else:
+        with open(f"{input_path}_validation_results.json", 'w') as f:
+            json.dump(validation_results, f, indent=4)
+        print(f"Warnings written to {input_path}_validation_results.json.")
     if len(validation_results["validation_warnings"]) > 0:
         if len(validation_results["validation_warnings"]) > 20:
             print(f"\n{Bcolors.WARNING}WARNING: There are {len(validation_results['validation_warnings'])} validation "

diff --git a/src/clinical_etl/mappings.py b/src/clinical_etl/mappings.py
@@ -4,6 +4,7 @@
 import datetime
 import math
 from dateutil import relativedelta
+import copy
 
 VERBOSE = False
 MODULES = {}
@@ -70,21 +71,27 @@ def earliest_date(data_values):
     """
     fields = list(data_values.keys())
     date_resolution = list(data_values[fields[0]].values())[0]
-    dates = list(data_values[fields[1]].values())[0]
+    dates = copy.deepcopy(list(data_values[fields[1]].values())[0])
     earliest = DEFAULT_DATE_PARSER.get_date_data(str(datetime.date.today()))
     # Ensure dates is a list, not a string, to allow non-indexed, single value entries.
     if type(dates) is not list:
         dates_list = [dates]
     else:
         dates_list = dates
-    for date in dates_list:
-        d = DEFAULT_DATE_PARSER.get_date_data(date)
-        if d['date_obj'] < earliest['date_obj']:
-            earliest = d
-    return {
-        "offset": earliest['date_obj'].strftime("%Y-%m-%d"),
-        "period": date_resolution
-    }
+    # If there's a None value, ignore it
+    if None in dates_list:
+        dates_list = [x for x in dates_list if x is not None]
+    if len(dates_list) > 0:
+        for date in dates_list:
+            d = DEFAULT_DATE_PARSER.get_date_data(date)
+            if d['date_obj'] < earliest['date_obj']:
+                earliest = d
+        return {
+            "offset": earliest['date_obj'].strftime("%Y-%m-%d"),
+            "period": date_resolution
+        }
+    else:
+        return None
 
 
 def date_interval(data_values):
@@ -100,7 +107,9 @@ def date_interval(data_values):
     try:
         reference = INDEXED_DATA["data"]["CALCULATED"][IDENTIFIER]["REFERENCE_DATE"][0]
     except KeyError:
-        raise MappingError("No reference date found to calculate date_interval: is there a reference_date specified in the manifest?", field_level=1)
+        _warn(message="No reference date found to calculate date_interval: check the reference_date is specified in the manifest or if it is missing for this donor",
+              input_values=data_values)
+        return None
     DEFAULT_DATE_PARSER = dateparser.DateDataParser(
         settings={"PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": DATE_FORMAT}
     )
@@ -578,3 +587,4 @@ def _parse_date(date_string):
         except Exception as e:
             raise MappingError(f"error in date({date_string}): {type(e)} {e}", field_level=2)
     return date_string
+
diff --git a/src/clinical_etl/mohschemav3.py b/src/clinical_etl/mohschemav3.py
@@ -4,7 +4,7 @@
 
 
 """
-A class for the representation of a DonorWithClinicalData (MoHCCN data model v2) object in Katsu.
+A class for the representation of a DonorWithClinicalData (MoHCCN data model v3) object in Katsu.
 """
 
 class MoHSchemaV3(BaseSchema):
@@ -164,6 +164,10 @@ class MoHSchemaV3(BaseSchema):
     }
 
     def validate_donors(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["donors"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         for prop in map_json:
             match prop:
                 case "is_deceased":
@@ -283,9 +287,15 @@ def validate_donors(self, map_json):
                             self.warn("test_date is required for biomarkers not associated with nested events")
 
     def validate_primary_diagnoses(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        if "date_of_diagnosis" in missing:
+            self.warn("date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient")
+            missing.remove("date_of_diagnosis")
+        for f in missing:
+            if f in self.validation_schema["primary_diagnoses"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         if "clinical_tumour_staging_system" not in map_json and "pathological_tumour_staging_system" not in map_json:
             self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required")
-
         for prop in map_json:
             if prop == "clinical_tumour_staging_system":
                 self.validate_staging_system(map_json, "clinical")
@@ -307,8 +317,12 @@ def validate_staging_system(self, map_json, staging_type):
                 self.warn(f"{staging_type}_stage_group is required for {staging_type}_tumour_staging_system {map_json[f'{staging_type}_tumour_staging_system']}")
 
     def validate_specimens(self, map_json):
-        if "samples" in map_json:
-            for sample in map_json["samples"]:
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["specimens"]["required_fields"]:
+                self.warn(f"{f} is a required field")
+        if "sample_registrations" in map_json:
+            for sample in map_json["sample_registrations"]:
                 if "tumour_normal_designation" in sample and sample["tumour_normal_designation"] == "Tumour":
                     required_fields = [
                         "reference_pathology_confirmed_diagnosis",
@@ -323,10 +337,16 @@ def validate_specimens(self, map_json):
                             self.warn(f"Tumour specimens require a {f}")
 
     def validate_sample_registrations(self, map_json):
-        # there aren't any additional validations here
-        return
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["sample_registrations"]["required_fields"]:
+                self.warn(f"{f} is a required field")
 
     def validate_treatments(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["treatments"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         for prop in map_json:
             if prop == "treatment_type" and map_json["treatment_type"] is not None:
                 for t_type in map_json["treatment_type"]:
@@ -370,6 +390,10 @@ def validate_treatments(self, map_json):
                                     self.fail("Systemic therapy end date cannot be after its treatment end date.")
 
     def validate_systemic_therapies(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["systemic_therapies"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         if "drug_dose_units" not in map_json or map_json["drug_dose_units"] is None:
             for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]:
                 if x in map_json and map_json[x] is not None:
@@ -391,16 +415,26 @@ def validate_systemic_therapies(self, map_json):
                         self.fail("Systemic therapy start cannot be after systemic therapy end.")
 
     def validate_radiations(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["radiations"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         for prop in map_json:
             if prop == "radiation_boost" and map_json["radiation_boost"] == "Yes":
                 if "reference_radiation_treatment_id" not in map_json or map_json["reference_radiation_treatment_id"] is None:
                     self.warn("reference_radiation_treatment_id required if radiation_boost = Yes")
 
     def validate_surgeries(self, map_json):
-        # No validations needed (submitter_specimen_id removed in V3)
-        return
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["surgeries"]["required_fields"]:
+                self.warn(f"{f} is a required field")
 
     def validate_followups(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["followups"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         for prop in map_json:
             if prop == "disease_status_at_followup":
                 states = [
@@ -423,19 +457,31 @@ def validate_followups(self, map_json):
                             self.warn(f"anatomic_site_progression_or_recurrence is required if disease_status_at_followup is {map_json['disease_status_at_followup']}")
 
     def validate_biomarkers(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["biomarkers"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         for prop in map_json:
             match prop:
                 case "hpv_pcr_status":
                     if map_json["hpv_pcr_status"] == "Positive" and "hpv_strain" not in map_json:
                         self.warn("If hpv_pcr_status is positive, hpv_strain is required")
 
     def validate_comorbidities(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["comorbities"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         for prop in map_json:
             if prop == "laterality_of_prior_malignancy":
                 if "prior_malignancy" not in map_json or map_json["prior_malignancy"] != "Yes":
                     self.fail("laterality_of_prior_malignancy should not be submitted unless prior_malignancy = Yes")
 
     def validate_exposures(self, map_json):
+        missing = {field for field, val in map_json.items() if val is None}
+        for f in missing:
+            if f in self.validation_schema["exposures"]["required_fields"]:
+                self.warn(f"{f} is a required field")
         is_smoker = False
         if "tobacco_smoking_status" not in map_json or map_json["tobacco_smoking_status"] is None:
             self.warn("tobacco_smoking_status required for exposure")

diff --git a/tests/raw_data/PrimaryDiagnosis.csv b/tests/raw_data/PrimaryDiagnosis.csv
@@ -1,9 +1,9 @@
 submitter_donor_id, primary_site, submitter_primary_diagnosis_id, date_of_diagnosis, cancer_type_code, basis_of_diagnosis, clinical_tumour_staging_system, clinical_t_category, clinical_n_category, clinical_m_category, clinical_stage_group, laterality, pathological_t_category, pathological_n_category, pathological_m_category, pathological_stage_group
 DONOR_1,Esophagus,PD_1,1/1/2018,C43.1,Cytology,International Neuroblastoma Staging System,,,,Stage 1,Left,T3e,N1,MX,
-DONOR_2,Eye and adnexa,PD_2,1/3/2020,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ
+DONOR_2,Eye and adnexa,PD_2,,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ
 DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIA
-DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB
-DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS
+DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Cytology,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB
+DONOR_4,Brain,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS
 DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (R-ISS),T1,N0a,M0,,Left,,,,Stage IIBES
-DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2018,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB
-DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,,
+DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2016,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB
+DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,,
diff --git a/tests/test_data_ingest.py b/tests/test_data_ingest.py
@@ -84,35 +84,40 @@ def test_donor_2(packets):
 def test_validation(packets, schema):
     schema.validate_ingest_map({"donors": packets})
     print(schema.validation_warnings)
-    assert len(schema.validation_warnings) == 4
-    # should be the following 4 warnings:
-    # "DONOR_5: cause_of_death required if is_deceased = Yes",
-    # "DONOR_5: date_of_death required if is_deceased = Yes",
-    # "DONOR_5 > PD_5: clinical_stage_group is required for clinical_tumour_staging_system Revised International staging system (RISS)",
-    # "DONOR_5 > PD_5 > TR_10: treatment type Systemic therapy should have one or more systemic therapies submitted"
-
-    print(schema.validation_errors)
-
+    warnings = [
+        "DONOR_2 > PD_2: date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient",
+        "DONOR_3 > PD_3: basis_of_diagnosis is a required field",
+        "DONOR_5: cause_of_death required if is_deceased = Yes",
+        "DONOR_5: date_of_death required if is_deceased = Yes",
+        "DONOR_5 > PD_5: basis_of_diagnosis is a required field",
+        "DONOR_5 > PD_5: clinical_stage_group is required for clinical_tumour_staging_system Revised International staging system (R-ISS)",
+        "DONOR_5 > PD_5 > TR_5 > Radiation 0: radiation_therapy_dosage is a required field",
+        "DONOR_5 > PD_5 > TR_10: Treatment type Systemic therapy should have one or more systemic therapies submitted",
+    ]
+    assert (sorted(schema.validation_warnings) == sorted(warnings))
+    assert len(schema.validation_warnings) == 8
+
+
     # temporary: remove 'month_interval' errors:
-    non_interval_errors = []
-    for e in schema.validation_errors:
-        if "month_interval" not in e:
-            non_interval_errors.append(e)
-    schema.validation_errors = non_interval_errors
-
+    schema.validation_errors = [e for e in schema.validation_errors if "month_interval" not in e]    
+
+    print(schema.validation_errors)
+    errors = [
+        "DONOR_2 > PD_2 > TR_2: Treatment start cannot be after treatment end.",
+        "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
+        "DONOR_2 > PD_2 > TR_2: Systemic therapy start date cannot be earlier than its treatment start date.",
+        "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
+        "DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.",
+        "DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas",
+        "DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.",
+        "DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date ",
+        "DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death ",
+        "DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes",
+        "Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times"
+    ]
+    assert (sorted(schema.validation_errors) == sorted(errors))
     assert len(schema.validation_errors) == 11
-    # should be the following 11 errors:
-    # "DONOR_2 > PD_2 > TR_2: Treatment start cannot be after treatment end.",
-    # "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
-    # "DONOR_2 > PD_2 > TR_2: Systemic therapy start date cannot be earlier than its treatment start date.",
-    # "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
-    # "DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.",
-    # "DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas",
-    # "DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.",
-    # "DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date ",
-    # "DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death ",
-    # "DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes",
-    # "Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times"
+
 
     # there should be an item named DUPLICATE_ID in both followup and sample_registration
     print(json.dumps(schema.identifiers, indent=2))