From e57ca0134618bfc7ecd993b7f4c6a452d44a0cd6 Mon Sep 17 00:00:00 2001 From: Marion Date: Fri, 9 Aug 2024 19:44:50 -0700 Subject: [PATCH] Fix up some of the validations (#69) * fix for loop * fix keyerror * fix if elses * another fix * update tests * revert elif --- src/clinical_etl/mohschemav3.py | 40 +++++++++++++++++------------- tests/raw_data/SystemicTherapy.csv | 2 +- tests/raw_data/Treatment.csv | 2 +- tests/test_data_ingest.py | 23 +++++++++-------- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/src/clinical_etl/mohschemav3.py b/src/clinical_etl/mohschemav3.py index 3dcff3e..1a9b85f 100644 --- a/src/clinical_etl/mohschemav3.py +++ b/src/clinical_etl/mohschemav3.py @@ -351,21 +351,23 @@ def validate_treatments(self, map_json): if treatment_start > treatment_end: self.fail("Treatment start cannot be after treatment end.") - if "systemic_therapies" in map_json and len(map_json["systemic_therapies"])> 0: + if "systemic_therapies" in map_json and len(map_json["systemic_therapies"]) > 0: for therapy in map_json["systemic_therapies"]: - if "dict" in str(type(therapy["start_date"])): - therapy_start = therapy["start_date"]['month_interval'] - else: - therapy_start = dateparser.parse(therapy["start_date"]).date() - if "end_date" in therapy and therapy["end_date"] not in [None, '']: - if "dict" in str(type(therapy["end_date"])): - therapy_end = therapy["end_date"]["month_interval"] - else: - therapy_end = dateparser.parse(therapy["treatment_end_date"]).date() - if therapy_start < treatment_start: - self.fail("Systemic therapy start date cannot be earlier than its treatment start date.") - if therapy_end > treatment_end: - self.fail("Systemic therapy end date cannot be after its treatment end date.") + if "start_date" in therapy and therapy["start_date"] not in [None, '']: + if "dict" in str(type(therapy["start_date"])): + therapy_start = therapy["start_date"]['month_interval'] + else: + therapy_start = dateparser.parse(therapy["start_date"]).date() + if therapy_start < treatment_start: + self.fail( + "Systemic therapy start date cannot be earlier than its treatment start date.") + if "end_date" in therapy and therapy["end_date"] not in [None, '']: + if "dict" in str(type(therapy["end_date"])): + therapy_end = therapy["end_date"]["month_interval"] + else: + therapy_end = dateparser.parse(therapy["treatment_end_date"]).date() + if therapy_end > treatment_end: + self.fail("Systemic therapy end date cannot be after its treatment end date.") def validate_systemic_therapies(self, map_json): if "drug_dose_units" not in map_json or map_json["drug_dose_units"] is None: @@ -374,14 +376,18 @@ def validate_systemic_therapies(self, map_json): self.warn(f"drug_dose_units required if {x} is submitted") for prop in map_json: if prop == "start_date" and map_json["start_date"] is not None: + start = None + end = None if "end_date" in map_json and map_json["end_date"] is not None: if "dict" in str(type(map_json["start_date"])): - start = map_json["start_date"]["month_interval"] - end = map_json["end_date"]["month_interval"] + if "month_interval" in map_json["start_date"]: + start = map_json["start_date"]["month_interval"] + if "month_interval" in map_json["end_date"]: + end = map_json["end_date"]["month_interval"] else: start = dateparser.parse(map_json["start_date"]).date() end = dateparser.parse(map_json["end_date"]).date() - if start > end: + if start and end and start > end: self.fail("Systemic therapy start cannot be after systemic therapy end.") def validate_radiations(self, map_json): diff --git a/tests/raw_data/SystemicTherapy.csv b/tests/raw_data/SystemicTherapy.csv index f11297e..7f0909c 100644 --- a/tests/raw_data/SystemicTherapy.csv +++ b/tests/raw_data/SystemicTherapy.csv @@ -1,6 +1,6 @@ submitter_donor_id,submitter_treatment_id,systemic_therapy_type,drug_name,drug_reference_identifier,drug_dose_units,prescribed_cumulative_drug_dose,actual_cumulative_drug_dose,drug_reference_database, days_per_cycle, number_of_cycles,start_date,end_date DONOR_2,TR_2,Chemotherapy,NIVOLUMAB,87354,mg/m2,150,111,PubChem,4,2,1/12/2021,1/03/2023 -DONOR_2,TR_2,Chemotherapy,NIVOLUMAB,87333,mg/m2,150,111,PubChem,5,2,1/04/2023,1/12/2024 +DONOR_2,TR_2,Chemotherapy,NIVOLUMAB,87333,mg/m2,150,111,PubChem,5,2,1/04/2020,1/12/2024 DONOR_3,TR_3,Hormone therapy,degarelix,46475,ug/m2,179,97,PubChem,6,3,10/12/2020,19/12/2021 DONOR_4,TR_4,Immunotherapy,Pembrolizumab,4459876,IU/kg,95,160,RxNorm,4,2,1/3/2021,12/12/2021 DONOR_2,TR_8,Immunotherapy,Pexidartinib,8836851,ug/m2,197,183,PubChem,6,1,9/5/2021,6/6/2023 diff --git a/tests/raw_data/Treatment.csv b/tests/raw_data/Treatment.csv index 19bd240..6b5d854 100644 --- a/tests/raw_data/Treatment.csv +++ b/tests/raw_data/Treatment.csv @@ -1,6 +1,6 @@ submitter_treatment_id,submitter_donor_id, submitter_primary_diagnosis_id, treatment_type, is_primary_treatment, treatment_start_date, treatment_end_date, treatment_intent, response_to_treatment_criteria_method, response_to_treatment, status_of_treatment TR_1,DONOR_1,PD_1,Bone marrow transplant,Yes,01/09/2021,01/09/2022,Palliative,RECIST 1.1,Complete response,Treatment completed as prescribed -TR_2,DONOR_2,PD_2,Systemic therapy,No,01/12/2021,01/12/2024,Guidance,iRECIST,Partial response,Treatment incomplete due to technical or organizational problems +TR_2,DONOR_2,PD_2,Systemic therapy,No,01/12/2021,01/12/2020,Guidance,iRECIST,Partial response,Treatment incomplete due to technical or organizational problems TR_3,DONOR_3,PD_3,Systemic therapy,Yes,01/01/2021,01/01/2022,Diagnostic,,Progressive disease,Treatment incomplete because patient died TR_4,DONOR_4,PD_4,Systemic therapy,No,01/02/2021,02/02/2022,Forensic,Response Assessment in Neuro-Oncology (RANO),Stable disease,Patient choice (stopped or interrupted treatment) TR_5,DONOR_5,PD_5,Radiation therapy,Yes,01/09/2021,01/09/2022,Preventive,AML Response Criteria,,Physician decision (stopped or interrupted treatment) diff --git a/tests/test_data_ingest.py b/tests/test_data_ingest.py index 06b597d..fe5732d 100644 --- a/tests/test_data_ingest.py +++ b/tests/test_data_ingest.py @@ -100,16 +100,19 @@ def test_validation(packets, schema): non_interval_errors.append(e) schema.validation_errors = non_interval_errors - assert len(schema.validation_errors) == 8 - # should be the following 7 errors: - # DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date - # DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death - # DONOR_2: PD_2 > TR_2: date_of_death cannot be earlier than treatment_end_date - # DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date. - # DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas - # DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date. - # DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes - # Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times + assert len(schema.validation_errors) == 11 + # should be the following 11 errors: + # "DONOR_2 > PD_2 > TR_2: Treatment start cannot be after treatment end.", + # "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.", + # "DONOR_2 > PD_2 > TR_2: Systemic therapy start date cannot be earlier than its treatment start date.", + # "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.", + # "DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.", + # "DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas", + # "DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.", + # "DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date ", + # "DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death ", + # "DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes", + # "Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times" # there should be an item named DUPLICATE_ID in both followup and sample_registration print(json.dumps(schema.identifiers, indent=2))