Skip to content

Commit

Permalink
Fix up some of the validations (#69)
Browse files Browse the repository at this point in the history
* fix for loop

* fix keyerror

* fix if elses

* another fix

* update tests

* revert elif
  • Loading branch information
mshadbolt authored Aug 10, 2024
1 parent b849e87 commit e57ca01
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 29 deletions.
40 changes: 23 additions & 17 deletions src/clinical_etl/mohschemav3.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,21 +351,23 @@ def validate_treatments(self, map_json):
if treatment_start > treatment_end:
self.fail("Treatment start cannot be after treatment end.")

if "systemic_therapies" in map_json and len(map_json["systemic_therapies"])> 0:
if "systemic_therapies" in map_json and len(map_json["systemic_therapies"]) > 0:
for therapy in map_json["systemic_therapies"]:
if "dict" in str(type(therapy["start_date"])):
therapy_start = therapy["start_date"]['month_interval']
else:
therapy_start = dateparser.parse(therapy["start_date"]).date()
if "end_date" in therapy and therapy["end_date"] not in [None, '']:
if "dict" in str(type(therapy["end_date"])):
therapy_end = therapy["end_date"]["month_interval"]
else:
therapy_end = dateparser.parse(therapy["treatment_end_date"]).date()
if therapy_start < treatment_start:
self.fail("Systemic therapy start date cannot be earlier than its treatment start date.")
if therapy_end > treatment_end:
self.fail("Systemic therapy end date cannot be after its treatment end date.")
if "start_date" in therapy and therapy["start_date"] not in [None, '']:
if "dict" in str(type(therapy["start_date"])):
therapy_start = therapy["start_date"]['month_interval']
else:
therapy_start = dateparser.parse(therapy["start_date"]).date()
if therapy_start < treatment_start:
self.fail(
"Systemic therapy start date cannot be earlier than its treatment start date.")
if "end_date" in therapy and therapy["end_date"] not in [None, '']:
if "dict" in str(type(therapy["end_date"])):
therapy_end = therapy["end_date"]["month_interval"]
else:
therapy_end = dateparser.parse(therapy["treatment_end_date"]).date()
if therapy_end > treatment_end:
self.fail("Systemic therapy end date cannot be after its treatment end date.")

def validate_systemic_therapies(self, map_json):
if "drug_dose_units" not in map_json or map_json["drug_dose_units"] is None:
Expand All @@ -374,14 +376,18 @@ def validate_systemic_therapies(self, map_json):
self.warn(f"drug_dose_units required if {x} is submitted")
for prop in map_json:
if prop == "start_date" and map_json["start_date"] is not None:
start = None
end = None
if "end_date" in map_json and map_json["end_date"] is not None:
if "dict" in str(type(map_json["start_date"])):
start = map_json["start_date"]["month_interval"]
end = map_json["end_date"]["month_interval"]
if "month_interval" in map_json["start_date"]:
start = map_json["start_date"]["month_interval"]
if "month_interval" in map_json["end_date"]:
end = map_json["end_date"]["month_interval"]
else:
start = dateparser.parse(map_json["start_date"]).date()
end = dateparser.parse(map_json["end_date"]).date()
if start > end:
if start and end and start > end:
self.fail("Systemic therapy start cannot be after systemic therapy end.")

def validate_radiations(self, map_json):
Expand Down
2 changes: 1 addition & 1 deletion tests/raw_data/SystemicTherapy.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
submitter_donor_id,submitter_treatment_id,systemic_therapy_type,drug_name,drug_reference_identifier,drug_dose_units,prescribed_cumulative_drug_dose,actual_cumulative_drug_dose,drug_reference_database, days_per_cycle, number_of_cycles,start_date,end_date
DONOR_2,TR_2,Chemotherapy,NIVOLUMAB,87354,mg/m2,150,111,PubChem,4,2,1/12/2021,1/03/2023
DONOR_2,TR_2,Chemotherapy,NIVOLUMAB,87333,mg/m2,150,111,PubChem,5,2,1/04/2023,1/12/2024
DONOR_2,TR_2,Chemotherapy,NIVOLUMAB,87333,mg/m2,150,111,PubChem,5,2,1/04/2020,1/12/2024
DONOR_3,TR_3,Hormone therapy,degarelix,46475,ug/m2,179,97,PubChem,6,3,10/12/2020,19/12/2021
DONOR_4,TR_4,Immunotherapy,Pembrolizumab,4459876,IU/kg,95,160,RxNorm,4,2,1/3/2021,12/12/2021
DONOR_2,TR_8,Immunotherapy,Pexidartinib,8836851,ug/m2,197,183,PubChem,6,1,9/5/2021,6/6/2023
2 changes: 1 addition & 1 deletion tests/raw_data/Treatment.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
submitter_treatment_id,submitter_donor_id, submitter_primary_diagnosis_id, treatment_type, is_primary_treatment, treatment_start_date, treatment_end_date, treatment_intent, response_to_treatment_criteria_method, response_to_treatment, status_of_treatment
TR_1,DONOR_1,PD_1,Bone marrow transplant,Yes,01/09/2021,01/09/2022,Palliative,RECIST 1.1,Complete response,Treatment completed as prescribed
TR_2,DONOR_2,PD_2,Systemic therapy,No,01/12/2021,01/12/2024,Guidance,iRECIST,Partial response,Treatment incomplete due to technical or organizational problems
TR_2,DONOR_2,PD_2,Systemic therapy,No,01/12/2021,01/12/2020,Guidance,iRECIST,Partial response,Treatment incomplete due to technical or organizational problems
TR_3,DONOR_3,PD_3,Systemic therapy,Yes,01/01/2021,01/01/2022,Diagnostic,,Progressive disease,Treatment incomplete because patient died
TR_4,DONOR_4,PD_4,Systemic therapy,No,01/02/2021,02/02/2022,Forensic,Response Assessment in Neuro-Oncology (RANO),Stable disease,Patient choice (stopped or interrupted treatment)
TR_5,DONOR_5,PD_5,Radiation therapy,Yes,01/09/2021,01/09/2022,Preventive,AML Response Criteria,,Physician decision (stopped or interrupted treatment)
Expand Down
23 changes: 13 additions & 10 deletions tests/test_data_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,19 @@ def test_validation(packets, schema):
non_interval_errors.append(e)
schema.validation_errors = non_interval_errors

assert len(schema.validation_errors) == 8
# should be the following 8 errors:
# DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date
# DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death
# DONOR_2: PD_2 > TR_2: date_of_death cannot be earlier than treatment_end_date
# DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.
# DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas
# DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.
# DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes
# Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times
assert len(schema.validation_errors) == 11
# should be the following 11 errors:
# "DONOR_2 > PD_2 > TR_2: Treatment start cannot be after treatment end.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy start date cannot be earlier than its treatment start date.",
# "DONOR_2 > PD_2 > TR_2: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_2 > PD_2_1 > TR_8: Systemic therapy end date cannot be after its treatment end date.",
# "DONOR_3 > DUPLICATE_ID > primary_site: 'Tongue' is not valid under any of the given schemas",
# "DONOR_3 > PD_3 > TR_3: Systemic therapy start date cannot be earlier than its treatment start date.",
# "DONOR_1: PD_1 > TR_1: date_of_death cannot be earlier than treatment_end_date ",
# "DONOR_1: PD_1 > TR_1: treatment_start_date cannot be after date_of_death ",
# "DONOR_5: lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes",
# "Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times"

# there should be an item named DUPLICATE_ID in both followup and sample_registration
print(json.dumps(schema.identifiers, indent=2))
Expand Down

0 comments on commit e57ca01

Please sign in to comment.