Merge pull request #68 from CanDIG/yavyx/moh-v3

DIG-1684: Update clinical_etl to v3 clinical data model
CanDIG · Aug 9, 2024 · b849e87 · b849e87
2 parents 75046bf + 59c0cfe
commit b849e87
Show file tree

Hide file tree

Showing 25 changed files with 874 additions and 83 deletions.
diff --git a/README.md b/README.md
@@ -85,7 +85,7 @@ The `manifest.yml` file contains settings for the cohort mapping. There is a sam
 | mapping       | the mapping template csv file that lists the mappings for each field based on `moh_template.csv`, assumed to be in the same directory as the `manifest.yml` file                                          |
 | identifier    | the unique identifier for the donor or root node                                                                                                                                                          |
 | schema        | a URL to the openapi schema file                                                                                                                                                                          |
-| schema_class  | The name of the class in the schema used as the model for creating the map.json. Currently supported: `MoHSchema` - for clinical MoH data and `GenomicSchema` for creating a genomic ingest linking file. |
+| schema_class  | The name of the class in the schema used as the model for creating the map.json. Currently supported: `MoHSchemaV2` and `MoHSchemaV3` for clinical MoH data, and `GenomicSchema` for creating a genomic ingest linking file. |
 | reference_date | a reference date used to calculate date intervals, formatted as a mapping entry for the mapping template                                                                                                 |
 | date_format | Specify the format of the dates in your input data. Use any combination of the characters `DMY` to specify the order (e.g. `DMY`, `MDY`, `YMD`, etc).                                                                                   |
 | functions     | A list of one or more filenames containing additional mapping functions, can be omitted if not needed. Assumed to be in the same directory as the `manifest.yml` file                                     |
@@ -121,6 +121,7 @@ usage: generate_schema.py [-h] --url URL [--out OUT]
 options:
   -h, --help  show this help message and exit
   --url URL   URL to openAPI schema file (raw github link)
+  --schema SCHEMA  Name of schema class. Default is MoHSchemaV3
   --out OUT   name of output file; csv extension will be added. Default is template
 ```
 </details>

diff --git a/moh_template.csv → moh_v2_template.csv b/moh_template.csv → moh_v2_template.csv
diff --git a/moh_v3_template.csv b/moh_v3_template.csv
diff --git a/sample_inputs/generic_example/manifest.yml b/sample_inputs/generic_example/manifest.yml
@@ -6,7 +6,7 @@ identifier: submitter_donor_id
 # a link to the openapi schema
 schema: https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_service/mohpackets/docs/schema.yml
 # class of schema for validation:
-schema_class: MoHSchema
+schema_class: MoHSchemaV3
 # a reference date used to calculate date intervals, formatted as a mapping entry for the mapping template
 reference_date: earliest_date(Donor.date_resolution, PrimaryDiagnosis.date_of_diagnosis)
 # one or more files (dataset_functions.py) that implement the mappings

diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py
@@ -573,7 +573,7 @@ def check_for_sheet_inconsistencies(template_sheets, csv_sheets):
 def load_manifest(manifest_file):
     """Given a manifest file's path, return the data inside it."""
     identifier = None
-    schema_class = "MoHSchema"
+    schema_class = "MoHSchemaV2"
     mapping_path = None
     result = {}
     try:

diff --git a/src/clinical_etl/generate_schema.py b/src/clinical_etl/generate_schema.py
@@ -10,14 +10,15 @@
 import pandas
 import sys
 import argparse
-from mohschema import MoHSchema
+from mohschemav2 import MoHSchemaV2
+from mohschemav3 import MoHSchemaV3
 import re
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--url', type=str, help="URL to openAPI schema file (raw github link)", default="https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_service/mohpackets/docs/schema.yml")
-    parser.add_argument('--schema', type=str, help="Name of schema class", default="MoHSchema")
+    parser.add_argument('--schema', type=str, help="Name of schema class", default="MoHSchemaV3")
     parser.add_argument('--out', type=str, help="name of output file; csv extension will be added. Default is template", default="template")
     args = parser.parse_args()
     return args

diff --git a/src/clinical_etl/mappings.py b/src/clinical_etl/mappings.py
@@ -152,7 +152,7 @@ def int_to_date_interval_json(data_values):
         return
     # Either month or day date resolutions are permitted.
     try:
-        resolution = INDEXED_DATA["data"]["CALCULATED"][IDENTIFIER]["date_resolution"][0]
+        resolution = INDEXED_DATA["data"]["Donor"][IDENTIFIER]["date_resolution"][0]
     except KeyError:
         raise MappingError("No date_resolution found to specify date interval resolution: is there a date_resolution specified in the donor file?", field_level=2)
     # Format as JSON.  Always include a month_interval.  day_interval is optional.

diff --git a/src/clinical_etl/mohschema.py → src/clinical_etl/mohschemav2.py b/src/clinical_etl/mohschema.py → src/clinical_etl/mohschemav2.py
@@ -7,7 +7,7 @@
 A class for the representation of a DonorWithClinicalData (MoHCCN data model v2) object in Katsu.
 """
 
-class MoHSchema(BaseSchema):
+class MoHSchemaV2(BaseSchema):
     schema_name = "DonorWithClinicalDataSchema"
     base_name = "DONOR"