Stable candidate v3.1.1 #91

Merged · 82 commits · Nov 22, 2024

Commits
3197a90
adding date format to date intervals
yavyx May 24, 2024
fac45cc
addded date format
yavyx Jun 3, 2024
bd2c200
updated docs
yavyx Jun 3, 2024
4bdf2c3
updated test date formats
yavyx Jun 3, 2024
720ce67
make exceptions more readable
yavyx Jun 4, 2024
13a4b72
make exception specific
yavyx Jun 4, 2024
ac74741
cleanup
yavyx Jun 4, 2024
82026ea
cleanup
yavyx Jun 4, 2024
7ad091b
Update src/clinical_etl/CSVConvert.py
yavyx Jun 4, 2024
7378507
exception when parser fails
yavyx Jun 4, 2024
9497029
update reference date docs
yavyx Jun 4, 2024
fef07c0
improved error catching in manifest
mshadbolt Jun 4, 2024
9e09957
better error msg
mshadbolt Jun 4, 2024
2ae7583
fix month intervals
mshadbolt Jun 6, 2024
89036d6
remove import
mshadbolt Jun 6, 2024
a646208
minor fixes
yavyx Jun 6, 2024
a584ca5
Merge pull request #66 from CanDIG/yavyx/date-formats
yavyx Jun 7, 2024
75046bf
add sample redcap files (#67)
kcranston Jul 5, 2024
24cf834
create mohschemav3 class
yavyx Jul 24, 2024
259c0e1
validate v3 donors
yavyx Jul 24, 2024
2a6404d
validate primary diagnoses
yavyx Jul 24, 2024
1397b69
validate specimens
yavyx Jul 24, 2024
5413077
validation logic fix
yavyx Jul 25, 2024
a3690a8
validate treatments
yavyx Jul 25, 2024
3688cd9
treatment validation fix
yavyx Jul 25, 2024
5391ff9
validate systemic & radiation therapies
yavyx Jul 25, 2024
742856c
validate followups
yavyx Jul 25, 2024
5a0eda6
replace match with if, less indentation
yavyx Jul 26, 2024
c3f6574
validate exposures & comorbidities
yavyx Jul 29, 2024
697fa3c
rename v3 to lowercase
yavyx Jul 30, 2024
a78085a
sample registration validation (required)
yavyx Jul 30, 2024
5681aff
validate surgeries
yavyx Jul 31, 2024
7aaccfd
added missing argument
yavyx Jul 31, 2024
e7b4dfd
fix nested schemas
yavyx Jul 31, 2024
56d900b
update references to v3
yavyx Jul 31, 2024
f3e7c26
fix test
yavyx Jul 31, 2024
5883699
fix test, using v2 by default
yavyx Jul 31, 2024
4c16e19
update default templates
yavyx Jul 31, 2024
f113fdb
update test data files
mshadbolt Aug 2, 2024
4631e59
manual mapping function changes to csv templates
mshadbolt Aug 2, 2024
a3343b1
update test yamls
mshadbolt Aug 2, 2024
694c784
update tests
mshadbolt Aug 2, 2024
1a24ce9
add biomarker validation, edit resolution to work
mshadbolt Aug 2, 2024
74c3f64
update templates
mshadbolt Aug 6, 2024
ad06530
fix typo
mshadbolt Aug 6, 2024
a643f76
fix missing comma
mshadbolt Aug 6, 2024
38f3ec0
remove extra staging validation
mshadbolt Aug 7, 2024
bce78b8
rename v2 template
yavyx Aug 7, 2024
ec5b414
added multisheet line for testing
yavyx Aug 7, 2024
22c32fb
stage group validation fix
yavyx Aug 8, 2024
8118a43
validate systemic therapy dates
yavyx Aug 9, 2024
4e1c46e
test systemic therapy date validation
yavyx Aug 9, 2024
66b5ddc
add sample redcap files (#67)
kcranston Jul 5, 2024
59c0cfe
Merge branch 'develop' into yavyx/moh-v3
yavyx Aug 9, 2024
b849e87
Merge pull request #68 from CanDIG/yavyx/moh-v3
yavyx Aug 9, 2024
e57ca01
Fix up some of the validations (#69)
mshadbolt Aug 10, 2024
c532299
update to version 3 (#70)
mshadbolt Aug 15, 2024
94434d2
Add RedCap export splitting script (#78)
mshadbolt Sep 5, 2024
1214313
Update validation and test data for model 3.1 (#80)
mshadbolt Sep 18, 2024
32fe6a8
update schema urls
daisieh Sep 19, 2024
aa8793d
more schema changes in v3 templates
yavyx Sep 21, 2024
8b02362
Merge pull request #82 from CanDIG/daisieh/update-schema-url
daisieh Sep 21, 2024
89cfb28
DIG-1772 & DIG-1782: Handle -99 and 'Not available' as missing (#83)
mshadbolt Sep 24, 2024
52487a6
Merge branch 'stable' into develop
mshadbolt Sep 30, 2024
ced45ae
fix templates rm dists (#85)
mshadbolt Oct 2, 2024
3ae6c6f
Allows to be run as a stand alone. (#87)
DavidBrownlee Oct 25, 2024
077b617
Trying to fix .github/workflow Compare to moh_v3_template.csv test. …
DavidBrownlee Oct 25, 2024
4da6290
github workflow corrected.
DavidBrownlee Oct 25, 2024
b57146f
DIG-1819: Warn instead of error when reference date missing & add val…
mshadbolt Nov 8, 2024
cda272d
move missing cases to validation file
yavyx Nov 14, 2024
bbd1433
remove redundant map file save
yavyx Nov 14, 2024
bee7427
update readme
yavyx Nov 14, 2024
c404630
simplify required fields validation
yavyx Nov 15, 2024
4ecf928
update test
yavyx Nov 15, 2024
7d9796b
restore missing warning
yavyx Nov 18, 2024
e295780
Merge pull request #89 from CanDIG/yavyx/move-missing-cases
yavyx Nov 18, 2024
d1e9ddd
update pkg-info
yavyx Nov 19, 2024
f9c585d
update readme
yavyx Nov 19, 2024
975c974
Merge branch 'david/githubWorkflowTestFix' into develop
DavidBrownlee Nov 19, 2024
96297d1
Update version in pyproject.toml
mshadbolt Nov 19, 2024
79447d8
remove dist files
yavyx Nov 22, 2024
8397452
Merge pull request #90 from CanDIG/yavyx/update-docs
yavyx Nov 22, 2024
10 changes: 5 additions & 5 deletions .github/workflows/test.yml
@@ -19,18 +19,18 @@ jobs:
- name: Install dependencies
run: |
pip install -r requirements.txt
python -m pip install -e .
- name: Test with pytest
run: |
pytest
- name: Compare moh_template.csv
- name: Compare to moh_v3_template.csv
shell: bash {0}
run: |
python generate_schema.py
diff template.csv moh_template.csv > curr_diff.txt
# Script based largely on update_moh_template.sh
python src/clinical_etl/generate_schema.py --out moh_template
diff moh_template.csv moh_v3_template.csv > curr_diff.txt
bytes=$(head -5 curr_diff.txt | wc -c)
dd if=curr_diff.txt bs="$bytes" skip=1 conv=notrunc of=new_diff.txt
diff new_diff.txt test_data/moh_diffs.txt
diff new_diff.txt tests/moh_diffs.txt
if [[ $? == 1 ]]; then echo MoH template checking needs to be updated! See https://github.com/CanDIG/clinical_ETL_code#mapping-template for information.
exit 1
fi
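
In plain terms, this job regenerates the mapping template from the current schema, diffs it against the committed `moh_v3_template.csv`, strips the first five lines of that diff, and requires the remainder to match the accepted differences recorded in `tests/moh_diffs.txt`. A commented sketch of the same logic (the commands are as in the workflow; the reason for the five-line skip — a diff header that always differs — is an assumption):

```sh
python src/clinical_etl/generate_schema.py --out moh_template  # regenerate moh_template.csv
diff moh_template.csv moh_v3_template.csv > curr_diff.txt      # compare to the committed template
bytes=$(head -5 curr_diff.txt | wc -c)                         # byte length of the first 5 lines
dd if=curr_diff.txt bs="$bytes" skip=1 conv=notrunc of=new_diff.txt  # drop exactly those bytes
diff new_diff.txt tests/moh_diffs.txt                          # remainder must match the known diffs
```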
19 changes: 2 additions & 17 deletions README.md
@@ -45,12 +45,6 @@ Install the repo's requirements in your virtual environment
pip install -r requirements.txt
```

>[!NOTE]
> If Python can't find the `clinical_etl` module when running `CSVConvert`, install the depencency manually:
> ```
> pip install -e clinical_ETL_code/
> ```

Before running the script, you will need to have your input files, this will be clinical data in a tabular format (`xlsx`/`csv`) that can be read into program and a cohort directory containing the files that define the schema and mapping configurations.

### Input file/s format
@@ -65,7 +59,7 @@ If you are working with exports from RedCap, the sample files in the [`sample_in

### Setting up a cohort directory

For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are:
For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `clinical_ETL_data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are:

* a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation
* a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`)
@@ -96,7 +90,7 @@ You'll need to create a mapping template that defines the mapping between the fi

Each line in the mapping template is composed of comma separated values with two components. The first value is an `element` or field from the target schema and the second value contains a suggested `mapping method` or function to map a field from an input sheet to a valid value for the identified `element`. Each `element`, shows the full object linking path to each field required by the model. These values should not be edited.
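
For example, two lines from [`moh_v3_template.csv`](moh_v3_template.csv) (the same file edited later in this diff) pair an `element` path with its `mapping method`:

```csv
DONOR.INDEX.followups.INDEX.submitter_follow_up_id, {single_val(FOLLOWUPS_SHEET.submitter_follow_up_id)}
DONOR.INDEX.followups.INDEX.date_of_followup, {date_interval(FOLLOWUPS_SHEET.date_of_followup)}
```

Here `FOLLOWUPS_SHEET` is a generic sheet name to be replaced with the name of your own input sheet, as described below.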

If you are generating a mapping for the current CanDIG MoH model, you can use the pre-generated [`moh_template.csv`](moh_template.csv) file. This file is modified from the auto-generated template to update a few fields that require specific handling.
If you are generating a mapping for the current CanDIG MoH model, you can use the pre-generated [`moh_v3_template.csv`](moh_v3_template.csv) file. This file is modified from the auto-generated template to update a few fields that require specific handling.

You will need to edit the `mapping method` values in each line in the following ways:
1. Replace the generic sheet names (e.g. `DONOR_SHEET`, `SAMPLE_REGISTRATIONS_SHEET`) with the sheet/csv names you are using as your input to `CSVConvert.py`
@@ -158,12 +152,6 @@ The main output `<INPUT_DIR>_map.json` and optional output`<INPUT_DIR>_indexed.j

Validation will automatically be run after the conversion is complete. Any validation errors or warnings will be reported both on the command line and as part of the `<INPUT_DIR>_map.json` file.

>[!NOTE]
> If Python can't find the `clinical_etl` module when running `CSVConvert`, install the depencency manually:
> ```
> pip install -e clinical_ETL_code/
> ```

#### Format of the output files

`<INPUT_DIR>_map.json` is the main output and contains the results of the mapping, conversion and validation as well as summary statistics.
@@ -187,9 +175,6 @@ A summarised example of the output is below:
"schemas_used": [
"donors"
],
"cases_missing_data": [
"DONOR_5"
],
"schemas_not_used": [
"exposures",
"biomarkers"
Binary file removed dist/clinical_ETL-2.2.1-py3-none-any.whl
Binary file removed dist/clinical_ETL-3.0.0-py3-none-any.whl
Binary file removed dist/clinical_ETL-3.1.0-py3-none-any.whl
Binary file removed dist/clinical_etl-2.2.1.tar.gz
Binary file removed dist/clinical_etl-3.0.0.tar.gz
Binary file removed dist/clinical_etl-3.1.0.tar.gz
2 changes: 1 addition & 1 deletion moh_v3_template.csv
@@ -158,7 +158,7 @@ DONOR.INDEX.biomarkers.INDEX.her2_ish_status, {single_val(BIOMARKERS_SHEET.her2_
DONOR.INDEX.biomarkers.INDEX.hpv_ihc_status, {single_val(BIOMARKERS_SHEET.hpv_ihc_status)}
DONOR.INDEX.biomarkers.INDEX.hpv_pcr_status, {single_val(BIOMARKERS_SHEET.hpv_pcr_status)}
DONOR.INDEX.biomarkers.INDEX.hpv_strain, {pipe_delim(BIOMARKERS_SHEET.hpv_strain)}
DONOR.INDEX.followups.INDEX, {indexed_on(FOLLOWUPS_SHEET.submitter_donor_id)}
DONOR.INDEX.followups.INDEX, {moh_indexed_on_donor_if_others_absent(FOLLOWUPS_SHEET.submitter_donor_id)}
DONOR.INDEX.followups.INDEX.submitter_follow_up_id, {single_val(FOLLOWUPS_SHEET.submitter_follow_up_id)}
DONOR.INDEX.followups.INDEX.date_of_followup, {date_interval(FOLLOWUPS_SHEET.date_of_followup)}
DONOR.INDEX.followups.INDEX.disease_status_at_followup, {single_val(FOLLOWUPS_SHEET.disease_status_at_followup)}
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools >= 61.0"]
build-backend = "setuptools.build_meta"

[project]
version = "3.1.0"
version = "3.1.1"
name = "clinical_ETL"
dependencies = [
"pandas>=2.1.0",
@@ -25,4 +25,4 @@ readme = "README.md"
CSVConvert = "clinical_etl.CSVConvert:main"

[project.urls]
Repository = "https://github.com/CanDIG/clinical_ETL_code"
Repository = "https://github.com/CanDIG/clinical_ETL_code"
20 changes: 3 additions & 17 deletions src/clinical_ETL.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: clinical_ETL
Version: 3.1.0
Version: 3.1.1
Summary: ETL module for transforming clinical CSV data into properly-formatted packets for ingest into Katsu
Project-URL: Repository, https://github.com/CanDIG/clinical_ETL_code
Requires-Python: >=3.10
@@ -64,12 +64,6 @@ Install the repo's requirements in your virtual environment
pip install -r requirements.txt
```

>[!NOTE]
> If Python can't find the `clinical_etl` module when running `CSVConvert`, install the depencency manually:
> ```
> pip install -e clinical_ETL_code/
> ```

Before running the script, you will need to have your input files, this will be clinical data in a tabular format (`xlsx`/`csv`) that can be read into program and a cohort directory containing the files that define the schema and mapping configurations.

### Input file/s format
@@ -84,7 +78,7 @@ If you are working with exports from RedCap, the sample files in the [`sample_in

### Setting up a cohort directory

For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are:
For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `clinical_ETL_data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are:

* a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation
* a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`)
@@ -177,12 +171,6 @@ The main output `<INPUT_DIR>_map.json` and optional output`<INPUT_DIR>_indexed.j

Validation will automatically be run after the conversion is complete. Any validation errors or warnings will be reported both on the command line and as part of the `<INPUT_DIR>_map.json` file.

>[!NOTE]
> If Python can't find the `clinical_etl` module when running `CSVConvert`, install the depencency manually:
> ```
> pip install -e clinical_ETL_code/
> ```

#### Format of the output files

`<INPUT_DIR>_map.json` is the main output and contains the results of the mapping, conversion and validation as well as summary statistics.
@@ -206,9 +194,6 @@ A summarised example of the output is below:
"schemas_used": [
"donors"
],
"cases_missing_data": [
"DONOR_5"
],
"schemas_not_used": [
"exposures",
"biomarkers"
@@ -220,6 +205,7 @@ A summarised example of the output is below:
}
}
```
`<INPUT_DIR>_validation_results.json` contains all validation warnings and errors.
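
Judging from the structure assembled in `CSVConvert.py` (see its diff below), the file is a JSON object with three keys; a minimal sketch with illustrative (empty) contents:

```json
{
    "validation_errors": [],
    "validation_warnings": [],
    "cases_missing_data": []
}
```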

The mapping and transformation result is found in the `"donors"` key.

37 changes: 17 additions & 20 deletions src/clinical_etl/CSVConvert.py
@@ -12,11 +12,7 @@
import yaml
import argparse
from tqdm import tqdm
from clinical_etl import mappings
# Include clinical_etl parent directory in the module search path.
current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
import mappings


def verbose_print(message):
@@ -277,7 +273,7 @@ def eval_mapping(node_name, rownum):
"""
verbose_print(f" Evaluating {mappings.IDENTIFIER}: {node_name}")
if "mappings" not in mappings.MODULES:
mappings.MODULES["mappings"] = importlib.import_module("clinical_etl.mappings")
mappings.MODULES["mappings"] = importlib.import_module("mappings")
modulename = "mappings"

method, parameters = parse_mapping_function(node_name)
@@ -596,7 +592,7 @@ def load_manifest(manifest_file):

# programatically load schema class based on manifest value:
# schema class definition will be in a file named schema_class.lower()
schema_mod = importlib.import_module(f"clinical_etl.{schema_class.lower()}")
schema_mod = importlib.import_module(f"{schema_class.lower()}")
schema = getattr(schema_mod, schema_class)(manifest["schema"])
if schema.json_schema is None:
sys.exit(f"Could not read an openapi schema at {manifest['schema']};\n"
@@ -633,7 +629,7 @@ def load_manifest(manifest_file):
f"{manifest_dir} and has the correct name.\n---")
sys.exit(e)
# mappings is a standard module: add it
mappings.MODULES["mappings"] = importlib.import_module("clinical_etl.mappings")
mappings.MODULES["mappings"] = importlib.import_module("mappings")
return result


@@ -743,36 +739,37 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver
json.dump(mappings.INDEXED_DATA, f, indent=4)

result_key = list(schema.validation_schema.keys()).pop(0)

result = {
"openapi_url": schema.openapi_url,
"schema_class": type(schema).__name__,
result_key: packets
}
if schema.katsu_sha is not None:
result["katsu_sha"] = schema.katsu_sha
print(f"{Bcolors.OKGREEN}Saving packets to file.{Bcolors.ENDC}")
with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion
if minify:
json.dump(result, f)
else:
json.dump(result, f, indent=4)

# add validation data:
print(f"\n{Bcolors.OKGREEN}Starting validation...{Bcolors.ENDC}")
schema.validate_ingest_map(result)
validation_results = {"validation_errors": schema.validation_errors,
"validation_warnings": schema.validation_warnings}
"validation_warnings": schema.validation_warnings,
"cases_missing_data": schema.statistics["cases_missing_data"]}
result["statistics"] = schema.statistics
with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion
result["statistics"].pop("cases_missing_data") # remove donor IDs from _map.json file

# write ingestion and validation json files
print(f"{Bcolors.OKGREEN}Saving packets to file.{Bcolors.ENDC}")
with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f:
if minify:
json.dump(result, f)
else:
json.dump(result, f, indent=4)
errors_present = False
with open(f"{input_path}_validation_results.json", 'w') as f:
json.dump(validation_results, f, indent=4)
print(f"Warnings written to {input_path}_validation_results.json.")
if len(validation_results["validation_errors"]) == 0 and len(validation_results["validation_warnings"]) == 0:
print(f"{Bcolors.OKGREEN}Validation passed!{Bcolors.ENDC}")
else:
with open(f"{input_path}_validation_results.json", 'w') as f:
json.dump(validation_results, f, indent=4)
print(f"Warnings written to {input_path}_validation_results.json.")
if len(validation_results["validation_warnings"]) > 0:
if len(validation_results["validation_warnings"]) > 20:
print(f"\n{Bcolors.WARNING}WARNING: There are {len(validation_results['validation_warnings'])} validation "
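
For reference, a sketch of invoking the converter from Python, using only the parameters visible in the `csv_convert` signature above; the paths are placeholders, and the same entry point is exposed as the `CSVConvert` console script in `pyproject.toml`:

```python
# Sketch only: the input directory and manifest paths are hypothetical.
from clinical_etl.CSVConvert import csv_convert

csv_convert(
    input_path="my_cohort_data",            # directory of input csv/xlsx sheets
    manifest_file="my_cohort/manifest.yml", # cohort manifest with schema + mapping config
    minify=False,                           # pretty-print <INPUT_DIR>_map.json
    index_output=False,                     # skip the optional <INPUT_DIR>_indexed.json
)
# Writes <INPUT_DIR>_map.json and <INPUT_DIR>_validation_results.json as described above.
```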
3 changes: 3 additions & 0 deletions src/clinical_etl/__init__.py
@@ -0,0 +1,3 @@
# Allows relative imports from current directory to work.
import os, sys
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
6 changes: 6 additions & 0 deletions src/clinical_etl/generate_mapping_docs.py
@@ -1,3 +1,9 @@
# Updates the ../../mapping_functions.md
# Prior to running, set the PYTHONPATH for use by the subprocess with:
# export PYTHONPATH="$PWD"
# Then run:
# python generate_mapping_docs.py

import subprocess


4 changes: 2 additions & 2 deletions src/clinical_etl/generate_schema.py
@@ -21,8 +21,8 @@ def parse_args():
default="https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_service/mohpackets/docs/schemas/schema.json")
parser.add_argument('--schema', type=str, help="Name of schema class", default="MoHSchemaV3")
parser.add_argument('--out', type=str,
help="name of output file; csv extension will be added. Default is template",
default="template")
help="name of output file; csv extension will be added. Default is moh_template",
default="moh_template")
args = parser.parse_args()
return args

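
Usage matches the CI workflow above; with the new default the invocation is simply:

```sh
# Regenerate the template from the default katsu schema URL:
python src/clinical_etl/generate_schema.py --schema MoHSchemaV3 --out moh_template
# -> writes moh_template.csv
```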
2 changes: 1 addition & 1 deletion src/clinical_etl/genomicschema.py
@@ -1,6 +1,6 @@
import json
import dateparser
from clinical_etl.schema import BaseSchema, ValidationError
from schema import BaseSchema, ValidationError


"""
30 changes: 20 additions & 10 deletions src/clinical_etl/mappings.py
@@ -4,6 +4,7 @@
import datetime
import math
from dateutil import relativedelta
import copy

VERBOSE = False
MODULES = {}
@@ -70,21 +71,27 @@ def earliest_date(data_values):
"""
fields = list(data_values.keys())
date_resolution = list(data_values[fields[0]].values())[0]
dates = list(data_values[fields[1]].values())[0]
dates = copy.deepcopy(list(data_values[fields[1]].values())[0])
earliest = DEFAULT_DATE_PARSER.get_date_data(str(datetime.date.today()))
# Ensure dates is a list, not a string, to allow non-indexed, single value entries.
if type(dates) is not list:
dates_list = [dates]
else:
dates_list = dates
for date in dates_list:
d = DEFAULT_DATE_PARSER.get_date_data(date)
if d['date_obj'] < earliest['date_obj']:
earliest = d
return {
"offset": earliest['date_obj'].strftime("%Y-%m-%d"),
"period": date_resolution
}
# If there's a None value, ignore it
if None in dates_list:
dates_list = [x for x in dates_list if x is not None]
if len(dates_list) > 0:
for date in dates_list:
d = DEFAULT_DATE_PARSER.get_date_data(date)
if d['date_obj'] < earliest['date_obj']:
earliest = d
return {
"offset": earliest['date_obj'].strftime("%Y-%m-%d"),
"period": date_resolution
}
else:
return None


@@ -100,7 +107,9 @@ def date_interval(data_values):
try:
reference = INDEXED_DATA["data"]["CALCULATED"][IDENTIFIER]["REFERENCE_DATE"][0]
except KeyError:
raise MappingError("No reference date found to calculate date_interval: is there a reference_date specified in the manifest?", field_level=1)
_warn(message="No reference date found to calculate date_interval: check the reference_date is specified in the manifest or if it is missing for this donor",
input_values=data_values)
return None
DEFAULT_DATE_PARSER = dateparser.DateDataParser(
settings={"PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": DATE_FORMAT}
)
@@ -578,3 +587,4 @@ def _parse_date(date_string):
except Exception as e:
raise MappingError(f"error in date({date_string}): {type(e)} {e}", field_level=2)
return date_string
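
The `earliest_date` change can be sketched as follows; the input shape (`{field: {sheet: value}}`) and the sheet names are inferred from the function body, so treat them as assumptions. In the same spirit, `date_interval` now warns and returns `None` when no reference date is found, instead of raising a `MappingError`.

```python
# Sketch of the new None handling in earliest_date (names are hypothetical):
from clinical_etl import mappings

data_values = {
    "date_resolution": {"DONOR_SHEET": "month"},
    "dates": {"FOLLOWUPS_SHEET": ["2023-06-10", None, "2022-11-03"]},
}
# The None entry is filtered out before parsing, so this should return
# {"offset": "2022-11-03", "period": "month"}; a list containing only
# None values now yields None instead of raising an exception.
print(mappings.earliest_date(data_values))
```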

2 changes: 1 addition & 1 deletion src/clinical_etl/mohschemav2.py
@@ -1,6 +1,6 @@
import json
import dateparser
from clinical_etl.schema import BaseSchema, ValidationError
from schema import BaseSchema, ValidationError


"""