From 3ae6c6f3b069f394482adcee5e9eadac6745a8d4 Mon Sep 17 00:00:00 2001
From: DavidBrownlee <57147680+DavidBrownlee@users.noreply.github.com>
Date: Fri, 25 Oct 2024 09:34:22 -0400
Subject: [PATCH] Allow the scripts to be run standalone. (#87)

* Allow the scripts to be run standalone.

* pytest corrections for schema updates.

* Workflow installation works without `pip install -e .`.

* Completed conversion to a module. Corrected imports throughout. Allowed imports from the CWD.
---
 .github/workflows/test.yml                |  1 -
 README.md                                 |  8 +-------
 src/clinical_ETL.egg-info/PKG-INFO        |  2 +-
 src/clinical_etl/CSVConvert.py            | 12 ++++--------
 src/clinical_etl/__init__.py              |  3 +++
 src/clinical_etl/generate_mapping_docs.py |  6 ++++++
 src/clinical_etl/genomicschema.py         |  2 +-
 src/clinical_etl/mohschemav2.py           |  2 +-
 src/clinical_etl/mohschemav3.py           |  2 +-
 src/clinical_etl/validate_coverage.py     |  9 +++------
 tests/raw_data/PrimaryDiagnosis.csv       |  8 ++++----
 update_moh_template.sh                    |  0
 12 files changed, 25 insertions(+), 30 deletions(-)
 mode change 100644 => 100755 update_moh_template.sh

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1c3cf2e1..5156f5a5 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -19,7 +19,6 @@ jobs:
       - name: Install dependencies
         run: |
           pip install -r requirements.txt
-          python -m pip install -e .
       - name: Test with pytest
         run: |
           pytest
diff --git a/README.md b/README.md
index f5afbf3f..91b980a6 100644
--- a/README.md
+++ b/README.md
@@ -45,12 +45,6 @@ Install the repo's requirements in your virtual environment
 pip install -r requirements.txt
 ```
 
->[!NOTE]
-> If Python can't find the `clinical_etl` module when running `CSVConvert`, install the depencency manually:
-> ```
-> pip install -e clinical_ETL_code/
-> ```
-
 Before running the script, you will need to have your input files, this will be clinical data in a tabular format (`xlsx`/`csv`) that can be read into program and a cohort directory containing the files that define the schema and mapping configurations.
 
 ### Input file/s format
@@ -65,7 +59,7 @@ If you are working with exports from RedCap, the sample files in the [`sample_in
 
 ### Setting up a cohort directory
 
-For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are:
+For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `clinical_ETL_data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are:
 
 * a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation
 * a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`)
diff --git a/src/clinical_ETL.egg-info/PKG-INFO b/src/clinical_ETL.egg-info/PKG-INFO
index fad2ec56..415ea0b5 100644
--- a/src/clinical_ETL.egg-info/PKG-INFO
+++ b/src/clinical_ETL.egg-info/PKG-INFO
@@ -84,7 +84,7 @@ If you are working with exports from RedCap, the sample files in the [`sample_in
 
 ### Setting up a cohort directory
 
-For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are:
+For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `clinical_ETL_data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are:
 
 * a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation
 * a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`)
diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py
index 3adc2332..73eac929 100644
--- a/src/clinical_etl/CSVConvert.py
+++ b/src/clinical_etl/CSVConvert.py
@@ -12,11 +12,7 @@
 import yaml
 import argparse
 from tqdm import tqdm
-from clinical_etl import mappings
-# Include clinical_etl parent directory in the module search path.
-current_dir = os.path.dirname(os.path.abspath(__file__))
-parent_dir = os.path.dirname(current_dir)
-sys.path.append(parent_dir)
+import mappings
 
 
 def verbose_print(message):
@@ -277,7 +273,7 @@ def eval_mapping(node_name, rownum):
     """
     verbose_print(f"  Evaluating {mappings.IDENTIFIER}: {node_name}")
     if "mappings" not in mappings.MODULES:
-        mappings.MODULES["mappings"] = importlib.import_module("clinical_etl.mappings")
+        mappings.MODULES["mappings"] = importlib.import_module("mappings")
     modulename = "mappings"
 
     method, parameters = parse_mapping_function(node_name)
@@ -596,7 +592,7 @@ def load_manifest(manifest_file):
 
     # programatically load schema class based on manifest value:
     # schema class definition will be in a file named schema_class.lower()
-    schema_mod = importlib.import_module(f"clinical_etl.{schema_class.lower()}")
+    schema_mod = importlib.import_module(f"{schema_class.lower()}")
     schema = getattr(schema_mod, schema_class)(manifest["schema"])
     if schema.json_schema is None:
         sys.exit(f"Could not read an openapi schema at {manifest['schema']};\n"
@@ -633,7 +629,7 @@ def load_manifest(manifest_file):
                     f"{manifest_dir} and has the correct name.\n---")
                 sys.exit(e)
     # mappings is a standard module: add it
-    mappings.MODULES["mappings"] = importlib.import_module("clinical_etl.mappings")
+    mappings.MODULES["mappings"] = importlib.import_module("mappings")
     return result
 
 
diff --git a/src/clinical_etl/__init__.py b/src/clinical_etl/__init__.py
index e69de29b..8cff63fd 100644
--- a/src/clinical_etl/__init__.py
+++ b/src/clinical_etl/__init__.py
@@ -0,0 +1,3 @@
+# Allows relative imports from current directory to work.
+import os, sys
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
\ No newline at end of file
diff --git a/src/clinical_etl/generate_mapping_docs.py b/src/clinical_etl/generate_mapping_docs.py
index c0c7b19c..4aef8628 100644
--- a/src/clinical_etl/generate_mapping_docs.py
+++ b/src/clinical_etl/generate_mapping_docs.py
@@ -1,3 +1,9 @@
+# Updates the ../../mapping_functions.md
+# Prior to running, set the PYTHONPATH for use by the subprocess with:
+# export PYTHONPATH="$PWD"
+# Then run:
+# python generate_mapping_docs.py
+
 import subprocess
 
 
diff --git a/src/clinical_etl/genomicschema.py b/src/clinical_etl/genomicschema.py
index 9a4e5488..12ea5056 100644
--- a/src/clinical_etl/genomicschema.py
+++ b/src/clinical_etl/genomicschema.py
@@ -1,6 +1,6 @@
 import json
 import dateparser
-from clinical_etl.schema import BaseSchema, ValidationError
+from schema import BaseSchema, ValidationError
 
 
 """
diff --git a/src/clinical_etl/mohschemav2.py b/src/clinical_etl/mohschemav2.py
index 71733b73..0847e4b5 100644
--- a/src/clinical_etl/mohschemav2.py
+++ b/src/clinical_etl/mohschemav2.py
@@ -1,6 +1,6 @@
 import json
 import dateparser
-from clinical_etl.schema import BaseSchema, ValidationError
+from schema import BaseSchema, ValidationError
 
 
 """
diff --git a/src/clinical_etl/mohschemav3.py b/src/clinical_etl/mohschemav3.py
index ff0a65b7..71f33eff 100644
--- a/src/clinical_etl/mohschemav3.py
+++ b/src/clinical_etl/mohschemav3.py
@@ -1,6 +1,6 @@
 import json
 import dateparser
-from clinical_etl.schema import BaseSchema, ValidationError
+from schema import BaseSchema, ValidationError
 
 
 """
diff --git a/src/clinical_etl/validate_coverage.py b/src/clinical_etl/validate_coverage.py
index 0d4d67c9..0626eff6 100644
--- a/src/clinical_etl/validate_coverage.py
+++ b/src/clinical_etl/validate_coverage.py
@@ -4,10 +4,6 @@
 import mappings
 import importlib.util
 import os
-# Include clinical_etl parent directory in the module search path for a later import.
-current_dir = os.path.dirname(os.path.abspath(__file__))
-parent_dir = os.path.dirname(current_dir)
-sys.path.append(parent_dir)
 # from jsoncomparison import Compare
 # from copy import deepcopy
 # import yaml
@@ -15,6 +11,8 @@
 # import os
 # import re
 # import CSVConvert
+import mohschemav2
+import mohschemav3
 
 
 def parse_args():
@@ -215,8 +213,7 @@ def validate_coverage(map_json, verbose=False):
     schema_class = "MoHSchemaV3"
     if "schema_class" in map_json:
         schema_class = map_json["schema_class"]
-    schema_mod = importlib.import_module(f"clinical_etl.{schema_class.lower()}")
-    schema = getattr(schema_mod, schema_class)(map_json["openapi_url"])
+    schema = getattr(eval(schema_class.lower()), schema_class)(map_json["openapi_url"])
 
     if schema.json_schema is None:
         sys.exit(f"Did not find an openapi schema at {map_json['openapi_url']}; please check the 'openapi_url' in the map json file.")
diff --git a/tests/raw_data/PrimaryDiagnosis.csv b/tests/raw_data/PrimaryDiagnosis.csv
index 4fc79d6e..03cdefcc 100644
--- a/tests/raw_data/PrimaryDiagnosis.csv
+++ b/tests/raw_data/PrimaryDiagnosis.csv
@@ -1,9 +1,9 @@
 submitter_donor_id, primary_site, submitter_primary_diagnosis_id, date_of_diagnosis, cancer_type_code, basis_of_diagnosis, clinical_tumour_staging_system, clinical_t_category, clinical_n_category, clinical_m_category, clinical_stage_group, laterality, pathological_t_category, pathological_n_category, pathological_m_category, pathological_stage_group
 DONOR_1,Esophagus,PD_1,1/1/2018,C43.1,Cytology,International Neuroblastoma Staging System,,,,Stage 1,Left,T3e,N1,MX,
 DONOR_2,Eye and adnexa,PD_2,1/3/2020,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ
-DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC 7th edition,T0,N0,M1a,,Left,,,,Stage IIIA
-DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC 7th edition,T0,N0,M1a,,Left,,,,Stage IIIB
-DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (RISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS
-DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (RISS),T1,N0a,M0,,Left,,,,Stage IIBES
+DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIA
+DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB
+DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS
+DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (R-ISS),T1,N0a,M0,,Left,,,,Stage IIBES
 DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2018,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB
 DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,,
diff --git a/update_moh_template.sh b/update_moh_template.sh
old mode 100644
new mode 100755