From 3ae6c6f3b069f394482adcee5e9eadac6745a8d4 Mon Sep 17 00:00:00 2001 From: DavidBrownlee <57147680+DavidBrownlee@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:34:22 -0400 Subject: [PATCH] Allows to be run as a stand alone. (#87) * Allows to be run as a stand alone. * pytest corrections for schema updates. * workflow installation works without the pip install -e. * Completed conversion to a module. Corrected imports throughout. Allow imports from CWD. --- .github/workflows/test.yml | 1 - README.md | 8 +------- src/clinical_ETL.egg-info/PKG-INFO | 2 +- src/clinical_etl/CSVConvert.py | 12 ++++-------- src/clinical_etl/__init__.py | 3 +++ src/clinical_etl/generate_mapping_docs.py | 6 ++++++ src/clinical_etl/genomicschema.py | 2 +- src/clinical_etl/mohschemav2.py | 2 +- src/clinical_etl/mohschemav3.py | 2 +- src/clinical_etl/validate_coverage.py | 9 +++------ tests/raw_data/PrimaryDiagnosis.csv | 8 ++++---- update_moh_template.sh | 0 12 files changed, 25 insertions(+), 30 deletions(-) mode change 100644 => 100755 update_moh_template.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1c3cf2e1..5156f5a5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,7 +19,6 @@ jobs: - name: Install dependencies run: | pip install -r requirements.txt - python -m pip install -e . 
- name: Test with pytest run: | pytest diff --git a/README.md b/README.md index f5afbf3f..91b980a6 100644 --- a/README.md +++ b/README.md @@ -45,12 +45,6 @@ Install the repo's requirements in your virtual environment pip install -r requirements.txt ``` ->[!NOTE] -> If Python can't find the `clinical_etl` module when running `CSVConvert`, install the depencency manually: -> ``` -> pip install -e clinical_ETL_code/ -> ``` - Before running the script, you will need to have your input files, this will be clinical data in a tabular format (`xlsx`/`csv`) that can be read into program and a cohort directory containing the files that define the schema and mapping configurations. ### Input file/s format @@ -65,7 +59,7 @@ If you are working with exports from RedCap, the sample files in the [`sample_in ### Setting up a cohort directory -For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are: +For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `clinical_ETL_data` repository. 
This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are: * a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation * a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`) diff --git a/src/clinical_ETL.egg-info/PKG-INFO b/src/clinical_ETL.egg-info/PKG-INFO index fad2ec56..415ea0b5 100644 --- a/src/clinical_ETL.egg-info/PKG-INFO +++ b/src/clinical_ETL.egg-info/PKG-INFO @@ -84,7 +84,7 @@ If you are working with exports from RedCap, the sample files in the [`sample_in ### Setting up a cohort directory -For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are: +For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `clinical_ETL_data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are: * a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation * a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`) diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py index 3adc2332..73eac929 100644 --- a/src/clinical_etl/CSVConvert.py +++ b/src/clinical_etl/CSVConvert.py @@ -12,11 +12,7 @@ import yaml import argparse from tqdm import tqdm -from clinical_etl import mappings -# Include clinical_etl parent directory in the module search path. 
-current_dir = os.path.dirname(os.path.abspath(__file__)) -parent_dir = os.path.dirname(current_dir) -sys.path.append(parent_dir) +import mappings def verbose_print(message): @@ -277,7 +273,7 @@ def eval_mapping(node_name, rownum): """ verbose_print(f" Evaluating {mappings.IDENTIFIER}: {node_name}") if "mappings" not in mappings.MODULES: - mappings.MODULES["mappings"] = importlib.import_module("clinical_etl.mappings") + mappings.MODULES["mappings"] = importlib.import_module("mappings") modulename = "mappings" method, parameters = parse_mapping_function(node_name) @@ -596,7 +592,7 @@ def load_manifest(manifest_file): # programatically load schema class based on manifest value: # schema class definition will be in a file named schema_class.lower() - schema_mod = importlib.import_module(f"clinical_etl.{schema_class.lower()}") + schema_mod = importlib.import_module(f"{schema_class.lower()}") schema = getattr(schema_mod, schema_class)(manifest["schema"]) if schema.json_schema is None: sys.exit(f"Could not read an openapi schema at {manifest['schema']};\n" @@ -633,7 +629,7 @@ def load_manifest(manifest_file): f"{manifest_dir} and has the correct name.\n---") sys.exit(e) # mappings is a standard module: add it - mappings.MODULES["mappings"] = importlib.import_module("clinical_etl.mappings") + mappings.MODULES["mappings"] = importlib.import_module("mappings") return result diff --git a/src/clinical_etl/__init__.py b/src/clinical_etl/__init__.py index e69de29b..8cff63fd 100644 --- a/src/clinical_etl/__init__.py +++ b/src/clinical_etl/__init__.py @@ -0,0 +1,3 @@ +# Adds this package's directory to sys.path so sibling modules can be imported by bare name (e.g. `import mappings`). 
+import os, sys +sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/src/clinical_etl/generate_mapping_docs.py b/src/clinical_etl/generate_mapping_docs.py index c0c7b19c..4aef8628 100644 --- a/src/clinical_etl/generate_mapping_docs.py +++ b/src/clinical_etl/generate_mapping_docs.py @@ -1,3 +1,9 @@ +# Updates the ../../mapping_functions.md +# Prior to running, set the PYTHONPATH for use by the subprocess with: +# export PYTHONPATH="$PWD" +# Then run: +# python generate_mapping_docs.py + import subprocess diff --git a/src/clinical_etl/genomicschema.py b/src/clinical_etl/genomicschema.py index 9a4e5488..12ea5056 100644 --- a/src/clinical_etl/genomicschema.py +++ b/src/clinical_etl/genomicschema.py @@ -1,6 +1,6 @@ import json import dateparser -from clinical_etl.schema import BaseSchema, ValidationError +from schema import BaseSchema, ValidationError """ diff --git a/src/clinical_etl/mohschemav2.py b/src/clinical_etl/mohschemav2.py index 71733b73..0847e4b5 100644 --- a/src/clinical_etl/mohschemav2.py +++ b/src/clinical_etl/mohschemav2.py @@ -1,6 +1,6 @@ import json import dateparser -from clinical_etl.schema import BaseSchema, ValidationError +from schema import BaseSchema, ValidationError """ diff --git a/src/clinical_etl/mohschemav3.py b/src/clinical_etl/mohschemav3.py index ff0a65b7..71f33eff 100644 --- a/src/clinical_etl/mohschemav3.py +++ b/src/clinical_etl/mohschemav3.py @@ -1,6 +1,6 @@ import json import dateparser -from clinical_etl.schema import BaseSchema, ValidationError +from schema import BaseSchema, ValidationError """ diff --git a/src/clinical_etl/validate_coverage.py b/src/clinical_etl/validate_coverage.py index 0d4d67c9..0626eff6 100644 --- a/src/clinical_etl/validate_coverage.py +++ b/src/clinical_etl/validate_coverage.py @@ -4,10 +4,6 @@ import mappings import importlib.util import os -# Include clinical_etl parent directory in the module search path for a later import. 
-current_dir = os.path.dirname(os.path.abspath(__file__)) -parent_dir = os.path.dirname(current_dir) -sys.path.append(parent_dir) # from jsoncomparison import Compare # from copy import deepcopy # import yaml @@ -15,6 +11,8 @@ # import os # import re # import CSVConvert +import mohschemav2 +import mohschemav3 def parse_args(): @@ -215,8 +213,7 @@ def validate_coverage(map_json, verbose=False): schema_class = "MoHSchemaV3" if "schema_class" in map_json: schema_class = map_json["schema_class"] - schema_mod = importlib.import_module(f"clinical_etl.{schema_class.lower()}") - schema = getattr(schema_mod, schema_class)(map_json["openapi_url"]) + schema = getattr(eval(schema_class.lower()), schema_class)(map_json["openapi_url"]) if schema.json_schema is None: sys.exit(f"Did not find an openapi schema at {map_json['openapi_url']}; please check the 'openapi_url' in the map json file.") diff --git a/tests/raw_data/PrimaryDiagnosis.csv b/tests/raw_data/PrimaryDiagnosis.csv index 4fc79d6e..03cdefcc 100644 --- a/tests/raw_data/PrimaryDiagnosis.csv +++ b/tests/raw_data/PrimaryDiagnosis.csv @@ -1,9 +1,9 @@ submitter_donor_id, primary_site, submitter_primary_diagnosis_id, date_of_diagnosis, cancer_type_code, basis_of_diagnosis, clinical_tumour_staging_system, clinical_t_category, clinical_n_category, clinical_m_category, clinical_stage_group, laterality, pathological_t_category, pathological_n_category, pathological_m_category, pathological_stage_group DONOR_1,Esophagus,PD_1,1/1/2018,C43.1,Cytology,International Neuroblastoma Staging System,,,,Stage 1,Left,T3e,N1,MX, DONOR_2,Eye and adnexa,PD_2,1/3/2020,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ -DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC 7th edition,T0,N0,M1a,,Left,,,,Stage IIIA -DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC 7th edition,T0,N0,M1a,,Left,,,,Stage IIIB -DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system 
(RISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS -DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (RISS),T1,N0a,M0,,Left,,,,Stage IIBES +DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIA +DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB +DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS +DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (R-ISS),T1,N0a,M0,,Left,,,,Stage IIBES DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2018,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,, diff --git a/update_moh_template.sh b/update_moh_template.sh old mode 100644 new mode 100755