diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1c3cf2e..5156f5a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,7 +19,6 @@ jobs: - name: Install dependencies run: | pip install -r requirements.txt - python -m pip install -e . - name: Test with pytest run: | pytest diff --git a/README.md b/README.md index f5afbf3..91b980a 100644 --- a/README.md +++ b/README.md @@ -45,12 +45,6 @@ Install the repo's requirements in your virtual environment pip install -r requirements.txt ``` ->[!NOTE] -> If Python can't find the `clinical_etl` module when running `CSVConvert`, install the depencency manually: -> ``` -> pip install -e clinical_ETL_code/ -> ``` - Before running the script, you will need to have your input files, this will be clinical data in a tabular format (`xlsx`/`csv`) that can be read into program and a cohort directory containing the files that define the schema and mapping configurations. ### Input file/s format @@ -65,7 +59,7 @@ If you are working with exports from RedCap, the sample files in the [`sample_in ### Setting up a cohort directory -For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are: +For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `clinical_ETL_data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are: * a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation * a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`) diff --git a/src/clinical_ETL.egg-info/PKG-INFO b/src/clinical_ETL.egg-info/PKG-INFO index fad2ec5..415ea0b 100644 --- a/src/clinical_ETL.egg-info/PKG-INFO +++ b/src/clinical_ETL.egg-info/PKG-INFO @@ -84,7 +84,7 @@ If you are working with exports from RedCap, the sample files in the [`sample_in ### Setting up a cohort directory -For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are: +For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `clinical_ETL_data` repository. This cohort directory should contain the same files as shown in the [`sample_inputs/generic_example`](sample_inputs/generic_example) directory, which are: * a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation * a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`) diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py index 3adc233..73eac92 100644 --- a/src/clinical_etl/CSVConvert.py +++ b/src/clinical_etl/CSVConvert.py @@ -12,11 +12,7 @@ import yaml import argparse from tqdm import tqdm -from clinical_etl import mappings -# Include clinical_etl parent directory in the module search path. -current_dir = os.path.dirname(os.path.abspath(__file__)) -parent_dir = os.path.dirname(current_dir) -sys.path.append(parent_dir) +import mappings def verbose_print(message): @@ -277,7 +273,7 @@ def eval_mapping(node_name, rownum): """ verbose_print(f" Evaluating {mappings.IDENTIFIER}: {node_name}") if "mappings" not in mappings.MODULES: - mappings.MODULES["mappings"] = importlib.import_module("clinical_etl.mappings") + mappings.MODULES["mappings"] = importlib.import_module("mappings") modulename = "mappings" method, parameters = parse_mapping_function(node_name) @@ -596,7 +592,7 @@ def load_manifest(manifest_file): # programatically load schema class based on manifest value: # schema class definition will be in a file named schema_class.lower() - schema_mod = importlib.import_module(f"clinical_etl.{schema_class.lower()}") + schema_mod = importlib.import_module(f"{schema_class.lower()}") schema = getattr(schema_mod, schema_class)(manifest["schema"]) if schema.json_schema is None: sys.exit(f"Could not read an openapi schema at {manifest['schema']};\n" @@ -633,7 +629,7 @@ def load_manifest(manifest_file): f"{manifest_dir} and has the correct name.\n---") sys.exit(e) # mappings is a standard module: add it - mappings.MODULES["mappings"] = importlib.import_module("clinical_etl.mappings") + mappings.MODULES["mappings"] = importlib.import_module("mappings") return result diff --git a/src/clinical_etl/__init__.py b/src/clinical_etl/__init__.py index e69de29..8cff63f 100644 --- a/src/clinical_etl/__init__.py +++ b/src/clinical_etl/__init__.py @@ -0,0 +1,3 @@ +# Allows relative imports from current directory to work. +import os, sys +sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/src/clinical_etl/generate_mapping_docs.py b/src/clinical_etl/generate_mapping_docs.py index c0c7b19..4aef862 100644 --- a/src/clinical_etl/generate_mapping_docs.py +++ b/src/clinical_etl/generate_mapping_docs.py @@ -1,3 +1,9 @@ +# Updates the ../../mapping_functions.md +# Prior to running, set the PYTHONPATH for use by the subprocess with: +# export PYTHONPATH="$PWD" +# Then run: +# python generate_mapping_docs.py + import subprocess diff --git a/src/clinical_etl/genomicschema.py b/src/clinical_etl/genomicschema.py index 9a4e548..12ea505 100644 --- a/src/clinical_etl/genomicschema.py +++ b/src/clinical_etl/genomicschema.py @@ -1,6 +1,6 @@ import json import dateparser -from clinical_etl.schema import BaseSchema, ValidationError +from schema import BaseSchema, ValidationError """ diff --git a/src/clinical_etl/mohschemav2.py b/src/clinical_etl/mohschemav2.py index 71733b7..0847e4b 100644 --- a/src/clinical_etl/mohschemav2.py +++ b/src/clinical_etl/mohschemav2.py @@ -1,6 +1,6 @@ import json import dateparser -from clinical_etl.schema import BaseSchema, ValidationError +from schema import BaseSchema, ValidationError """ diff --git a/src/clinical_etl/mohschemav3.py b/src/clinical_etl/mohschemav3.py index ff0a65b..71f33ef 100644 --- a/src/clinical_etl/mohschemav3.py +++ b/src/clinical_etl/mohschemav3.py @@ -1,6 +1,6 @@ import json import dateparser -from clinical_etl.schema import BaseSchema, ValidationError +from schema import BaseSchema, ValidationError """ diff --git a/src/clinical_etl/validate_coverage.py b/src/clinical_etl/validate_coverage.py index 0d4d67c..0626eff 100644 --- a/src/clinical_etl/validate_coverage.py +++ b/src/clinical_etl/validate_coverage.py @@ -4,10 +4,6 @@ import mappings import importlib.util import os -# Include clinical_etl parent directory in the module search path for a later import. -current_dir = os.path.dirname(os.path.abspath(__file__)) -parent_dir = os.path.dirname(current_dir) -sys.path.append(parent_dir) # from jsoncomparison import Compare # from copy import deepcopy # import yaml @@ -15,6 +11,8 @@ # import os # import re # import CSVConvert +import mohschemav2 +import mohschemav3 def parse_args(): @@ -215,8 +213,7 @@ def validate_coverage(map_json, verbose=False): schema_class = "MoHSchemaV3" if "schema_class" in map_json: schema_class = map_json["schema_class"] - schema_mod = importlib.import_module(f"clinical_etl.{schema_class.lower()}") - schema = getattr(schema_mod, schema_class)(map_json["openapi_url"]) + schema = getattr(eval(schema_class.lower()), schema_class)(map_json["openapi_url"]) if schema.json_schema is None: sys.exit(f"Did not find an openapi schema at {map_json['openapi_url']}; please check the 'openapi_url' in the map json file.") diff --git a/tests/raw_data/PrimaryDiagnosis.csv b/tests/raw_data/PrimaryDiagnosis.csv index 4fc79d6..03cdefc 100644 --- a/tests/raw_data/PrimaryDiagnosis.csv +++ b/tests/raw_data/PrimaryDiagnosis.csv @@ -1,9 +1,9 @@ submitter_donor_id, primary_site, submitter_primary_diagnosis_id, date_of_diagnosis, cancer_type_code, basis_of_diagnosis, clinical_tumour_staging_system, clinical_t_category, clinical_n_category, clinical_m_category, clinical_stage_group, laterality, pathological_t_category, pathological_n_category, pathological_m_category, pathological_stage_group DONOR_1,Esophagus,PD_1,1/1/2018,C43.1,Cytology,International Neuroblastoma Staging System,,,,Stage 1,Left,T3e,N1,MX, DONOR_2,Eye and adnexa,PD_2,1/3/2020,C04.9,Specific tumour markers,Rai staging system,,,,Stage 1A,Bilateral,,,,In situ -DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC 7th edition,T0,N0,M1a,,Left,,,,Stage IIIA -DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC 7th edition,T0,N0,M1a,,Left,,,,Stage IIIB -DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (RISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS -DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (RISS),T1,N0a,M0,,Left,,,,Stage IIBES +DONOR_3,Floor of mouth,PD_3,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIA +DONOR_3,Tongue,DUPLICATE_ID,1/5/2018,C43.9,Not available,AJCC cancer staging system,T0,N0,M1a,,Left,,,,Stage IIIB +DONOR_4,,PD_4,1/5/2018,C64.9,Death certificate only,Revised International staging system (R-ISS),,,,Stage 1B,"Unilateral, side not specified",,,,Stage IIS +DONOR_5,Gum,PD_5,1/3/2020,C64.9,,Revised International staging system (R-ISS),T1,N0a,M0,,Left,,,,Stage IIBES DONOR_6,"Heart, mediastinum, and pleura",PD_6,1/5/2018,C02.2,Specific tumour markers,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified",,,,Stage IIIB DONOR_2,Floor of mouth,PD_2_1,6/3/2018,C43.9,Histology of a primary tumour,Binet staging system,,,,Stage B,Bilateral,,,, diff --git a/update_moh_template.sh b/update_moh_template.sh old mode 100644 new mode 100755