From 042028402c258e6fd7b198c6d643e001deded7c0 Mon Sep 17 00:00:00 2001 From: Marion Date: Wed, 22 Nov 2023 11:20:52 -0800 Subject: [PATCH] Documentation, error handling and code improvements for usability (#42) * add to docs * more docs updates * add pycharm files to gitignore * error handling and openapi validation CSVConvert will exit if it encounters a fatal error from its inputs OpenAPI schema is validated using openapi-spec-validator * remove extra excepts * improve error reporting in validate_coverage * correct lines in test2moh and moh_template * tiny typo fix * improve error handling csvconvert * improve readability of README * format manifest info as table * add manifest link * doc additions and reorg * fix link * add links, fix typos * read template with csv reader Use proper csv reader to allow for quoted csvs to be read correctly * add docstrings and documentation * update templates * add pydoc * reverse float method change * mappings docstrings * add lazydocs * add functions index * change to first level heading * add automated docs note * add data_values dict info * switch to pdoc3 * minor changes * remove unused args and imports * revert method in test csv * updates based on PR review, thanks @daisieh --- .gitignore | 3 + CSVConvert.py | 151 +++++++++++++++--------- README.md | 208 +++++++++++++++++++++------------ generate_mapping_docs.py | 25 ++++ mapping_functions.md | 194 +++++++++++++++++++++++++++++- mappings.py | 174 +++++++++++++++++++++++---- moh_template.csv | 2 +- requirements.txt | 2 + sample_inputs/manifest.yml | 4 +- sample_inputs/moh_template.csv | 183 +++++++++++++++++++++++++++++ sample_inputs/new_cohort.py | 17 +++ schema.py | 19 +-- test_data/moh_diffs.txt | 4 +- test_data/test2moh.csv | 8 +- update_moh_template.sh | 2 +- validate_coverage.py | 47 ++++---- 16 files changed, 846 insertions(+), 197 deletions(-) create mode 100644 generate_mapping_docs.py create mode 100644 sample_inputs/moh_template.csv create mode 100644 sample_inputs/new_cohort.py diff --git a/.gitignore b/.gitignore index 3186d08..6398542 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ __pycache__/* .DS_Store *.pyc .venv/ +_local +.idea +.~lock* \ No newline at end of file diff --git a/CSVConvert.py b/CSVConvert.py index b83a7fb..bbbd28d 100644 --- a/CSVConvert.py +++ b/CSVConvert.py @@ -7,6 +7,7 @@ import mappings import os import pandas +import csv import re import sys import yaml @@ -22,14 +23,13 @@ def verbose_print(message): def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--input', type=str, required = True, help="Path to either an xlsx file or a directory of csv files for ingest") + parser.add_argument('--input', type=str, required=True, help="Path to either an xlsx file or a directory of csv files for ingest") # parser.add_argument('--api_key', type=str, help="BioPortal API key found in BioPortal personal account settings") # parser.add_argument('--email', type=str, help="Contact email to access NCBI clinvar API. Required by Entrez") - #parser.add_argument('--schema', type=str, help="Schema to use for template; default is mCodePacket") - parser.add_argument('--manifest', type=str, required = True, help="Path to a manifest file describing the mapping." - " See README for more information") + # parser.add_argument('--schema', type=str, help="Schema to use for template; default is mCodePacket") + parser.add_argument('--manifest', type=str, required=True, help="Path to a manifest file describing the mapping. 
See README for more information") parser.add_argument('--test', action="store_true", help="Use exact template specified in manifest: do not remove extra lines") - parser.add_argument('--verbose', '--v', action="store_true", help="Print extra information") + parser.add_argument('--verbose', '--v', action="store_true", help="Print extra information, useful for debugging and understanding how the code runs.") args = parser.parse_args() return args @@ -102,7 +102,7 @@ def map_indexed_scaffold(node, line): # only process if there is data for this IDENTIFIER in the index_sheet if mappings.IDENTIFIER in mappings.INDEXED_DATA['data'][index_sheet]: if index_values is not None: - # # add this new indexed value into the indexed_data table + # add this new indexed value into the indexed_data table mappings.INDEXED_DATA['data'][index_sheet][mappings.IDENTIFIER][index_field] = index_values top_frame = mappings._peek_at_top_of_stack() @@ -177,7 +177,8 @@ def parse_sheet_from_field(param): return None, None if param in mappings.INDEXED_DATA["columns"]: if len(mappings.INDEXED_DATA["columns"][param]) > 1: - mappings._warn(f"There are multiple sheets that contain column name {param}. Please specify the exact sheet in the mapping.") + mappings._warn( + f"There are multiple sheets that contain column name {param}. Please specify the exact sheet in the mapping.") return param, mappings.INDEXED_DATA["columns"][param][0] return None, None @@ -249,6 +250,7 @@ def populate_data_for_params(params, rownum): data_values[param][sheet] = [] return data_values + def eval_mapping(node_name, rownum): """ Given the identifier field, the data, and a particular schema node, evaluate @@ -305,17 +307,17 @@ def ingest_raw_data(input_path): def process_data(raw_csv_dfs): - """Takes a set of raw dataframes with a common identifier and merges into a JSON data structure.""" + """Takes a set of raw dataframes with a common identifier and merges into a JSON data structure.""" final_merged = {} cols_index = {} individuals = [] for page in raw_csv_dfs.keys(): print(f"Processing sheet {page}...") - df = raw_csv_dfs[page].dropna(axis='index', how='all')\ - .dropna(axis='columns', how='all')\ - .applymap(str)\ - .applymap(lambda x: x.strip())\ + df = raw_csv_dfs[page].dropna(axis='index', how='all') \ + .dropna(axis='columns', how='all') \ + .applymap(str) \ + .applymap(lambda x: x.strip()) \ .drop_duplicates() # drop absolutely identical lines # Sort by identifier and then tag any dups @@ -393,24 +395,35 @@ def process_mapping(line, test=False): return value, elems return line, None + def read_mapping_template(mapping_path): """Given a path to a mapping template file, read the lines and return them as an array.""" template_lines = [] try: with open(mapping_path, 'r') as f: - lines = f.readlines() + lines = csv.reader(f) for line in lines: - if line.startswith("#"): + if len(line) == 0: + continue + if line[0].startswith("#"): continue - if re.match(r"^\s*$", line): + joined_line = '' + for value in line: + if value.strip() == '': + continue + else: + joined_line = joined_line + value.strip() + ',' + if joined_line == '': continue - template_lines.append(line) + else: + template_lines.append(joined_line.rstrip(',')) except FileNotFoundError: - print(f"Mapping template {mapping_path} not found") - + sys.exit(f"Mapping template {mapping_path} not found. 
Ensure your mapping template is in the directory with the" + f" manifest.yml and is specified correctly.") return template_lines + def create_scaffold_from_template(lines, test=False): """Given lines from a template mapping csv file, create a scaffold mapping dict.""" @@ -421,7 +434,7 @@ def create_scaffold_from_template(lines, test=False): # this line is a comment, skip it continue if re.match(r"^\s*$", line): - #print(f"skipping {line}") + # print(f"skipping {line}") continue value, elems = process_mapping(line, test) # elems are the first column in the csv, the parts of the schema field, @@ -435,16 +448,16 @@ def create_scaffold_from_template(lines, test=False): # not seen yet, add empty list props[x] = [] if len(elems) > 0: - tempvar=(".".join(elems)+","+value) - #print(f"Appending tempvar {tempvar} to props for {x} : {line}") - props[x].append(".".join(elems)+","+value) + tempvar = (".".join(elems) + "," + value) + # print(f"Appending tempvar {tempvar} to props for {x} : {line}") + props[x].append(".".join(elems) + "," + value) elif value != "": - #print(f"Appending value {value} to props for {x} : {line}") + # print(f"Appending value {value} to props for {x} : {line}") props[x].append(value) else: - #print(f"How do we get here, {x}, adding empty list : {line}") + # print(f"How do we get here, {x}, adding empty list : {line}") props[x] = [x] - #print(f"Now {props[x]} for {x}") + # print(f"Now {props[x]} for {x}") else: return line @@ -458,7 +471,7 @@ def create_scaffold_from_template(lines, test=False): # empty_keys.append(key) # for key in empty_keys: # props.pop(key) - #print(f"Cleared empty keys {empty_keys}") + # print(f"Cleared empty keys {empty_keys}") for key in props.keys(): if key == "INDEX": # this maps to a list @@ -483,8 +496,8 @@ def scan_template_for_duplicate_mappings(template_lines): if val not in field_map: field_map[val] = [] field_map[val].append(template_line) - # else: - # print(f"WARNING: No parameter '{val}' exists") + # else: + # print(f"WARNING: No parameter '{val}' exists") data_values = list(field_map.keys()) for dv in data_values: indices = [] @@ -506,21 +519,37 @@ def scan_template_for_duplicate_mappings(template_lines): indexed_on = [] for i in field_map[dv]: bits = i.split(".") - indexed_on.append(".".join(bits[len(bits)-2:len(bits)-1])) + indexed_on.append(".".join(bits[len(bits) - 2:len(bits) - 1])) uniques = list(set(indexed_on)) - for u in range(0,len(uniques)): + for u in range(0, len(uniques)): count = 0 - for i in range(0,len(indexed_on)): + for i in range(0, len(indexed_on)): if uniques[u] == indexed_on[i]: count += 1 if count > 1: msg = f"ERROR: Key {dv} can only be used to index one line. 
If one of these duplicates does not have an index, use {{indexed_on(NONE)}}:\n" - for i in range(0,len(indexed_on)): + for i in range(0, len(indexed_on)): msg += f" {field_map[dv][i]}\n" raise Exception(msg) - #print(json.dumps(field_map, indent=4)) + # print(json.dumps(field_map, indent=4)) + +def check_for_sheet_inconsistencies(template_sheets, csv_sheets): + nl = "\n" + verbose_print(f"Expected sheet/csv names based on template_csv: {nl}{nl.join(template_sheets)}{nl}") + verbose_print(f"Expected sheet/csv names based on input files:{nl}{nl.join(csv_sheets)}{nl}") + template_csv_diff = template_sheets.difference(csv_sheets) + csv_template_diff = csv_sheets.difference(template_sheets) + if len(template_csv_diff) > 0: + # Print a warning if verbose enabled, it is possible that the template sheet has more than is required + print("WARNING: The following csv/sheet names are in the mapping template but were not found in the input sheets" + "/csvs:" + nl + nl.join(template_csv_diff) + nl + + "If this is an error please correct it as it may result in errors with mapping your data." + nl) + if len(csv_template_diff) > 0: + # Exit here because if we can't find a mapping for a field we can't properly map the inputs + sys.exit("The following sheet names are in the input csvs but not found in the mapping template:" + nl + + nl.join(csv_template_diff) + nl + "Please correct the sheets above and try again.") def load_manifest(manifest_file): @@ -528,11 +557,15 @@ def load_manifest(manifest_file): identifier = None schema = "mcode" mapping_path = None - with open(manifest_file, 'r') as f: - manifest = yaml.safe_load(f) - if manifest is None: - print("Manifest file needs to be in YAML format") - return + try: + with open(manifest_file, 'r') as f: + manifest = yaml.safe_load(f) + except yaml.YAMLError as e: + print(e) + sys.exit("Manifest file isn't a valid yaml, please fix the errors and try again.") + except FileNotFoundError as e: + print(e) + sys.exit(f"Manifest file not found at provided path: {manifest_file}") if "identifier" in manifest: identifier = manifest["identifier"] @@ -556,8 +589,10 @@ def load_manifest(manifest_file): sys.modules[mod] = mappings.MODULES[mod] spec.loader.exec_module(mappings.MODULES[mod]) except Exception as e: - print(e) - return + print( + f"---\nCould not find appropriate mapping functions at {mod_path}, ensure your mapping file is in " + f"{manifest_dir} and has the correct name.\n---") + sys.exit(e) # mappings is a standard module: add it mappings.MODULES["mappings"] = importlib.import_module("mappings") return { @@ -569,33 +604,31 @@ def load_manifest(manifest_file): def csv_convert(input_path, manifest_file, verbose=False): mappings.VERBOSE = verbose - # read manifest data manifest = load_manifest(manifest_file) mappings.IDENTIFIER_FIELD = manifest["identifier"] if mappings.IDENTIFIER_FIELD is None: - print("Need to specify what the main identifier column name as 'identifier' in the manifest file") - return + sys.exit("Need to specify what the main identifier column name is as 'identifier' in the manifest file, " + "see README for more details.") # read the schema (from the url specified in the manifest) and generate # a scaffold schema = MoHSchema(manifest["schema"]) - if schema is None: - print(f"Did not find an openapi schema at {url}; please check link") - return - - mapping_template = schema.template + if schema.json_schema is None: + sys.exit(f"Could not read an openapi schema at {manifest['schema']};\n" + f"please check the url in the manifest file links to a 
valid openAPI schema.") # read the mapping template (contains the mapping function for each # field) template_lines = read_mapping_template(manifest["mapping"]) - # # read the raw data + # read the raw data print("Reading raw data") raw_csv_dfs, mappings.OUTPUT_FILE = ingest_raw_data(input_path) if not raw_csv_dfs: - print(f"No ingestable files (csv or xlsx) were found at {input_path}") - return + sys.exit(f"No ingestable files (csv or xlsx) were found at {input_path}. Check path and try again.") + check_for_sheet_inconsistencies(set([re.findall(r"\((\w+)", x)[0] for x in template_lines]), + set(raw_csv_dfs.keys())) print("Indexing data") mappings.INDEXED_DATA = process_data(raw_csv_dfs) @@ -605,7 +638,8 @@ def csv_convert(input_path, manifest_file, verbose=False): # if verbose flag is set, warn if column name is present in multiple sheets: for col in mappings.INDEXED_DATA["columns"]: if col != mappings.IDENTIFIER_FIELD and len(mappings.INDEXED_DATA["columns"][col]) > 1: - mappings._warn(f"Column name {col} present in multiple sheets: {', '.join(mappings.INDEXED_DATA['columns'][col])}") + mappings._warn( + f"Column name {col} present in multiple sheets: {', '.join(mappings.INDEXED_DATA['columns'][col])}") # warn if any template lines map the same column to multiple lines: scan_template_for_duplicate_mappings(template_lines) @@ -613,8 +647,7 @@ def csv_convert(input_path, manifest_file, verbose=False): mapping_scaffold = create_scaffold_from_template(template_lines) if mapping_scaffold is None: - print("Could not create mapping scaffold. Make sure that the manifest specifies a valid csv template.") - return + sys.exit("Could not create mapping scaffold. Make sure that the manifest specifies a valid csv template.") packets = [] # for each identifier's row, make a packet @@ -628,7 +661,8 @@ def csv_convert(input_path, manifest_file, verbose=False): if mappings._pop_from_stack() is None: raise Exception(f"Stack popped too far!\n{mappings.IDENTIFIER_FIELD}: {mappings.IDENTIFIER}") if mappings._pop_from_stack() is not None: - raise Exception(f"Stack not empty\n{mappings.IDENTIFIER_FIELD}: {mappings.IDENTIFIER}\n {mappings.INDEX_STACK}") + raise Exception( + f"Stack not empty\n{mappings.IDENTIFIER_FIELD}: {mappings.IDENTIFIER}\n {mappings.INDEX_STACK}") with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f: json.dump(mappings.INDEXED_DATA, f, indent=4) @@ -639,7 +673,7 @@ def csv_convert(input_path, manifest_file, verbose=False): } if schema.katsu_sha is not None: result["katsu_sha"] = schema.katsu_sha - with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion + with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion json.dump(result, f, indent=4) # add validation data: @@ -647,11 +681,12 @@ def csv_convert(input_path, manifest_file, verbose=False): result["validation_errors"] = schema.validation_errors result["validation_warnings"] = schema.validation_warnings result["statistics"] = schema.statistics - with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion + with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion json.dump(result, f, indent=4) if len(result["validation_warnings"]) > 0: - print("\n\nWARNING: Your data is missing required data for the MoHCCN data model! The following problems were found:") + print( + "\n\nWARNING: Your data is missing required data for the MoHCCN data model! 
The following problems were found:") print("\n".join(result["validation_warnings"])) if len(result["validation_errors"]) > 0: print("\n\nWARNING: Your data is not valid for the MoHCCN data model! The following errors were found:") diff --git a/README.md b/README.md index 15039b6..ba3aa5c 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,117 @@ # clinical_ETL_code -This repository converts input csv files with clinical (phenotypic) data into a json aligned with a provided openapi schema. You can provide custom mapping functions to transform data in your input file before writing to the json. +This repository provides tools to convert input csv files with clinical (phenotypic) data into a json aligned with a provided openapi schema. You can provide custom mapping functions to transform data in your input file before writing to the json. Specifically, this code was designed to convert clinical data for the MOHCCN project into the packet format needed for ingest into CanDIG's clinical data service (katsu). -## Set-up & Installation +## CSVConvert +Most of the heavy lifting is done in the [`CSVConvert.py`](CSVConvert.py) script. See sections below for setting up the inputs and running the script. + +This script: +* reads a file (`.xlsx` or `.csv`) or a directory of files (`.csv`) +* reads a [template file](#mapping-template) that contains a list of fields and (if needed) a mapping function +* for each field for each patient, applies the mapping function to transform the raw data into permissible values against the provided schema +* exports the data into a json file(s) appropriate for ingest +* performs Validation and gives warning and error feedback for any data that does not meet the schema requirements + +### Environment set-up & Installation Prerequisites: - [Python 3.10+](https://www.python.org/) - [pip](https://github.com/pypa/pip/) +Set up and activate a [virtual environment](https://docs.python.org/3/tutorial/venv.html) using the python environment tool of your choice. For example using `venv` on linux/macOS systems +```commandline +python -m venv /path/to/new/virtual/environment +source /path/to/new/virtual/environment/bin/activate +``` +[See here for Windows instructions](https://realpython.com/python-virtual-environments-a-primer/) -## Running from the command line +Clone this repo and enter the repo directory +```commandline +git clone https://github.com/CanDIG/clinical_ETL_code.git +cd clinical_ETL_code +``` -Most of the heavy lifting is done in the CSVConvert.py script. See sections below for setting up the inputs. This script: -* reads an file (.xlsx or .csv) or a directory of files (csv) -* reads a template file that contains a list of fields and (if needed) a mapping function -* for each field for each patient, applies the mapping function to transform the raw data into valid model data -* exports the data into a json file(s) appropriate for ingest +Install the repo's requirements in your virtual environment +```commandline +pip install -r requirements.txt +``` + +Before running the script, you will need to have your input files, this will be clinical data in a tabular format (`xlsx`/`csv`) that can be read into program and a cohort directory containing the files that define the schema and mapping configurations. + +### Input file/s format + +The input for `CSVConvert` is either a single xlsx file, a single csv, or a directory of csvs that contain your clinical data. If providing a spreadsheet, there can be multiple sheets (usually one for each sub-schema). 
Examples of how csvs may look can be found in [test_data/raw_data](test_data/raw_data). + +All rows must contain identifiers that allow linkage between the objects in the schema, for example, a row that describes a Treatment must have a link to the Donor / Patient id for that Treatment. + +Data should be [tidy](https://r4ds.had.co.nz/tidy-data.html), with each variable in a separate column, each row representing an observation, and a single data entry in each cell. In the case of fields that can accept an array of values, the values within a cell should be delimited such that a mapping function can accurately return an array of permissible values. + +Depending on the format of your raw data, you may need to write an additional tidying script to pre-process. For example, the `ingest_redcap_data.py` converts the export format from redcap into a set of input csvs for `CSVConvert`. + +### Setting up a cohort directory + +For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. This cohort directory should contain the same files as shown in the `sample_inputs` directory, which are: + +* a [`manifest.yml`](#Manifest-file) file with configuration settings for the mapping and schema validation +* a [mapping template](#Mapping-template) csv that lists custom mappings for each field (based on `moh_template.csv`) +* (if needed) One or more python files that implement any cohort-specific mapping functions (See [mapping functions](mapping_functions.md) for detailed information) + +> [!IMPORTANT] +> If you are placing this directory under version control and the cohort is not sample / synthetic data, do not place raw or processed data files in this directory, to avoid any possibility of committing protected data. + +#### Manifest file +The `manifest.yml` file contains settings for the cohort mapping. There is a sample file in [`sample_inputs/manifest.yml`](sample_inputs/manifest.yml) with documentation and example inputs. The fields are: + +| field | description | +|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| description | A brief description of what mapping task this manifest is being used for | +| mapping | the mapping template csv file that lists the mappings for each field based on `moh_template.csv`, assumed to be in the same directory as the `manifest.yml` file | +| identifier | the unique identifier for the donor or root node | +| schema | a URL to the openapi schema file | +| functions | A list of one or more filenames containing additional mapping functions, can be omitted if not needed. Assumed to be in the same directory as the `manifest.yml` file | + +#### Mapping template + +You'll need to create a mapping template that defines the mapping between the fields in your input files and the fields in the target schema. It also defines what mapping functions (if any) should be used to transform the input data into the required format to pass validation under the target schema. + +Each line in the mapping template is composed of comma separated values with two components. The first value is an `element` or field from the target schema and the second value contains a suggested `mapping method` or function to map a field from an input sheet to a valid value for the identified `element`. 
Each `element`, shows the full object linking path to each field required by the model. These values should not be edited. + +If you are generating a mapping for the current CanDIG MoH model, you can use the pre-generated [`moh_template.csv`](moh_template.csv) file. This file is modified from the auto-generated template to update a few fields that require specific handling. + +You will need to edit the `mapping method` values in each line in the following ways: +1. Replace the generic sheet names (e.g. `DONOR_SHEET`, `SAMPLE_REGISTRATIONS_SHEET`) with the sheet/csv names you are using as your input to `CSVConvert.py` +2. Replace suggested field names with the relevant field/column names in your input sheets/csvs, if they differ + +If the field does not map in the same way as the suggested mapping function you will also need to: + +3. Choose a different existing [mapping function](mappings.py) or write a new function that does the required transformation and save it in a python file that is specified in your `manifest.yml` in the `functions` section. Functions in your custom mapping _must_ be fully referenced by their module name, e.g. `sample_custom_mappings.sex()`. (See the [mapping instructions](mapping_functions.md) for detailed documentation on writing your own mapping functions.) + +>[!NOTE] +> * Do not edit, delete, or re-order the template lines, except to adjust the sheet name, mapping function and field name in the `mapping method` column. +> * Fields not requiring mapping can be commented out with a # at the start of the line + +
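As a sketch of the edits in steps 1–3 above: suppose, hypothetically, a cohort's donor sheet is called `Donor` and records vital status as free text (`Alive`/`Deceased`) rather than a boolean. The `is_deceased` template line could then read `DONOR.INDEX.is_deceased, {new_cohort.deceased_from_vital_status(Donor.vital_status)}` (the module name `new_cohort` mirrors the sample manifest; the sheet and column names are invented), backed by a custom function along these lines in the cohort's functions file:

```python
# new_cohort.py -- sketch of a hypothetical cohort-specific mapping function.
# data_values is the dict CSVConvert passes to every mapping function,
# e.g. {"vital_status": {"Donor": "Deceased"}}.

def deceased_from_vital_status(data_values):
    """Map a free-text vital status column to the boolean is_deceased field."""
    for sheet_values in data_values.values():
        for value in sheet_values.values():
            if value is None:
                continue
            text = str(value).strip().lower()
            if text in ("", "nan"):
                continue
            return text == "deceased"
    return None
```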
+Generating a template from a different schema
+The `generate_schema.py` script will generate a template file based on an openapi.yaml file.
+
+```
+$ python generate_schema.py -h
+usage: generate_schema.py [-h] --url URL [--out OUT]
+
+options:
+  -h, --help  show this help message and exit
+  --url URL   URL to openAPI schema file (raw github link)
+  --out OUT   name of output file; csv extension will be added. Default is template
+
+```
+
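For example, to regenerate a template from the MoH schema URL used elsewhere in this repository (the output name here is arbitrary):

```commandline
python generate_schema.py --url https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_service/mohpackets/docs/schema.yml --out my_template
```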
+ +### Running `CSVConvert` from the command line + +CSVConvert requires two inputs: +1. a path to a multi-sheet spreadsheet or path to csvs specified with [`--input`](#Input-file/s-format) +2. a path to a `manifest.yml`, in a directory that also contains the other files defined in [Setting up a cohort directory](#Setting-up-a-cohort-directory) ``` $ python CSVConvert.py [-h] [--input INPUT] [--manifest manifest_file] [--test] [--verbose] @@ -30,13 +125,29 @@ $ python CSVConvert.py [-h] [--input INPUT] [--manifest manifest_file] [--test] --test allows you to add extra lines to your manifest's template file that will be populated in the mapped schema. NOTE: this mapped schema will likely not be a valid mohpacket: it should be used only for debugging. ``` -The output packets (`INPUT_map.json` and `INPUT_indexed.json`) will be in the parent of the `INPUT` directory / file. +Example usage: -Validation will automatically be run after the conversion is complete. Any validation errors or warnings will be reported both on the command line and as part of the `INPUT_map.json` file. +``` +python CSVconvert.py --input test_data/raw_data --manifest test_data/manifest.yml +``` -## Format of the output file +The output packets `_map.json` and `_indexed.json` will be in the parent of the `INPUT` directory / file. In the example above, this would be in the `test_data` directory. -``` +Validation will automatically be run after the conversion is complete. Any validation errors or warnings will be reported both on the command line and as part of the `_map.json` file. + +#### Format of the output files + +`_map.json` is the main output and contains the results of the mapping, conversion and validation as well as summary statistics. + +The mapping and transformation result is found in the `"donors"` key. + +Arrays of validation warnings and errors are found in `validation_warnings` & `validation_errors`. + +Summary statistics about the completeness of the objects against the schema are in the `statistics` key. + +A summarised example of the output is below: + +```json { "openapi_url": "https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_service/mohpackets/docs/schema.yml", "katsu_sha": < git sha of the katsu version used for the schema >, @@ -76,42 +187,15 @@ Validation will automatically be run after the conversion is complete. Any valid } ``` -## Input file format - -The input for CSVConvert is either a single xlsx file, a single csv, or a directory of csvs. If providing a spreadsheet, there can be multiple sheets (usually one for each sub-schema). - -All rows must contain identifiers that allow linkage to the containing schema, for example, a row that describes a Treatment must have a link to the Donor / Patient id for that Treatment. - -Data should be (tidy)[https://r4ds.had.co.nz/tidy-data.html], with each variable in a separate column, each row representing an observation, and a single data entry in each cell. - -Depending on the format of your raw data, you may need to write an additional tidying script to pre-process. For example, the `ingest_redcap_data.py` converts the export format from redcap into a set of input csvs for CSVConvert. - -## Setting up a cohort directory +`_indexed.json` contains information about how the ETL is looking up the mappings and can be useful for debugging. -For each dataset (cohort) that you want to convert, create a directory outside of this repository. For CanDIG devs, this will be in the private `data` repository. 
This cohort directory should contain: - -* a `manifest.yml` file with settings for the mapping -* the template file lists custom mappings for each field -* (if needed) a python file that implements any cohort-specific mapping functions - -**Important:** If you are placing this directory under version control and the cohort is not sample / synthetic data, do not place raw or processed data files in this directory, to avoid any possibility of committing protected data. - -## Manifest file -The `manifest.yml` file contains settings for the cohort mapping. There is a sample file in `sample_inputs/manifest.yml` with documentation. The fields are: +## Testing -``` -description: A brief description -mapping: the csv file that lists the mappings for each field -identifier: submitter_donor_id -schema: a URL to the openapi schema file -functions: - - cohort-mapping-functions -``` -## Mapping template +Continuous integration testing for this repository is implemented through Pytest and GitHub Actions which run when pushes occur. Build results can be found at [this repository's GitHub Actions page](https://github.com/CanDIG/clinical_ETL_code/actions/workflows/test.yml). -You'll need to create a mapping template that defines which mapping functions (if any) should be used for which fields. +To run tests manually, enter from command line `$ pytest` -If you're generating a mapping for the current MoH model, you can use the pre-generated `moh_template.csv` file. This file is modified from the auto-generated template to update a few fields that require specific handling. +### When tests fail...
"Compare moh_template.csv" fails @@ -127,46 +211,18 @@ There have probably been MoH model changes in katsu. Run the `update_moh_template.sh` script to see what's changed in `test_data/moh_diffs.txt`. Update `moh_template.csv` to reconcile any differences, then re-run `update_moh_template.sh`. Commit any changes in both `moh_template.csv` and `test_data/moh_diffs.txt`.
-
-Generating a template from a different schema -The `generate_schema.py` script will generate a template file based an openapi.yaml file. - -``` -$ python generate_schema.py -h -usage: generate_schema.py [-h] --url URL [--out OUT] - -options: - -h, --help show this help message and exit - --url URL URL to openAPI schema file (raw github link) - --out OUT name of output file; csv extension will be added. Default is template - -``` -
- -Each line in the mapping template will have a suggested mapping function to map a field on an input sheet to a field in the schema. Replace the generic sheet names with your sheet names. You may need to replace suggested field names with your own field names, if they differ. - -If your data do not map in the same way as the suggested mapping functions, you may need to write your own mapping functions. See the [mapping instructions](mapping_functions.md) for detailed documentation on writing your own mapping functions. - -**Note**: Do not edit, delete, or re-order the template lines, except to add mapping functions after the comma in each line. - -## Testing - -Continuous integration testing for this repository is implemented through Pytest and GitHub Actions which run when pushes occur. Build results can be found at [this repository's GitHub Actions page](https://github.com/CanDIG/clinical_ETL_code/actions/workflows/test.yml). - -To run tests manually, enter from command line `$ pytest` - ## Validating the mapping You can validate the generated json mapping file against the MoH data model. The validation will compare the mapping to the json schema used to generate the template, as well as other known requirements and data conditions specified in the MoH data model. + ``` $ python validate_coverage.py [-h] [--input map.json] [--manifest MAPPING] ---json: path to the map.json file created by CSVConvert - ---manifest: Path to a manifest file describing the mapping +--json JSON _map.json file generated by CSVConvert.py. +--verbose, --v Print extra information ``` -The output will report errors and warnings separately. Jsonschema validation failures and other data mismatches will be listed as errors, while fields that are conditionally required as part of the MoH model but are missing will be reported as warnings. +The output will report errors and warnings separately. JSON schema validation failures and other data mismatches will be listed as errors, while fields that are conditionally required as part of the MoH model but are missing will be reported as warnings. \n\n") + updated_mapping_functions.append(docs.decode()) + with open("mapping_functions.md", "w+") as f: + f.writelines(updated_mapping_functions) + + +if __name__ == '__main__': + main() diff --git a/mapping_functions.md b/mapping_functions.md index d53233e..db2ef9b 100644 --- a/mapping_functions.md +++ b/mapping_functions.md @@ -39,7 +39,7 @@ DONOR.INDEX.primary_diagnoses.INDEX.submitter_primary_diagnosis_id, {single_val( DONOR.INDEX.primary_diagnoses.INDEX.date_of_diagnosis, {single_date(PRIMARY_DIAGNOSES_SHEET.date_of_diagnosis)} ``` -Here, `primary_diagnoses` will be added as an an array for the Donor with `submitter_donor_id`. Each entry in `primary_diagnoses` will use the values on the `PRIMARY_DIAGNOSES_SHEET` that have the same `submitter_donor_id`. +Here, `primary_diagnoses` will be added as an array for the Donor with `submitter_donor_id`. Each entry in `primary_diagnoses` will use the values on the `PRIMARY_DIAGNOSES_SHEET` that have the same `submitter_donor_id`. If your schema doesn't contain any instances of a particular indexed field, you can specify `NONE`: `{indexed_on(NONE)}` @@ -58,8 +58,11 @@ If your schema requires more complex mapping calculations, you can define an ind In addition to mapping column names, you can also transform the values inside the cells to make them align with the schema. 
We've already seen the simplest case - the `single_val` function takes a single value for the named field and returns it (and should only be used when you expect one single value). -The standard functions are defined in `mappings.py`. They include functions for handling single values, list values, dates, and booleans. +The standard functions are defined in `mappings.py`. They include functions for handling single values, list values, dates, and booleans. +Many functions take one or more `data_values` arguments as input. These are a dictionary representing how the CSVConvert script parses each cell of the input data. It is a dictionary of the format `{:{: }}`, e.g. `{'date_of_birth': {'Donor': '6 Jan 1954'}}`. + +A detailed index of all standard functions can be viewed below in the [Standard functions index](#Standard-functions-index). ## Writing your own custom functions @@ -118,3 +121,190 @@ represents the following JSON dict: } ``` + +# Standard Functions Index + + +Module mappings +=============== + +Functions +--------- + + +`boolean(data_values)` +: Convert value to boolean. + + Args: + data_values: A string to be converted to a boolean + + Returns: + A boolean based on the input, + `False` if value is in ["No", "no", "False", "false"] + `None` if value is in [`None`, "nan", "NaN", "NAN"] + `True` otherwise + + +`concat_vals(data_values)` +: Concatenate several data values + + Args: + data_values: a values dict with a list of values + + Returns: + A concatenated string + + +`date(data_values)` +: Format a list of dates to ISO standard YYYY-MM + + Parses a list of strings representing dates into a list of strings with dates in ISO format YYYY-MM. + + Args: + data_values: a value dict with a list of date-like strings + + Returns: + a list of dates in YYYY-MM format or None if blank/empty/unparseable + + +`flat_list_val(data_values)` +: Take a list mapping and break up any stringified lists into multiple values in the list. + + Attempts to use ast.literal_eval() to parse the list, uses split(',') if this fails. + + Args: + data_values: a values dict with a stringified list, e.g. "['a','b','c']" + Returns: + A parsed list of items in the list, e.g. ['a', 'b', 'c'] + + +`float(data_values)` +: Convert a value to a float. + + Args: + data_values: A values dict + + Returns: + A values dict with a string or integer converted to a float or None if null value + + Raises: + ValueError by float() if it cannot convert to float. + + +`has_value(data_values)` +: Returns a boolean based on whether the key in the mapping has a value. + + +`index_val(data_values)` +: Take a mapping with possibly multiple values from multiple sheets and return an array. + + +`indexed_on(data_values)` +: Default indexing value for arrays. + + Args: + data_values: a values dict of identifiers to be indexed + + Returns: + a dict of the format: + {"field": ,"sheet_name": ,"values": []} + + +`integer(data_values)` +: Convert a value to an integer. + + Args: + data_values: a values dict with value to be converted to an int + Returns: + an integer version of the input value + Raises: + ValueError if int() cannot convert the input + + +`list_val(data_values)` +: Takes a mapping with possibly multiple values from multiple sheets and returns an array of values. + + Args: + data_values: a values dict with a list of values + Returns: + The list of values + + +`moh_indexed_on_donor_if_others_absent(data_values)` +: Maps an object to a donor if not otherwise linked. 
+ + Specifically for the FollowUp object which can be linked to multiple objects. + + Args: + **data_values: any number of values dicts with lists of identifiers, NOTE: values dict with donor identifiers + must be specified first. + + Returns: + a dict of the format: + + {'field': , 'sheet': , 'values': [, ...]} + + Where the 'values' list contains a donor identifier if it should be linked to that donor or None if already + linked to another object. + + +`ontology_placeholder(data_values)` +: Placeholder function to make a fake ontology entry. + + Should only be used for testing. + + Args: + data_values: a values dict with a string value representing an ontology label + + Returns: + a dict of the format: + {"id": "placeholder","label": data_values} + + +`pipe_delim(data_values)` +: Takes a string and splits it into an array based on a pipe delimiter. + + Args: + data_values: values dict with single pipe-delimited string, e.g. "a|b|c" + + Returns: + a list of strings split by pipe, e.g. ["a","b","c"] + + +`placeholder(data_values)` +: Return a dict with a placeholder key. + + +`single_date(data_values)` +: Parses a single date to YYYY-MM format. + + Args: + data_values: a value dict with a date + + Returns: + a string of the format YYYY-MM, or None if blank/unparseable + + +`single_val(data_values)` +: Parse a values dict and return the input as a single value. + + Args: + data_values: a dict with values to be squashed + + Returns: + A single value with any null values removed + None if list is empty or contains only 'nan', 'NaN', 'NAN' + + Raises: + MappingError if multiple values found + +Classes +------- + +`MappingError(value)` +: Common base class for all non-exit exceptions. + + ### Ancestors (in MRO) + + * builtins.Exception + * builtins.BaseException diff --git a/mappings.py b/mappings.py index b1f7baf..0605931 100644 --- a/mappings.py +++ b/mappings.py @@ -22,8 +22,17 @@ def __str__(self): return repr(f"Check the values for {IDENTIFIER} in {IDENTIFIER_FIELD}: {self.value}") -# Format a date field to ISO standard def date(data_values): + """Format a list of dates to ISO standard YYYY-MM + + Parses a list of strings representing dates into a list of strings with dates in ISO format YYYY-MM. + + Args: + data_values: a value dict with a list of date-like strings + + Returns: + a list of dates in YYYY-MM format or None if blank/empty/unparseable + """ raw_date = list_val(data_values) dates = [] if raw_date is None: @@ -35,14 +44,22 @@ def date(data_values): # Single date def single_date(data_values): + """Parses a single date to YYYY-MM format. + + Args: + data_values: a value dict with a date + + Returns: + a string of the format YYYY-MM, or None if blank/unparseable + """ val = single_val(data_values) if val is not None: return _parse_date(val) return None -# Returns a boolean based on whether or not the key in the mapping has a value def has_value(data_values): + """Returns a boolean based on whether the key in the mapping has a value.""" if len(data_values.keys()) == 0: _warn(f"no values passed in") else: @@ -52,8 +69,19 @@ def has_value(data_values): return False -# No matter how many items are registered with this key, squash to one def single_val(data_values): + """Parse a values dict and return the input as a single value. 
+ + Args: + data_values: a dict with values to be squashed + + Returns: + A single value with any null values removed + None if list is empty or contains only 'nan', 'NaN', 'NAN' + + Raises: + MappingError if multiple values found + """ all_items = list_val(data_values) if len(all_items) == 0: return None @@ -70,8 +98,15 @@ def single_val(data_values): return result -# Take a mapping with possibly multiple values from multiple sheets and return an array def list_val(data_values): + """ + Takes a mapping with possibly multiple values from multiple sheets and returns an array of values. + + Args: + data_values: a values dict with a list of values + Returns: + The list of values + """ all_items = [] if has_value(data_values): col = list(data_values.keys())[0] @@ -83,8 +118,15 @@ def list_val(data_values): return all_items -# take a string and split it into an array based on a pipe delimiter: def pipe_delim(data_values): + """Takes a string and splits it into an array based on a pipe delimiter. + + Args: + data_values: values dict with single pipe-delimited string, e.g. "a|b|c" + + Returns: + a list of strings split by pipe, e.g. ["a","b","c"] + """ val = single_val(data_values) if val is not None: return val.split('|') @@ -92,10 +134,12 @@ def pipe_delim(data_values): def placeholder(data_values): + """Return a dict with a placeholder key.""" return {"placeholder": data_values} -# Take a mapping with possibly multiple values from multiple sheets and return an array + def index_val(data_values): + """Take a mapping with possibly multiple values from multiple sheets and return an array.""" all_items = [] if has_value(data_values): col = list(data_values.keys())[0] @@ -107,8 +151,16 @@ def index_val(data_values): return all_items -# Take a list mapping and break up any stringified lists into multiple values in the list def flat_list_val(data_values): + """Take a list mapping and break up any stringified lists into multiple values in the list. + + Attempts to use ast.literal_eval() to parse the list, uses split(',') if this fails. + + Args: + data_values: a values dict with a stringified list, e.g. "['a','b','c']" + Returns: + A parsed list of items in the list, e.g. ['a', 'b', 'c'] + """ items = list_val(data_values) all_items = [] for item in items: @@ -121,16 +173,33 @@ def flat_list_val(data_values): return all_items -# concatenate several data values def concat_vals(data_values): + """Concatenate several data values + + Args: + data_values: a values dict with a list of values + + Returns: + A concatenated string + """ result = [] for x in data_values: result.extend(data_values[x].values()) return "_".join(result) -# Convert various responses to boolean def boolean(data_values): + """Convert value to boolean. + + Args: + data_values: A string to be converted to a boolean + + Returns: + A boolean based on the input, + `False` if value is in ["No", "no", "False", "false"] + `None` if value is in [`None`, "nan", "NaN", "NAN"] + `True` otherwise + """ cell = single_val(data_values) if cell is None or cell.lower().strip() == "nan": return None @@ -140,41 +209,63 @@ def boolean(data_values): def integer(data_values): + """Convert a value to an integer. 
+ + Args: + data_values: a values dict with value to be converted to an int + Returns: + an integer version of the input value + Raises: + ValueError if int() cannot convert the input + """ cell = single_val(data_values) if cell is None or cell.lower() == "nan": return None try: return int(cell) - except: + except ValueError as e: + _warn(e) return None def float(data_values): - cell = single_val(data_values) - if cell is None or cell.lower() == "nan": - return None - try: - return float(cell) - except: - return None + """Convert a value to a float. + + Args: + data_values: A values dict + Returns: + A values dict with a string or integer converted to a float or None if null value -def double(data_values): + Raises: + ValueError by float() if it cannot convert to float. + """ cell = single_val(data_values) if cell is None or cell.lower() == "nan": return None try: return float(cell) - except: + except ValueError as e: + _warn(e) return None -# Placeholder function to make a fake ontology entry def ontology_placeholder(data_values): + """Placeholder function to make a fake ontology entry. + + Should only be used for testing. + + Args: + data_values: a values dict with a string value representing an ontology label + + Returns: + a dict of the format: + {"id": "placeholder","label": data_values} + """ if "str" in str(type(data_values)): return { "id": "placeholder", - "label": mapping + "label": data_values } return { "id": "placeholder", @@ -182,8 +273,16 @@ def ontology_placeholder(data_values): } -# Default indexing value for arrays def indexed_on(data_values): + """Default indexing value for arrays. + + Args: + data_values: a values dict of identifiers to be indexed + + Returns: + a dict of the format: + {"field": ,"sheet_name": ,"values": []} + """ field = list(data_values.keys())[0] sheet = list(data_values[field].keys())[0] @@ -195,6 +294,22 @@ def indexed_on(data_values): def moh_indexed_on_donor_if_others_absent(data_values): + """Maps an object to a donor if not otherwise linked. + + Specifically for the FollowUp object which can be linked to multiple objects. + + Args: + **data_values: any number of values dicts with lists of identifiers, NOTE: values dict with donor identifiers + must be specified first. + + Returns: + a dict of the format: + + {'field': , 'sheet': , 'values': [, ...]} + + Where the 'values' list contains a donor identifier if it should be linked to that donor or None if already + linked to another object. + """ result = [] field = list(data_values.keys())[0] sheet = list(data_values[field].keys())[0] @@ -217,6 +332,7 @@ def moh_indexed_on_donor_if_others_absent(data_values): def _warn(message): + """Warns a user when a mapping is unsuccessful with the IDENTIFIER and FIELD.""" global IDENTIFIER if IDENTIFIER is not None: print(f"WARNING for {IDENTIFIER_FIELD}={IDENTIFIER}: {message}") @@ -256,8 +372,8 @@ def _peek_at_top_of_stack(): } -# Convenience function to convert nan to boolean def _is_null(cell): + """Convert nan, None, '' to boolean.""" if cell == 'nan' or cell is None or cell == '': return True return False @@ -270,6 +386,18 @@ def _single_map(mapping, field): # Convenience function to parse dates to ISO format def _parse_date(date_string): + """ + Parses any date-like string into YYYY-MM format. + + Args: + date_string: A string in various date formats + + Returns: + A string in year, month ISO format: YYYY-MM + + Raises: + MappingError if dateparser cannot recognise the date format. 
+ """ if any(char in '0123456789' for char in date_string): try: d = dateparser.parse(date_string, settings={'TIMEZONE': 'UTC'}) diff --git a/moh_template.csv b/moh_template.csv index d972872..d3f5c1f 100644 --- a/moh_template.csv +++ b/moh_template.csv @@ -148,7 +148,7 @@ DONOR.INDEX.comorbidities.INDEX.comorbidity_treatment, {single_val(COMORBIDITIES DONOR.INDEX.comorbidities.INDEX.age_at_comorbidity_diagnosis, {integer(COMORBIDITIES_SHEET.age_at_comorbidity_diagnosis)} DONOR.INDEX.exposures.INDEX, {indexed_on(EXPOSURES_SHEET.submitter_donor_id)} DONOR.INDEX.exposures.INDEX.tobacco_smoking_status, {single_val(EXPOSURES_SHEET.tobacco_smoking_status)} -DONOR.INDEX.exposures.INDEX.tobacco_type.INDEX, {indexed_on(EXPOSURES_SHEET.tobacco_type)} +DONOR.INDEX.exposures.INDEX.tobacco_type, {pipe_delim(EXPOSURES_SHEET.tobacco_type)} DONOR.INDEX.exposures.INDEX.pack_years_smoked, {float(EXPOSURES_SHEET.pack_years_smoked)} DONOR.INDEX.biomarkers.INDEX, {indexed_on(BIOMARKERS_SHEET.submitter_donor_id)} DONOR.INDEX.biomarkers.INDEX.er_status, {single_val(BIOMARKERS_SHEET.er_status)} diff --git a/requirements.txt b/requirements.txt index 494ceb5..66f186b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,5 @@ openpyxl~=3.0.9 requests~=2.29 jsonschema~=4.19.1 jsoncomparison~=1.1.0 +openapi-spec-validator~=0.7.1 +pdoc3>=0.10.0 diff --git a/sample_inputs/manifest.yml b/sample_inputs/manifest.yml index ed553ff..99bb9a1 100644 --- a/sample_inputs/manifest.yml +++ b/sample_inputs/manifest.yml @@ -1,6 +1,6 @@ description: Test mapping of DATASET-NAME dataset to MOHCCN format for CanDIG # mapping is the csv file that contains the list of fields and mapping functions -mapping: dataset2moh.csv +mapping: moh_template.csv # the name of the top-level identifier column in the input data identifier: submitter_donor_id # a link to the openapi schema @@ -8,4 +8,4 @@ schema: https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_se # one or more files (dataset_functions.py) that implement the mappings # described in mapping file functions: - - dataset_functions \ No newline at end of file + - new_cohort \ No newline at end of file diff --git a/sample_inputs/moh_template.csv b/sample_inputs/moh_template.csv new file mode 100644 index 0000000..fb45007 --- /dev/null +++ b/sample_inputs/moh_template.csv @@ -0,0 +1,183 @@ +## Schema generated from https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_service/mohpackets/docs/schema.yml +## Based on repo commit sha "29fd55d173b7a01daa72fcc89187e3aabd1fb51e" +## MoH template is manually updated to match the MoH clinical data model +## Items are comma separated: element, mapping method +DONOR.INDEX, {indexed_on(DONOR_SHEET.submitter_donor_id)} +DONOR.INDEX.submitter_donor_id, {single_val(DONOR_SHEET.submitter_donor_id)} +DONOR.INDEX.program_id, {single_val(DONOR_SHEET.program_id)} +DONOR.INDEX.lost_to_followup_after_clinical_event_identifier, {single_val(DONOR_SHEET.lost_to_followup_after_clinical_event_identifier)} +DONOR.INDEX.lost_to_followup_reason, {single_val(DONOR_SHEET.lost_to_followup_reason)} +DONOR.INDEX.date_alive_after_lost_to_followup, {single_date(DONOR_SHEET.date_alive_after_lost_to_followup)} +DONOR.INDEX.is_deceased, {boolean(DONOR_SHEET.is_deceased)} +DONOR.INDEX.cause_of_death, {single_val(DONOR_SHEET.cause_of_death)} +DONOR.INDEX.date_of_birth, {single_date(DONOR_SHEET.date_of_birth)} +DONOR.INDEX.date_of_death, {single_date(DONOR_SHEET.date_of_death)} +DONOR.INDEX.gender, {single_val(DONOR_SHEET.gender)} 
+DONOR.INDEX.sex_at_birth, {single_val(DONOR_SHEET.sex_at_birth)} +DONOR.INDEX.primary_site, {pipe_delim(DONOR_SHEET.primary_site)} +DONOR.INDEX.primary_diagnoses.INDEX, {indexed_on(PRIMARY_DIAGNOSES_SHEET.submitter_donor_id)} +DONOR.INDEX.primary_diagnoses.INDEX.submitter_primary_diagnosis_id, {single_val(PRIMARY_DIAGNOSES_SHEET.submitter_primary_diagnosis_id)} +DONOR.INDEX.primary_diagnoses.INDEX.date_of_diagnosis, {single_date(PRIMARY_DIAGNOSES_SHEET.date_of_diagnosis)} +DONOR.INDEX.primary_diagnoses.INDEX.cancer_type_code, {single_val(PRIMARY_DIAGNOSES_SHEET.cancer_type_code)} +DONOR.INDEX.primary_diagnoses.INDEX.basis_of_diagnosis, {single_val(PRIMARY_DIAGNOSES_SHEET.basis_of_diagnosis)} +DONOR.INDEX.primary_diagnoses.INDEX.lymph_nodes_examined_status, {single_val(PRIMARY_DIAGNOSES_SHEET.lymph_nodes_examined_status)} +DONOR.INDEX.primary_diagnoses.INDEX.lymph_nodes_examined_method, {single_val(PRIMARY_DIAGNOSES_SHEET.lymph_nodes_examined_method)} +DONOR.INDEX.primary_diagnoses.INDEX.number_lymph_nodes_positive, {integer(PRIMARY_DIAGNOSES_SHEET.number_lymph_nodes_positive)} +DONOR.INDEX.primary_diagnoses.INDEX.clinical_tumour_staging_system, {single_val(PRIMARY_DIAGNOSES_SHEET.clinical_tumour_staging_system)} +DONOR.INDEX.primary_diagnoses.INDEX.clinical_t_category, {single_val(PRIMARY_DIAGNOSES_SHEET.clinical_t_category)} +DONOR.INDEX.primary_diagnoses.INDEX.clinical_n_category, {single_val(PRIMARY_DIAGNOSES_SHEET.clinical_n_category)} +DONOR.INDEX.primary_diagnoses.INDEX.clinical_m_category, {single_val(PRIMARY_DIAGNOSES_SHEET.clinical_m_category)} +DONOR.INDEX.primary_diagnoses.INDEX.clinical_stage_group, {single_val(PRIMARY_DIAGNOSES_SHEET.clinical_stage_group)} +DONOR.INDEX.primary_diagnoses.INDEX.laterality, {single_val(PRIMARY_DIAGNOSES_SHEET.laterality)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX, {indexed_on(SPECIMENS_SHEET.submitter_primary_diagnosis_id)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.submitter_specimen_id, {single_val(SPECIMENS_SHEET.submitter_specimen_id)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.pathological_tumour_staging_system, {single_val(SPECIMENS_SHEET.pathological_tumour_staging_system)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.pathological_t_category, {single_val(SPECIMENS_SHEET.pathological_t_category)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.pathological_n_category, {single_val(SPECIMENS_SHEET.pathological_n_category)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.pathological_m_category, {single_val(SPECIMENS_SHEET.pathological_m_category)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.pathological_stage_group, {single_val(SPECIMENS_SHEET.pathological_stage_group)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.specimen_collection_date, {single_date(SPECIMENS_SHEET.specimen_collection_date)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.specimen_storage, {single_val(SPECIMENS_SHEET.specimen_storage)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.tumour_histological_type, {single_val(SPECIMENS_SHEET.tumour_histological_type)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.specimen_anatomic_location, {single_val(SPECIMENS_SHEET.specimen_anatomic_location)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.reference_pathology_confirmed_diagnosis, {single_val(SPECIMENS_SHEET.reference_pathology_confirmed_diagnosis)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.reference_pathology_confirmed_tumour_presence, 
{single_val(SPECIMENS_SHEET.reference_pathology_confirmed_tumour_presence)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.tumour_grading_system, {single_val(SPECIMENS_SHEET.tumour_grading_system)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.tumour_grade, {single_val(SPECIMENS_SHEET.tumour_grade)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.percent_tumour_cells_range, {single_val(SPECIMENS_SHEET.percent_tumour_cells_range)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.percent_tumour_cells_measurement_method, {single_val(SPECIMENS_SHEET.percent_tumour_cells_measurement_method)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.specimen_processing, {single_val(SPECIMENS_SHEET.specimen_processing)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.specimen_laterality, {single_val(SPECIMENS_SHEET.specimen_laterality)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.sample_registrations.INDEX, {indexed_on(SAMPLE_REGISTRATIONS_SHEET.submitter_specimen_id)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.sample_registrations.INDEX.submitter_sample_id, {single_val(SAMPLE_REGISTRATIONS_SHEET.submitter_sample_id)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.sample_registrations.INDEX.specimen_tissue_source, {single_val(SAMPLE_REGISTRATIONS_SHEET.specimen_tissue_source)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.sample_registrations.INDEX.tumour_normal_designation, {single_val(SAMPLE_REGISTRATIONS_SHEET.tumour_normal_designation)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.sample_registrations.INDEX.specimen_type, {single_val(SAMPLE_REGISTRATIONS_SHEET.specimen_type)} +DONOR.INDEX.primary_diagnoses.INDEX.specimens.INDEX.sample_registrations.INDEX.sample_type, {single_val(SAMPLE_REGISTRATIONS_SHEET.sample_type)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX, {indexed_on(TREATMENTS_SHEET.submitter_primary_diagnosis_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.submitter_treatment_id, {single_val(TREATMENTS_SHEET.submitter_treatment_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.is_primary_treatment, {single_val(TREATMENTS_SHEET.is_primary_treatment)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.treatment_start_date, {single_date(TREATMENTS_SHEET.treatment_start_date)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.treatment_end_date, {single_date(TREATMENTS_SHEET.treatment_end_date)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.treatment_setting, {single_val(TREATMENTS_SHEET.treatment_setting)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.treatment_intent, {single_val(TREATMENTS_SHEET.treatment_intent)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.days_per_cycle, {integer(TREATMENTS_SHEET.days_per_cycle)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.number_of_cycles, {integer(TREATMENTS_SHEET.number_of_cycles)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.line_of_treatment, {integer(TREATMENTS_SHEET.line_of_treatment)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.status_of_treatment, {single_val(TREATMENTS_SHEET.status_of_treatment)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.treatment_type, {pipe_delim(TREATMENTS_SHEET.treatment_type)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.response_to_treatment_criteria_method, {single_val(TREATMENTS_SHEET.response_to_treatment_criteria_method)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.response_to_treatment, 
{single_val(TREATMENTS_SHEET.response_to_treatment)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.chemotherapies.INDEX, {indexed_on(CHEMOTHERAPIES_SHEET.submitter_treatment_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.chemotherapies.INDEX.chemotherapy_drug_dose_units, {single_val(CHEMOTHERAPIES_SHEET.chemotherapy_drug_dose_units)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.chemotherapies.INDEX.drug_reference_database, {single_val(CHEMOTHERAPIES_SHEET.drug_reference_database)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.chemotherapies.INDEX.drug_name, {single_val(CHEMOTHERAPIES_SHEET.drug_name)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.chemotherapies.INDEX.drug_reference_identifier, {single_val(CHEMOTHERAPIES_SHEET.drug_reference_identifier)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.chemotherapies.INDEX.prescribed_cumulative_drug_dose, {integer(CHEMOTHERAPIES_SHEET.prescribed_cumulative_drug_dose)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.chemotherapies.INDEX.actual_cumulative_drug_dose, {integer(CHEMOTHERAPIES_SHEET.actual_cumulative_drug_dose)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.hormone_therapies.INDEX, {indexed_on(HORMONE_THERAPIES_SHEET.submitter_treatment_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.hormone_therapies.INDEX.hormone_drug_dose_units, {single_val(HORMONE_THERAPIES_SHEET.hormone_drug_dose_units)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.hormone_therapies.INDEX.drug_reference_database, {single_val(HORMONE_THERAPIES_SHEET.drug_reference_database)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.hormone_therapies.INDEX.drug_name, {single_val(HORMONE_THERAPIES_SHEET.drug_name)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.hormone_therapies.INDEX.drug_reference_identifier, {single_val(HORMONE_THERAPIES_SHEET.drug_reference_identifier)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.hormone_therapies.INDEX.prescribed_cumulative_drug_dose, {integer(HORMONE_THERAPIES_SHEET.prescribed_cumulative_drug_dose)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.hormone_therapies.INDEX.actual_cumulative_drug_dose, {integer(HORMONE_THERAPIES_SHEET.actual_cumulative_drug_dose)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.immunotherapies.INDEX, {indexed_on(IMMUNOTHERAPIES_SHEET.submitter_treatment_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.immunotherapies.INDEX.immunotherapy_type, {single_val(IMMUNOTHERAPIES_SHEET.immunotherapy_type)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.immunotherapies.INDEX.drug_reference_database, {single_val(IMMUNOTHERAPIES_SHEET.drug_reference_database)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.immunotherapies.INDEX.immunotherapy_drug_dose_units, {single_val(IMMUNOTHERAPIES_SHEET.immunotherapy_drug_dose_units)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.immunotherapies.INDEX.drug_name, {single_val(IMMUNOTHERAPIES_SHEET.drug_name)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.immunotherapies.INDEX.drug_reference_identifier, {single_val(IMMUNOTHERAPIES_SHEET.drug_reference_identifier)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.immunotherapies.INDEX.prescribed_cumulative_drug_dose, {integer(IMMUNOTHERAPIES_SHEET.prescribed_cumulative_drug_dose)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.immunotherapies.INDEX.actual_cumulative_drug_dose, {integer(IMMUNOTHERAPIES_SHEET.actual_cumulative_drug_dose)} 
+DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.radiations.INDEX, {indexed_on(RADIATIONS_SHEET.submitter_treatment_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.radiations.INDEX.radiation_therapy_modality, {single_val(RADIATIONS_SHEET.radiation_therapy_modality)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.radiations.INDEX.radiation_therapy_type, {single_val(RADIATIONS_SHEET.radiation_therapy_type)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.radiations.INDEX.anatomical_site_irradiated, {single_val(RADIATIONS_SHEET.anatomical_site_irradiated)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.radiations.INDEX.radiation_therapy_fractions, {integer(RADIATIONS_SHEET.radiation_therapy_fractions)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.radiations.INDEX.radiation_therapy_dosage, {integer(RADIATIONS_SHEET.radiation_therapy_dosage)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.radiations.INDEX.radiation_boost, {boolean(RADIATIONS_SHEET.radiation_boost)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.radiations.INDEX.reference_radiation_treatment_id, {single_val(RADIATIONS_SHEET.reference_radiation_treatment_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX, {indexed_on(SURGERIES_SHEET.submitter_treatment_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.surgery_type, {single_val(SURGERIES_SHEET.surgery_type)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.surgery_site, {single_val(SURGERIES_SHEET.surgery_site)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.surgery_location, {single_val(SURGERIES_SHEET.surgery_location)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.tumour_focality, {single_val(SURGERIES_SHEET.tumour_focality)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.residual_tumour_classification, {single_val(SURGERIES_SHEET.residual_tumour_classification)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_involved.INDEX, {pipe_delim(SURGERIES_SHEET.margin_types_involved)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_not_involved.INDEX, {pipe_delim(SURGERIES_SHEET.margin_types_not_involved)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_not_assessed.INDEX, {pipe_delim(SURGERIES_SHEET.margin_types_not_assessed)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.lymphovascular_invasion, {single_val(SURGERIES_SHEET.lymphovascular_invasion)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.perineural_invasion, {single_val(SURGERIES_SHEET.perineural_invasion)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.submitter_specimen_id, {single_val(SURGERIES_SHEET.submitter_specimen_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.tumour_length, {integer(SURGERIES_SHEET.tumour_length)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.tumour_width, {integer(SURGERIES_SHEET.tumour_width)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.greatest_dimension_tumour, {integer(SURGERIES_SHEET.greatest_dimension_tumour)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX, {indexed_on(FOLLOWUPS_SHEET.submitter_treatment_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.submitter_follow_up_id, 
{single_val(FOLLOWUPS_SHEET.submitter_follow_up_id)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.date_of_followup, {single_date(FOLLOWUPS_SHEET.date_of_followup)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.disease_status_at_followup, {single_val(FOLLOWUPS_SHEET.disease_status_at_followup)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.relapse_type, {single_val(FOLLOWUPS_SHEET.relapse_type)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.date_of_relapse, {single_date(FOLLOWUPS_SHEET.date_of_relapse)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.method_of_progression_status, {pipe_delim(FOLLOWUPS_SHEET.method_of_progression_status)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.anatomic_site_progression_or_recurrence, {pipe_delim(FOLLOWUPS_SHEET.anatomic_site_progression_or_recurrence)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.recurrence_tumour_staging_system, {single_val(FOLLOWUPS_SHEET.recurrence_tumour_staging_system)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.recurrence_t_category, {single_val(FOLLOWUPS_SHEET.recurrence_t_category)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.recurrence_n_category, {single_val(FOLLOWUPS_SHEET.recurrence_n_category)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.recurrence_m_category, {single_val(FOLLOWUPS_SHEET.recurrence_m_category)} +DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.followups.INDEX.recurrence_stage_group, {single_val(FOLLOWUPS_SHEET.recurrence_stage_group)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX, {indexed_on(FOLLOWUPS_SHEET.submitter_primary_diagnosis_id)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.submitter_follow_up_id, {single_val(FOLLOWUPS_SHEET.submitter_follow_up_id)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.date_of_followup, {single_date(FOLLOWUPS_SHEET.date_of_followup)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.disease_status_at_followup, {single_val(FOLLOWUPS_SHEET.disease_status_at_followup)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.relapse_type, {single_val(FOLLOWUPS_SHEET.relapse_type)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.date_of_relapse, {single_date(FOLLOWUPS_SHEET.date_of_relapse)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.method_of_progression_status, {pipe_delim(FOLLOWUPS_SHEET.method_of_progression_status)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.anatomic_site_progression_or_recurrence, {pipe_delim(FOLLOWUPS_SHEET.anatomic_site_progression_or_recurrence)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.recurrence_tumour_staging_system, {single_val(FOLLOWUPS_SHEET.recurrence_tumour_staging_system)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.recurrence_t_category, {single_val(FOLLOWUPS_SHEET.recurrence_t_category)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.recurrence_n_category, {single_val(FOLLOWUPS_SHEET.recurrence_n_category)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.recurrence_m_category, {single_val(FOLLOWUPS_SHEET.recurrence_m_category)} +DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.recurrence_stage_group, {single_val(FOLLOWUPS_SHEET.recurrence_stage_group)} +DONOR.INDEX.comorbidities.INDEX, {indexed_on(COMORBIDITIES_SHEET.submitter_donor_id)} +DONOR.INDEX.comorbidities.INDEX.prior_malignancy, 
{single_val(COMORBIDITIES_SHEET.prior_malignancy)} +DONOR.INDEX.comorbidities.INDEX.laterality_of_prior_malignancy, {single_val(COMORBIDITIES_SHEET.laterality_of_prior_malignancy)} +DONOR.INDEX.comorbidities.INDEX.comorbidity_type_code, {single_val(COMORBIDITIES_SHEET.comorbidity_type_code)} +DONOR.INDEX.comorbidities.INDEX.comorbidity_treatment_status, {single_val(COMORBIDITIES_SHEET.comorbidity_treatment_status)} +DONOR.INDEX.comorbidities.INDEX.comorbidity_treatment, {single_val(COMORBIDITIES_SHEET.comorbidity_treatment)} +DONOR.INDEX.comorbidities.INDEX.age_at_comorbidity_diagnosis, {integer(COMORBIDITIES_SHEET.age_at_comorbidity_diagnosis)} +DONOR.INDEX.exposures.INDEX, {indexed_on(EXPOSURES_SHEET.submitter_donor_id)} +DONOR.INDEX.exposures.INDEX.tobacco_smoking_status, {single_val(EXPOSURES_SHEET.tobacco_smoking_status)} +DONOR.INDEX.exposures.INDEX.tobacco_type.INDEX, {indexed_on(EXPOSURES_SHEET.tobacco_type)} +DONOR.INDEX.exposures.INDEX.pack_years_smoked, {float(EXPOSURES_SHEET.pack_years_smoked)} +DONOR.INDEX.biomarkers.INDEX, {indexed_on(BIOMARKERS_SHEET.submitter_donor_id)} +DONOR.INDEX.biomarkers.INDEX.er_status, {single_val(BIOMARKERS_SHEET.er_status)} +DONOR.INDEX.biomarkers.INDEX.pr_status, {single_val(BIOMARKERS_SHEET.pr_status)} +DONOR.INDEX.biomarkers.INDEX.her2_ihc_status, {single_val(BIOMARKERS_SHEET.her2_ihc_status)} +DONOR.INDEX.biomarkers.INDEX.her2_ish_status, {single_val(BIOMARKERS_SHEET.her2_ish_status)} +DONOR.INDEX.biomarkers.INDEX.hpv_ihc_status, {single_val(BIOMARKERS_SHEET.hpv_ihc_status)} +DONOR.INDEX.biomarkers.INDEX.hpv_pcr_status, {single_val(BIOMARKERS_SHEET.hpv_pcr_status)} +DONOR.INDEX.biomarkers.INDEX.hpv_strain, {pipe_delim(BIOMARKERS_SHEET.hpv_strain)} +DONOR.INDEX.biomarkers.INDEX.submitter_specimen_id, {single_val(BIOMARKERS_SHEET.submitter_specimen_id)} +DONOR.INDEX.biomarkers.INDEX.submitter_primary_diagnosis_id, {single_val(BIOMARKERS_SHEET.submitter_primary_diagnosis_id)} +DONOR.INDEX.biomarkers.INDEX.submitter_treatment_id, {single_val(BIOMARKERS_SHEET.submitter_treatment_id)} +DONOR.INDEX.biomarkers.INDEX.submitter_follow_up_id, {single_val(BIOMARKERS_SHEET.submitter_follow_up_id)} +DONOR.INDEX.biomarkers.INDEX.test_date, {single_date(BIOMARKERS_SHEET.test_date)} +DONOR.INDEX.biomarkers.INDEX.psa_level, {integer(BIOMARKERS_SHEET.psa_level)} +DONOR.INDEX.biomarkers.INDEX.ca125, {integer(BIOMARKERS_SHEET.ca125)} +DONOR.INDEX.biomarkers.INDEX.cea, {integer(BIOMARKERS_SHEET.cea)} +DONOR.INDEX.biomarkers.INDEX.er_percent_positive, {float(BIOMARKERS_SHEET.er_percent_positive)} +DONOR.INDEX.biomarkers.INDEX.pr_percent_positive, {float(BIOMARKERS_SHEET.pr_percent_positive)} +DONOR.INDEX.followups.INDEX, {moh_indexed_on_donor_if_others_absent(FOLLOWUPS_SHEET.submitter_donor_id, FOLLOWUPS_SHEET.submitter_primary_diagnosis_id, FOLLOWUPS_SHEET.submitter_treatment_id)} +DONOR.INDEX.followups.INDEX.submitter_follow_up_id, {single_val(FOLLOWUPS_SHEET.submitter_follow_up_id)} +DONOR.INDEX.followups.INDEX.date_of_followup, {single_date(FOLLOWUPS_SHEET.date_of_followup)} +DONOR.INDEX.followups.INDEX.disease_status_at_followup, {single_val(FOLLOWUPS_SHEET.disease_status_at_followup)} +DONOR.INDEX.followups.INDEX.relapse_type, {single_val(FOLLOWUPS_SHEET.relapse_type)} +DONOR.INDEX.followups.INDEX.date_of_relapse, {single_date(FOLLOWUPS_SHEET.date_of_relapse)} +DONOR.INDEX.followups.INDEX.method_of_progression_status, {pipe_delim(FOLLOWUPS_SHEET.method_of_progression_status)} +DONOR.INDEX.followups.INDEX.anatomic_site_progression_or_recurrence, 
{pipe_delim(FOLLOWUPS_SHEET.anatomic_site_progression_or_recurrence)} +DONOR.INDEX.followups.INDEX.recurrence_tumour_staging_system, {single_val(FOLLOWUPS_SHEET.recurrence_tumour_staging_system)} +DONOR.INDEX.followups.INDEX.recurrence_t_category, {single_val(FOLLOWUPS_SHEET.recurrence_t_category)} +DONOR.INDEX.followups.INDEX.recurrence_n_category, {single_val(FOLLOWUPS_SHEET.recurrence_n_category)} +DONOR.INDEX.followups.INDEX.recurrence_m_category, {single_val(FOLLOWUPS_SHEET.recurrence_m_category)} +DONOR.INDEX.followups.INDEX.recurrence_stage_group, {single_val(FOLLOWUPS_SHEET.recurrence_stage_group)} diff --git a/sample_inputs/new_cohort.py b/sample_inputs/new_cohort.py new file mode 100644 index 0000000..13a00ed --- /dev/null +++ b/sample_inputs/new_cohort.py @@ -0,0 +1,17 @@ +## Additional mappings customised to my special cohort + +def sex(data_value): + # make sure we only have one value + mapping_val = mappings.single_val(data_value) + + sex_dict = { + 'Female': 'F', + 'Male': 'M', + } + + result = None + for item in sex_dict: + if (item == data_value) and (mappings.is_null(data_value)) is False: + result = sex_dict[item] + + return result \ No newline at end of file diff --git a/schema.py b/schema.py index 8d94a16..e6f4bad 100644 --- a/schema.py +++ b/schema.py @@ -6,8 +6,8 @@ import re from copy import deepcopy import jsonschema -import dateparser from collections import Counter +import openapi_spec_validator as osv class ValidationError(Exception): @@ -62,15 +62,16 @@ def __init__(self, url, simple=False): self.scaffold = None """Retrieve the schema from the supplied URL, return as dictionary.""" - resp = requests.get(self.openapi_url) + try: + osv.validate_url(self.openapi_url) + resp = requests.get(self.openapi_url) + resp.raise_for_status() + schema = yaml.safe_load(resp.text) + except Exception as e: + print("Error reading the openapi schema, please ensure you have provided a url to a valid openapi schema.") + print(e) + return - # rudimentary test that we have found something that looks like an openapi schema - # would be better to formally validate - schema = yaml.safe_load(resp.text) - - if not "openapi" in schema: - print("Error: does not seem to be an openapi schema") - schema = None self.schema = schema["components"]["schemas"] sha_match = re.match(r".+Based on commit \"(.+)\".*", schema["info"]["description"]) if sha_match is not None: diff --git a/test_data/moh_diffs.txt b/test_data/moh_diffs.txt index b8c7b24..5c8b1dc 100644 --- a/test_data/moh_diffs.txt +++ b/test_data/moh_diffs.txt @@ -68,9 +68,11 @@ --- > DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.method_of_progression_status, {pipe_delim(FOLLOWUPS_SHEET.method_of_progression_status)} > DONOR.INDEX.primary_diagnoses.INDEX.followups.INDEX.anatomic_site_progression_or_recurrence, {pipe_delim(FOLLOWUPS_SHEET.anatomic_site_progression_or_recurrence)} -151c152 +150,151c151,152 +< DONOR.INDEX.exposures.INDEX.tobacco_type.INDEX, {indexed_on(EXPOSURES_SHEET.tobacco_type)} < DONOR.INDEX.exposures.INDEX.pack_years_smoked, {single_val(EXPOSURES_SHEET.pack_years_smoked)} --- +> DONOR.INDEX.exposures.INDEX.tobacco_type, {pipe_delim(EXPOSURES_SHEET.tobacco_type)} > DONOR.INDEX.exposures.INDEX.pack_years_smoked, {float(EXPOSURES_SHEET.pack_years_smoked)} 159c160 < DONOR.INDEX.biomarkers.INDEX.hpv_strain.INDEX, {indexed_on(BIOMARKERS_SHEET.hpv_strain)} diff --git a/test_data/test2moh.csv b/test_data/test2moh.csv index c34be00..8060590 100644 --- a/test_data/test2moh.csv +++ b/test_data/test2moh.csv @@ -109,9 +109,9 
@@ -109,9 +109,9 @@ DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.surgery_sit
 DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.surgery_location, {single_val(Surgery.surgery_location)}
 DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.tumour_focality, {single_val(Surgery.tumour_focality)}
 DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.residual_tumour_classification, {single_val(Surgery.residual_tumour_classification)}
-DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_involved.INDEX, {indexed_on(Surgery.margin_types_involved)}
-DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_not_involved.INDEX, {indexed_on(Surgery.margin_types_not_involved)}
-DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_not_assessed.INDEX, {indexed_on(Surgery.margin_types_not_assessed)}
+DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_involved, {pipe_delim(Surgery.margin_types_involved)}
+DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_not_involved, {pipe_delim(Surgery.margin_types_not_involved)}
+DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.margin_types_not_assessed, {pipe_delim(Surgery.margin_types_not_assessed)}
 DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.lymphovascular_invasion, {single_val(Surgery.lymphovascular_invasion)}
 DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.perineural_invasion, {single_val(Surgery.perineural_invasion)}
 DONOR.INDEX.primary_diagnoses.INDEX.treatments.INDEX.surgeries.INDEX.submitter_specimen_id, {single_val(Surgery.submitter_specimen_id)}
@@ -159,7 +159,7 @@ DONOR.INDEX.comorbidities.INDEX.comorbidity_treatment, {single_val(Comorbidity.c
 DONOR.INDEX.comorbidities.INDEX.age_at_comorbidity_diagnosis, {integer(Comorbidity.age_at_comorbidity_diagnosis)}
 DONOR.INDEX.exposures.INDEX, {indexed_on(EXPOSURES_SHEET.submitter_donor_id)}
 DONOR.INDEX.exposures.INDEX.tobacco_smoking_status, {single_val(EXPOSURES_SHEET.tobacco_smoking_status)}
-DONOR.INDEX.exposures.INDEX.tobacco_type.INDEX, {indexed_on(EXPOSURES_SHEET.tobacco_type)}
+DONOR.INDEX.exposures.INDEX.tobacco_type, {pipe_delim(EXPOSURES_SHEET.tobacco_type)}
 DONOR.INDEX.exposures.INDEX.pack_years_smoked, {single_val(EXPOSURES_SHEET.pack_years_smoked)}
 DONOR.INDEX.biomarkers.INDEX, {indexed_on(Biomarker.submitter_donor_id)}
 DONOR.INDEX.biomarkers.INDEX.er_status, {single_val(Biomarker.er_status)}
diff --git a/update_moh_template.sh b/update_moh_template.sh
index 473ebcc..0c0c9e2 100644
--- a/update_moh_template.sh
+++ b/update_moh_template.sh
@@ -5,4 +5,4 @@ python generate_schema.py --out tmp_template
 diff tmp_template.csv moh_template.csv > test_data/moh_diffs.txt
 bytes=$(head -5 test_data/moh_diffs.txt | wc -c)
 dd if=test_data/moh_diffs.txt bs="$bytes" skip=1 conv=notrunc of=test_data/moh_diffs.txt
-rm tmp_template.csv
\ No newline at end of file
+rm tmp_template.csv
diff --git a/validate_coverage.py b/validate_coverage.py
index df0d17b..f80ed4e 100644
--- a/validate_coverage.py
+++ b/validate_coverage.py
@@ -1,21 +1,24 @@
 import argparse
 import json
-import jsonschema
-import os
-import re
-import CSVConvert
+import sys
 import mappings
-from copy import deepcopy
 from mohschema import MoHSchema
-from jsoncomparison import Compare
+# from jsoncomparison import Compare
+# from copy import deepcopy
+# import yaml
+# import jsonschema
+# import os
+# import re
+# import CSVConvert


 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--manifest', type=str, help="Path to a manifest file describing the mapping.", required=False)
-    parser.add_argument('--json', type=str, help="JSON file generated by CSVConvert.")
-    parser.add_argument('--input', type=str, help="Directory to the raw clinical data used for creating the JSON file.")
+    parser.add_argument('--json', type=str, help="_map.json file generated by CSVConvert.py.",
+                        required=True)
     parser.add_argument('--verbose', '--v', action="store_true", help="Print extra information")
+    # parser.add_argument('--manifest', type=str, help="Path to a manifest file describing the mapping.", required=False)
+    # parser.add_argument('--input', type=str, required=False, help="Directory to the raw clinical data used for creating the JSON file.")
     args = parser.parse_args()
     return args

@@ -197,7 +200,7 @@ def parse_args():
 # missing.append(comment_match.group(2))
 # print("\n".join(missing))


-def validate_coverage(map_json, input_path=None, verbose=False):
+def validate_coverage(map_json, verbose=False):
     if verbose:
         mappings.VERBOSE = True
@@ -205,8 +208,8 @@
     if "openapi_url" not in map_json:
         return {"message": "No openapi_url schema available"}
     schema = MoHSchema(map_json["openapi_url"])
-    if schema is None:
-        return {"message": f"Did not find an openapi schema at {map_json['openapi_url']}; please check link"}
+    if schema.json_schema is None:
+        sys.exit(f"Did not find an openapi schema at {map_json['openapi_url']}; please check the 'openapi_url' in the map json file.")

     # if --input was specified, we can check data frame completeness coverage:
     # if input_path is not None:
@@ -227,17 +230,20 @@
     }

 def main(args):
-    if args.json is not None and os.path.isfile(args.json):
+    try:
         with open(args.json) as fp:
             map_json = json.load(fp)
-    else:
-        print("A JSON file, generated by CSVConvert.py, is required, using the --json argument")
-        return
+        map_json['openapi_url']
+    except FileNotFoundError as e:
+        print(e)
+        sys.exit("JSON file not found at provided path, please check your --json argument.")
+    except KeyError as e:
+        sys.exit(f"No {e} key found in the provided map json, please check you are providing the right file and "
+                 "try again, it should end with '_map.json'.")

-    # manifest = args.manifest
-    input_path = args.input
+    # input_path = args.input
     verbose = True if args.verbose else False
-    result = validate_coverage(map_json, input_path, verbose)
+    result = validate_coverage(map_json, verbose)
     if len(result["warnings"]) > 0:
         print("Mapping has missing data:")
         for line in result["warnings"]:
@@ -249,5 +255,6 @@
         for line in result["errors"]:
             print(line)

+
 if __name__ == '__main__':
-    main(parse_args())
\ No newline at end of file
+    main(parse_args())
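Note on the custom mapping added in sample_inputs/new_cohort.py: cohort-specific functions follow the same contract as the built-in mappings, taking the raw cell value(s) for a field and returning a single schema-ready value. The sketch below is illustrative only, not part of the patch; it assumes the cohort module imports the helpers from mappings.py (single_val, is_null, both used elsewhere in this patch) and that the template would reference it against a hypothetical DONOR_SHEET.gender column via {sex(DONOR_SHEET.gender)}.

import mappings

def sex(data_value):
    # collapse the incoming values to a single entry, as the built-in mappings do
    mapping_val = mappings.single_val(data_value)
    if mappings.is_null(mapping_val):
        return None
    # translate the cohort's local terms into the codes the schema expects
    sex_dict = {
        'Female': 'F',
        'Male': 'M',
    }
    return sex_dict.get(mapping_val)

Listing the cohort module in the manifest (see the README for the manifest fields) lets CSVConvert.py pick the function up alongside the standard mapping functions when it processes the template.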