Merge pull request #2 from HudsonAlpha/multi_reference
Multi reference
holtjma authored Nov 18, 2021
2 parents 1dcc625 + 7c94b51 commit 8261d3e
Showing 11 changed files with 384 additions and 82 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -13,6 +13,7 @@ data/original_data
 pip3_freezes
 pipeline*
 results_b37/
+results
 scripts/.snakemake/
 scripts/CompareRTG.py
 scripts/__pycache__/
6 changes: 3 additions & 3 deletions README.md
@@ -15,9 +15,9 @@ The easiest way to invoke this pipeline is to use the provided conda environment
 Inside `PipelineConfig.py` is a set of information describing where files can be found within your environment. Generally, these values need to be set only once during the initial setup. The following are key variables that should be modified:
 
 #### Required
-1. `DATA_DIRECTORY` - The full path to a directory containing pipeline outputs. The pipeline assumes that variant files are located at `{DATA_DIRECTORY}/variant_calls/{aligner}/{caller}/{sample}.vcf.gz`. Similarly, it assumes that RTG VCFEval outputs are located at `{DATA_DIRECTORY}/rtg_results/{aligner}/{caller}/{sample}`. Both of these are required for the pipeline to run successfully. NOTE: if using a variant caller that is not specified in `model_metrics.json`, additional steps may be required (see section "Feature Extraction" for details).
-2. `ALIGNERS`, `VARIANT_CALLERS` - Each of these is a list of aligners or variant callers that will be used to populate the paths above (`{aligner}` or `{caller}`). Every combination of the two lists will be used.
-3. `FULL_PIPES` - similar to `ALIGNERS` and `VARIANT_CALLERS`, but this one allows for specific tuple pairs (i.e. instead of every combination).
+1. `DATA_DIRECTORY` - The full path to a directory containing pipeline outputs. The pipeline assumes that variant files are located at `{DATA_DIRECTORY}/variant_calls/{reference}/{aligner}/{caller}/{sample}.vcf.gz`. Similarly, it assumes that RTG VCFEval outputs are located at `{DATA_DIRECTORY}/rtg_results/{reference}/{aligner}/{caller}/{sample}`. Both of these are required for the pipeline to run successfully. NOTE: if using a variant caller that is not specified in `model_metrics.json`, additional steps may be required (see section "Feature Extraction" for details).
+2. `REFERENCES`, `ALIGNERS`, `VARIANT_CALLERS` - Each of these is a list of references, aligners, or variant callers that will be used to populate the paths above (e.g. `{reference}/{aligner}/{caller}`). Every combination of the three lists will be used.
+3. `FULL_PIPES` - similar to the above options, but this one allows for specific tuple triplets (i.e. instead of every combination); see the config sketch below.
 
 #### Optional
 1. `THREADS_PER_PROCS` - positive integer, the number of threads to use for training and testing
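For illustration, the three Required variables from the updated README can be sketched as a minimal, hypothetical `PipelineConfig.py` fragment. The directory path and the `grch38`/`bwa-mem-0.7.17-BQSR` labels below are placeholders, not values shipped with this commit; the `FULL_PIPES` triplet mirrors the one enabled in this commit's config.

```python
#hypothetical PipelineConfig.py sketch; directory and label values are placeholders
DATA_DIRECTORY = '/path/to/pipeline'  #expects variant_calls/ and rtg_results/ beneath it

#every combination of these three lists is used to fill {reference}/{aligner}/{caller}
REFERENCES = ['grch38']
ALIGNERS = ['bwa-mem-0.7.17-BQSR']
VARIANT_CALLERS = ['strelka-2.9.10']

#explicit (reference, aligner, caller) triplets, for running specific pipelines
# without enumerating every combination
FULL_PIPES = [
    ('clinical', 'sentieon-201808.07', 'strelka-2.9.10'),
]
```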
File renamed without changes.
File renamed without changes.
File renamed without changes.
28 changes: 15 additions & 13 deletions scripts/ExtractFeatures.py
@@ -252,18 +252,19 @@ def gatherVcfMetrics(vcfFN, rawVCF, metrics):
 
     return ret, fields
 
-def extractFeatures(aligner, caller, samples, outPrefix):
+def extractFeatures(reference, aligner, caller, samples, outPrefix):
     '''
     This will run the feature extraction after identifying the appropriate inputs
+    @param reference - the reference label that was used upstream
     @param aligner - the aligner that was used upstream
     @param caller - the caller that was used upstream
     @param samples - a file or string corresponding to the samples (see parseSlids(...) function)
     @param outPrefix - prefix for the extracted feature output
     @return - None
     '''
     #0 - constants based on input
-    RTG_ROOT = '%s/rtg_results/%s/%s' % (DATA_DIRECTORY, aligner, caller)
-    VCF_ROOT = '%s/variant_calls/%s/%s' % (DATA_DIRECTORY, aligner, caller)
+    RTG_ROOT = '%s/rtg_results/%s/%s/%s' % (DATA_DIRECTORY, reference, aligner, caller)
+    VCF_ROOT = '%s/variant_calls/%s/%s/%s' % (DATA_DIRECTORY, reference, aligner, caller)
     OUTPUT_ROOT = outPrefix
     CLASSIFICATION_TYPES = ['fp', 'tp']
     VCF_METRICS = ALL_METRICS[caller]
@@ -274,8 +275,8 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     #1 - load the samples
     SAMPLES = parseSlids(samples)
 
-    #2 - make sure results exist for each aligner-caller-sample triplet
-    acDir = '%s/%s/%s' % (OUTPUT_ROOT, aligner, caller)
+    #2 - make sure results exist for each reference-aligner-caller-sample quad
+    acDir = '%s/%s/%s/%s' % (OUTPUT_ROOT, reference, aligner, caller)
     if not os.path.exists(acDir):
         os.makedirs(acDir)
 
@@ -296,8 +297,8 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     #3 - reformat results into feature sets as necessary
     for sample in SAMPLES:
         for ct in CLASSIFICATION_TYPES:
-            outNumpyFN = '%s/%s/%s/%s_%s.npy' % (OUTPUT_ROOT, aligner, caller, sample, ct)
-            outCatFN = '%s/%s/%s/%s_%s_fields.json' % (OUTPUT_ROOT, aligner, caller, sample, ct)
+            outNumpyFN = '%s/%s/%s/%s/%s_%s.npy' % (OUTPUT_ROOT, reference, aligner, caller, sample, ct)
+            outCatFN = '%s/%s/%s/%s/%s_%s_fields.json' % (OUTPUT_ROOT, reference, aligner, caller, sample, ct)
             if not os.path.exists(outNumpyFN) or REGENERATE:
                 print('Building "%s"...' % (outNumpyFN, ))
                 rtgVcfFN = '%s/%s/%s.vcf.gz' % (RTG_ROOT, sample, ct)
@@ -319,15 +320,15 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     fpList = []
 
     #load the fields from the first file
-    fp = open('%s/%s/%s/%s_tp_fields.json' % (OUTPUT_ROOT, aligner, caller, SAMPLES[0]))
+    fp = open('%s/%s/%s/%s/%s_tp_fields.json' % (OUTPUT_ROOT, reference, aligner, caller, SAMPLES[0]))
     fieldsList = json.load(fp)
     fp.close()
 
     #get data from each sample
     for sample in SAMPLES:
         #TP first
-        tpFN = '%s/%s/%s/%s_tp.npy' % (OUTPUT_ROOT, aligner, caller, sample)
-        tpOrder = '%s/%s/%s/%s_tp_fields.json' % (OUTPUT_ROOT, aligner, caller, sample)
+        tpFN = '%s/%s/%s/%s/%s_tp.npy' % (OUTPUT_ROOT, reference, aligner, caller, sample)
+        tpOrder = '%s/%s/%s/%s/%s_tp_fields.json' % (OUTPUT_ROOT, reference, aligner, caller, sample)
         fp = open(tpOrder, 'r')
         tpFields = json.load(fp)
         fp.close()
@@ -336,8 +337,8 @@ def extractFeatures(aligner, caller, samples, outPrefix):
         tpList.append(tpVar)
 
         #now false positives
-        fpFN = '%s/%s/%s/%s_fp.npy' % (OUTPUT_ROOT, aligner, caller, sample)
-        fpOrder = '%s/%s/%s/%s_fp_fields.json' % (OUTPUT_ROOT, aligner, caller, sample)
+        fpFN = '%s/%s/%s/%s/%s_fp.npy' % (OUTPUT_ROOT, reference, aligner, caller, sample)
+        fpOrder = '%s/%s/%s/%s/%s_fp_fields.json' % (OUTPUT_ROOT, reference, aligner, caller, sample)
         fp = open(fpOrder, 'r')
         fpFields = json.load(fp)
         fp.close()
@@ -360,6 +361,7 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     #p.add_argument('-d', '--date-subdir', dest='date_subdir', default=None, help='the date subdirectory (default: "hli-YYMMDD")')
 
     #required main arguments
+    p.add_argument('reference', type=str, help='the reference to train on')
     p.add_argument('aligner', type=str, help='the aligner to train on')
     p.add_argument('caller', type=str, help='the variant caller to train on')
     p.add_argument('sample_metadata', type=str, help='a file with sample identifiers')
@@ -368,4 +370,4 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     #parse the arguments
     args = p.parse_args()
 
-    extractFeatures(args.aligner, args.caller, args.sample_metadata, args.output_prefix)
+    extractFeatures(args.reference, args.aligner, args.caller, args.sample_metadata, args.output_prefix)
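To make the path change concrete, here is a small sketch (not part of the commit) of how the new reference-aware directories resolve; the `DATA_DIRECTORY` value and sample identifier are placeholders, while the triplet matches the one in this commit's `FULL_PIPES`:

```python
#sketch only: demonstrates the new {reference}/{aligner}/{caller} layout
DATA_DIRECTORY = '/path/to/pipeline'  #placeholder
reference, aligner, caller = 'clinical', 'sentieon-201808.07', 'strelka-2.9.10'
sample = 'SL000001'                   #placeholder sample identifier

RTG_ROOT = '%s/rtg_results/%s/%s/%s' % (DATA_DIRECTORY, reference, aligner, caller)
VCF_FN = '%s/variant_calls/%s/%s/%s/%s.vcf.gz' % (DATA_DIRECTORY, reference, aligner, caller, sample)

print(RTG_ROOT)  #/path/to/pipeline/rtg_results/clinical/sentieon-201808.07/strelka-2.9.10
print(VCF_FN)    #/path/to/pipeline/variant_calls/clinical/sentieon-201808.07/strelka-2.9.10/SL000001.vcf.gz
```

Correspondingly, the command-line invocation now takes the reference first: `python3 scripts/ExtractFeatures.py <reference> <aligner> <caller> <sample_metadata> <output_prefix>`.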
19 changes: 12 additions & 7 deletions scripts/PipelineConfig.py
@@ -3,7 +3,7 @@
 #Core pipeline config
 ############################################################
 #this is the repo directory
-REPO_DIRECTORY = '/gpfs/gpfs1/home/jholt/sanger_less_tests'
+REPO_DIRECTORY = '/cluster/home/jholt/sanger_less_tests'
 
 #a dictionary containing metadata for the samples
 SAMPLE_JSONS = [
@@ -16,10 +16,15 @@
 # the following files
 # {DATA_DIRECTORY}/variant_calls/{aligner}/{caller}/{sample}.vcf.gz - the expected VCF file paths
 # {DATA_DIRECTORY}/rtg_results/{aligner}/{caller}/{sample} - folder containing standard RTG results, specifically summary.txt, fp.vcf.gz, and tp.vcf.gz
-DATA_DIRECTORY = '/gpfs/gpfs1/home/jholt/csl_validations/core_pipeline_analysis/pipeline'
+#DATA_DIRECTORY = '/cluster/home/jholt/csl_validations/core_pipeline_analysis/pipeline'
+DATA_DIRECTORY = '/cluster/home/jholt/CSL_pipeline_benchmark/pipeline'
 
-#this is where you specify the pipelines to run; every ALIGNER and VARIANT_CALLER combo will be run together
-# pair-wise and used to identify batches of files for training/testing
+#this is where you specify the pipelines to run; every REFERENCE, ALIGNER, and VARIANT_CALLER combo will be run together
+# and each triplet gets its own model trained for it
+REFERENCES = [
+    #'clinical'
+]
+
 ALIGNERS = [
     #'bwa-mem-0.7.17-BQSR'
 ]
@@ -32,7 +37,7 @@
 FULL_PIPES = [
     #('dragen-07.011.352.3.2.8b', 'dragen-07.011.352.3.2.8b'), #this is what we originally trained with
     #('dragen-07.021.510.3.5.7', 'dragen-07.021.510.3.5.7'), #this is what follow ups were on
-    ('clinical_sentieon-201808.07', 'strelka-2.9.10')
+    ('clinical', 'sentieon-201808.07', 'strelka-2.9.10')
 ]
 
 #pipeline parameters, thread count currently only applies to the actual model training (CV to be precise)
@@ -43,7 +48,7 @@
 ############################################################
 
 #only necessary if you plan to generate a data summary (i.e. the Supplementary Document)
-LATEX_PATH = '/gpfs/gpfs1/home/jholt/texlive/2019/bin/x86_64-linux/pdflatex'
+LATEX_PATH = '/cluster/home/jholt/texlive/2019/bin/x86_64-linux/pdflatex'
 
 ############################################################
 #Slack config
@@ -54,5 +59,5 @@
 
 #json with sub-dictionary 'urls' where key is a channel name (such as the value in ENABLED_SLACK_CHANNEL)
 # and value is a Slack endpoint URL
-ENABLED_SLACK_URLS = '/gpfs/gpfs1/home/jholt/slack_integration/data/slack_urls.json'
+ENABLED_SLACK_URLS = '/cluster/home/jholt/slack_integration/data/slack_urls.json'
 ENABLED_SLACK_CHANNEL = "@holtjma"
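For context, a sketch of how downstream code might enumerate the configured pipelines (assumed behavior, not code from this commit): every combination of the three lists plus each explicit `FULL_PIPES` triplet yields one (reference, aligner, caller) pipeline, and each triplet gets its own model.

```python
import itertools

#assumed example values mirroring the config above
REFERENCES = ['clinical']
ALIGNERS = ['bwa-mem-0.7.17-BQSR']
VARIANT_CALLERS = ['strelka-2.9.10']
FULL_PIPES = [('clinical', 'sentieon-201808.07', 'strelka-2.9.10')]

#every cross-product triplet, plus the explicitly listed ones
pipelines = list(itertools.product(REFERENCES, ALIGNERS, VARIANT_CALLERS)) + FULL_PIPES
for reference, aligner, caller in pipelines:
    print('training model for %s/%s/%s' % (reference, aligner, caller))
```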
