From 64b2d3b13ffeb51be5a5a3310d17b4fb72919e7b Mon Sep 17 00:00:00 2001 From: holtjma Date: Tue, 3 May 2022 10:39:13 -0500 Subject: [PATCH 01/16] changes necessary to get feature extracton on cyvcf2 and training to run --- scripts/ExtractFeatures.py | 113 ++++++++++++++++++++++--------------- scripts/PipelineConfig.py | 18 ++++-- scripts/TrainModels.py | 2 +- scripts/cluster.json | 1 - scripts/model_metrics.json | 13 +++++ 5 files changed, 95 insertions(+), 52 deletions(-) diff --git a/scripts/ExtractFeatures.py b/scripts/ExtractFeatures.py index de8ef02..4f8c33d 100644 --- a/scripts/ExtractFeatures.py +++ b/scripts/ExtractFeatures.py @@ -5,7 +5,7 @@ import numpy as np import os import re -import vcf +import cyvcf2 from PipelineConfig import DATA_DIRECTORY, REPO_DIRECTORY from RunTrainingPipeline import parseSlids @@ -47,43 +47,44 @@ assert(k not in ALL_METRICS) ALL_METRICS[k] = ALL_METRICS[UNPARSED_METRICS['copied'][k]] -def getVariantFeatures(variant, sampleID, fields, rawReader, allowHomRef=False): +def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=False): ''' This function takes a variant from a VCF and a sample ID and extracts the features for it @param variant - the variant (from "vcfReader") - @param sampleID - the sample in the VCF file + @param sample_index - the sample index in the VCF file (0-based) @param fields - the fields to extract from the variant @param rawReader - the raw VCF file @param allowHomRef - if True, this will allow 0/0 calls (otherwise, it throws an error) @return - a list of ordered features ''' annots = [] - - #first, annotate whether it's a SNP or an indel - callStats = variant.genotype(sampleID) - if '.' in callStats['GT']: - gtPieces = ['0', '0'] - else: - gtPieces = re.split('[/|]', callStats['GT']) - if len(gtPieces) != 2: + #first, annotate whether it's a SNP or an indel + gt_pieces = variant.genotypes[sample_index] + if len(gt_pieces) != 3: #TODO: only a problem in strelka, how to handle it? - return None + #return None + raise Exception(f'Unexpected GT field: {gt_pieces}') + + #explicitly store phasing and sort the GT field (this is important for ref/alt ordering) + is_phased = gt_pieces[-1] + gt_pieces = sorted(gt_pieces[0:-1]) if not allowHomRef: - assert(gtPieces[0] != '0' or gtPieces[1] != '0') + assert(gt_pieces[0] != 0 or gt_pieces[1] != 0) #look at the variant call - s1 = (variant.REF if gtPieces[0] == '0' else variant.ALT[int(gtPieces[0])-1]) - s2 = (variant.REF if gtPieces[1] == '0' else variant.ALT[int(gtPieces[1])-1]) + s1 = (variant.REF if gt_pieces[0] == 0 else variant.ALT[int(gt_pieces[0])-1]) + s2 = (variant.REF if gt_pieces[1] == 0 else variant.ALT[int(gt_pieces[1])-1]) + if str(s1) == '' or str(s2) == '': #ignore this one, it's something we can't figure out from a GVCF #TODO: do we feel like ever changing this? 
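        #note: callers need to tolerate this None result (e.g. gatherVcfMetrics now asserts on
        # len(annots) before its None check, so this early return would currently trip that assert
        # with a TypeError rather than being silently skipped)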
return None #pull this out for use by all the AD stat fields - callAD = callStats['AD'] - adSum = sum(callAD) + call_ad = variant.format('AD')[sample_index] + ad_sum = sum(call_ad) for k, subk in fields: if k == 'VAR': @@ -102,53 +103,59 @@ def getVariantFeatures(variant, sampleID, fields, rawReader, allowHomRef=False): #these are call specific measures # all sub-keys default to a FLOAT interpretation if not specifically handled (see "else" clause) if subk == 'GT': - if gtPieces[0] == gtPieces[1]: - if gtPieces[0] == '0': + if gt_pieces[0] == gt_pieces[1]: + if gt_pieces[0] == 0: val = GT_REF_HOM else: val = GT_ALT_HOM - elif gtPieces[0] == '0' or gtPieces[1] == '0': + elif gt_pieces[0] == 0 or gt_pieces[1] == 0: val = GT_REF_HET else: val = GT_HET_HET elif subk == 'AD0': - val = (callAD[int(gtPieces[0])] if adSum > 0 else DEFAULT_MISSING) + val = (call_ad[int(gt_pieces[0])] if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'AD1': - val = (callAD[int(gtPieces[1])] if adSum > 0 else DEFAULT_MISSING) + val = (call_ad[int(gt_pieces[1])] if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'ADO': - if gtPieces[0] == gtPieces[1]: + if gt_pieces[0] == gt_pieces[1]: #homozygous, pull AD once - adUsed = callAD[int(gtPieces[0])] + ad_used = call_ad[int(gt_pieces[0])] else: #heterozygous, get both AD vals - adUsed = callAD[int(gtPieces[0])] + callAD[int(gtPieces[1])] + ad_used = call_ad[int(gt_pieces[0])] + call_ad[int(gt_pieces[1])] #AD-other is the total AD minus the GT AD - val = ((adSum - adUsed) if adSum > 0 else DEFAULT_MISSING) + val = ((ad_sum - ad_used) if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'AF0': - val = (callAD[int(gtPieces[0])] / adSum if adSum > 0 else DEFAULT_MISSING) + val = (call_ad[int(gt_pieces[0])] / ad_sum if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'AF1': - val = (callAD[int(gtPieces[1])] / adSum if adSum > 0 else DEFAULT_MISSING) + val = (call_ad[int(gt_pieces[1])] / ad_sum if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'AFO': - if gtPieces[0] == gtPieces[1]: + if gt_pieces[0] == gt_pieces[1]: #homozygous, pull AD once - adUsed = callAD[int(gtPieces[0])] + adUsed = call_ad[int(gt_pieces[0])] else: #heterozygous, get both AD vals - adUsed = callAD[int(gtPieces[0])] + callAD[int(gtPieces[1])] + adUsed = call_ad[int(gt_pieces[0])] + call_ad[int(gt_pieces[1])] #AD-other is the total AD minus the GT AD - val = ((adSum - adUsed) / adSum if adSum > 0 else DEFAULT_MISSING) + val = ((ad_sum - ad_used) / ad_sum if ad_sum > 0 else DEFAULT_MISSING) + + elif subk == 'min(PL)': + #this is the phred likelihoods, one of which should always be zero + pl_values = list(variant.format('PL')[sample_index]) + pl_values.remove(0) + val = min(pl_values) else: try: - val = float(callStats[subk]) + val = float(variant.format(subk)[sample_index]) except AttributeError as e: #TODO: is this okay for everything? 
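                    #note: with cyvcf2, a missing FORMAT tag is not guaranteed to raise AttributeError
                    # (variant.format(...) may return None, or raise KeyError in some versions), so only
                    # some failure modes fall through to DEFAULT_MISSING below; a more defensive sketch
                    # might look like:
                    #   fmt_vals = variant.format(subk)
                    #   val = float(fmt_vals[sample_index]) if fmt_vals is not None else DEFAULT_MISSING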
val = DEFAULT_MISSING @@ -157,7 +164,7 @@ def getVariantFeatures(variant, sampleID, fields, rawReader, allowHomRef=False): elif k == 'INFO': #get from the INFO column # all values here default to a FLOAT interpretation - val = (float(variant.INFO[subk]) if (subk in variant.INFO) else DEFAULT_MISSING) + val = float(variant.INFO.get(subk, DEFAULT_MISSING)) annots.append(val) elif k == 'MUNGED': @@ -184,17 +191,27 @@ def getVariantFeatures(variant, sampleID, fields, rawReader, allowHomRef=False): #search the raw VCF for nearby calls nearbyFound = 0 BUFFER = 20 - for rawVariant in rawReader.fetch(variant.CHROM, variant.POS-BUFFER, variant.POS+BUFFER): + for rawVariant in rawReader(f'{variant.CHROM}:{variant.POS-BUFFER}-{variant.POS+BUFFER}'): if rawVariant.POS != variant.POS: - rawData = rawVariant.genotype(sampleID) - if rawData['GT'] not in ['./.', '0/0']: + raw_gt = rawVariant.genotypes[sample_index] + assert(len(raw_gt) == 3) + if raw_gt[0] > 0 or raw_gt[1] > 0: nearbyFound += 1 annots.append(nearbyFound) elif subk == 'FILTER': - #store the number of filters applied (PASS doesn't show up in these lists) - annots.append(len(variant.FILTER)) + #store the number of filters applied + if variant.FILTERS: + if 'PASS' in variant.FILTERS: + #PASS can show up here, if it exist we want to remove it from the count + val = len(variant.FILTERS)-1 + else: + #else just count up all the filters + val = len(variant.FILTERS) + else: + val = 0 + annots.append(val) elif subk == 'ID': #this is basically a boolean flag capturing if the variant has an ID annotation @@ -223,13 +240,13 @@ def gatherVcfMetrics(vcfFN, rawVCF, metrics): fields = [] #open up the VCF file - vcfReader = vcf.Reader(filename=vcfFN, compressed=True) - rawReader = vcf.Reader(filename=rawVCF, compressed=True) + vcfReader = cyvcf2.VCF(vcfFN) + rawReader = cyvcf2.VCF(rawVCF) - #get the sample + #get the sample, right now we are enforcing one sample per VCF samples = vcfReader.samples assert(len(samples) == 1) - sampleID = samples[0] + sample_index = 0 #fill out the fields; always include a variant type field fields.append(('VAR', 'TYPE')) @@ -243,7 +260,15 @@ def gatherVcfMetrics(vcfFN, rawVCF, metrics): #now we go through for variant in vcfReader: - annots = getVariantFeatures(variant, sampleID, fields, rawReader) + annots = getVariantFeatures(variant, sample_index, fields, rawReader) + assert(len(annots) == len(fields)) + ''' + print(annots) + for i, f in enumerate(fields): + print(i, f, annots[i], sep='\t') + print() + exit() + ''' if annots != None: ret.append(annots) if len(ret) % 100000 == 0: diff --git a/scripts/PipelineConfig.py b/scripts/PipelineConfig.py index 77428d5..00a98a2 100644 --- a/scripts/PipelineConfig.py +++ b/scripts/PipelineConfig.py @@ -1,15 +1,20 @@ +#this is a module we use for storing all our paths systematically, aka a bunch of strings +from csl_pipeline.config import ( + JHOLT_HOME +) + ############################################################ #Core pipeline config ############################################################ #this is the repo directory -REPO_DIRECTORY = '/cluster/home/jholt/sanger_less_tests' +REPO_DIRECTORY = f'{JHOLT_HOME}/sanger_less_tests' #a dictionary containing metadata for the samples SAMPLE_JSONS = [ #'%s/scripts/GIAB_v1.json' % REPO_DIRECTORY #'%s/scripts/GIAB_v2.json' % REPO_DIRECTORY - '%s/scripts/GIAB_pcrfree.json' % REPO_DIRECTORY + f'{REPO_DIRECTORY}/scripts/GIAB_pcrfree.json' ] #root directory for RTG-based analysis which is expected to conform to a standard directory structure, 
and contain @@ -17,7 +22,7 @@ # {DATA_DIRECTORY}/variant_calls/{aligner}/{caller}/{sample}.vcf.gz - the expected VCF file paths # {DATA_DIRECTORY}/rtg_results/{aligner}/{caller}/{sample} - folder containing standard RTG results, specifically summary.txt, fp.vcf.gz, and tp.vcf.gz #DATA_DIRECTORY = '/cluster/home/jholt/csl_validations/core_pipeline_analysis/pipeline' -DATA_DIRECTORY = '/cluster/home/jholt/CSL_pipeline_benchmark/pipeline' +DATA_DIRECTORY = f'{JHOLT_HOME}/CSL_pipeline_benchmark/pipeline' #this is where you specify the pipelines to run; every REFERENCE, ALIGNER, and VARIANT_CALLER combo will be run together # and each triplet gets it's own model trained for it @@ -37,7 +42,8 @@ FULL_PIPES = [ #('dragen-07.011.352.3.2.8b', 'dragen-07.011.352.3.2.8b'), #this is what we originally trained with #('dragen-07.021.510.3.5.7', 'dragen-07.021.510.3.5.7'), #this is what follow ups were on - ('clinical', 'sentieon-201808.07', 'strelka-2.9.10') + #('clinical', 'sentieon-201808.07', 'strelka-2.9.10') + ('hg38_T2T_masked', 'sentieon-202112.01', 'dnascope-1.0-202112.01-PO') ] #pipeline parameters, thread count currently only applies to the actual model training (CV to be precise) @@ -48,7 +54,7 @@ ############################################################ #only necessary if you plan to generate a data summary (i.e. the Supplementary Document) -LATEX_PATH = '/cluster/home/jholt/texlive/2019/bin/x86_64-linux/pdflatex' +LATEX_PATH = f'{JHOLT_HOME}/texlive/2019/bin/x86_64-linux/pdflatex' ############################################################ #Slack config @@ -59,5 +65,5 @@ #json with sub-dictionary 'urls' where key is a channel name (such as the value in ENABLED_SLACK_CHANNEL) # and value is a Slack endpoint URL -ENABLED_SLACK_URLS = '/cluster/home/jholt/slack_integration/data/slack_urls.json' +ENABLED_SLACK_URLS = f'{JHOLT_HOME}/slack_integration/data/slack_urls.json' ENABLED_SLACK_CHANNEL = "@holtjma" diff --git a/scripts/TrainModels.py b/scripts/TrainModels.py index ac21ef5..3ab979e 100644 --- a/scripts/TrainModels.py +++ b/scripts/TrainModels.py @@ -319,7 +319,7 @@ def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, co ) else: raise Exception('NO_IMPL') - gsClf.fit(train_X, train_Y, train_groups) + gsClf.fit(train_X, train_Y, groups=train_groups) print('[%s]\t\tBest params: %s' % (str(datetime.datetime.now()), gsClf.best_params_)) print('[%s]\tGathering CV results...'% (str(datetime.datetime.now()), )) diff --git a/scripts/cluster.json b/scripts/cluster.json index 38aed1a..6a94506 100644 --- a/scripts/cluster.json +++ b/scripts/cluster.json @@ -1,7 +1,6 @@ { "__default__": { "queue" : "csl", - "logdir" : "/gpfs/gpfs1/home/jholt/sanger_less_tests/pipeline/logs/cluster", "memory" : 8000, "mem_mb" : 8000 } diff --git a/scripts/model_metrics.json b/scripts/model_metrics.json index 473d73d..f59cda2 100644 --- a/scripts/model_metrics.json +++ b/scripts/model_metrics.json @@ -32,6 +32,19 @@ "MUNGED" : [ "QUAL", "NEARBY", "FILTER", "ID" ] + }, + "dnascope-1.0-202112.01-PO" : { + "CALL" : [ + "AD0", "AD1", "ADO", "AF0", "AF1", "AFO", "GT", "DP", "GQ", "min(PL)" + ], + "INFO" : [ + "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", + "FS", "GC", "ML_PROB", "MQ", "MQ0", + "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", "SOR", "STR", "UDP" + ], + "MUNGED" : [ + "QUAL", "NEARBY", "FILTER", "ID" + ] } }, "copied" : { From 19419510eb5bb2d31fe92597c10d469b177b7a3b Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 4 May 2022 07:31:14 -0500 Subject: 
[PATCH 02/16] temp backup --- scripts/TrainingConfig.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 7a3c2c9..b5b5f78 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -25,7 +25,7 @@ SUBSET_SIZE = 10000 #if True, manually remove features in MANUAL_FS_LABELS (these are historically unimportant features) -MANUAL_FS = True +MANUAL_FS = False MANUAL_FS_LABELS = ['CALL-ADO', 'CALL-AFO'] #if True, mark false positive as true positives and vice versa From 7245fd5eacf23bc01e3816d64f9b32eb2978d3c5 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 4 May 2022 09:04:46 -0500 Subject: [PATCH 03/16] removing this fixes stats, this will need to be full patched --- scripts/EvaluateVariants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/EvaluateVariants.py b/scripts/EvaluateVariants.py index e61af3d..11d3c4b 100644 --- a/scripts/EvaluateVariants.py +++ b/scripts/EvaluateVariants.py @@ -5,7 +5,7 @@ import numpy as np import pickle import re -import vcf +#import vcf from ExtractFeatures import ALL_METRICS, getVariantFeatures, GT_TRANSLATE, VAR_TRANSLATE From 8b7e511e396aa97250f032ec9832172d1ae6ef0c Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 4 May 2022 09:44:56 -0500 Subject: [PATCH 04/16] fixes for ref/al/caller setup --- scripts/ExtractELI5Results.py | 2 +- scripts/GenerateSupplement.py | 39 +++++++++++++++++++---------------- scripts/PipelineConfig.py | 2 +- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/scripts/ExtractELI5Results.py b/scripts/ExtractELI5Results.py index 27b325b..248d482 100644 --- a/scripts/ExtractELI5Results.py +++ b/scripts/ExtractELI5Results.py @@ -60,7 +60,7 @@ def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): coreFeatureNames = [tuple(f.split('-')) for f in allModels[k][bestModelName]['FEATURES']] #get the weights and add it to the result - weights = eli5.explain_weights(clf, feature_names=['-'.join(cfn) for cfn in coreFeatureNames]) + weights = eli5.explain_weights(clf, feature_names=['-'.join(cfn) for cfn in coreFeatureNames], top=None) dictForm = eli5.formatters.as_dict.format_as_dict(weights) ret[reformKey] = { 'best_model' : bestModelName, diff --git a/scripts/GenerateSupplement.py b/scripts/GenerateSupplement.py index 1122075..c5611fe 100644 --- a/scripts/GenerateSupplement.py +++ b/scripts/GenerateSupplement.py @@ -33,22 +33,23 @@ def loadMetadata(): md.update(d) return md -def getModelResults(datasets, pipeline, aligner, caller): +def getModelResults(datasets, pipeline, reference, aligner, caller): ''' This will pull ALL the results for an aligner/caller combination and put it in a dictionary to be later consumed by Jinja2. 
@param pipeline - the pipeline subdirectory + @param reference - the reference to get information for @param aligner - the aligner to get information for @param caller - the caller to get information for @return - a dictionary values for use by Jinja2 ''' #get all the RTG-specific results for this aligner/caller combo - rtgResults = getRtgResults(datasets, pipeline, aligner, caller) + rtgResults = getRtgResults(datasets, pipeline, reference, aligner, caller) #get all the training results for this aligner/caller combo - trainingResults = getTrainingResults(pipeline, aligner, caller) - strictResults = getTrainingResults(pipeline, aligner, caller, True) - eli5Results = getEli5Results(pipeline, aligner, caller) + trainingResults = getTrainingResults(pipeline, reference, aligner, caller) + strictResults = getTrainingResults(pipeline, reference, aligner, caller, True) + eli5Results = getEli5Results(pipeline, reference, aligner, caller) retDict = { 'ALIGNER' : aligner, @@ -60,17 +61,18 @@ def getModelResults(datasets, pipeline, aligner, caller): } return retDict -def getRtgResults(datasets, pipeline, aligner, caller): +def getRtgResults(datasets, pipeline, reference, aligner, caller): ''' This will load results specific to the RTG evaluation, things like number of TP, TN, etc. @param pipeline - the pipeline subdirectory (usually "pipeline") + @param reference - the reference to get information for @param aligner - the aligner to get information for @param caller - the caller to get information for @return - a dictionary values for use by Jinja2 ''' retDict = {} - rtgSummaryPattern = '%s/rtg_results/%s/%s/*/summary.txt' % (DATA_DIRECTORY, aligner, caller) + rtgSummaryPattern = '%s/rtg_results/%s/%s/%s/*/summary.txt' % (DATA_DIRECTORY, reference, aligner, caller) rtgMetrics = sorted(glob.glob(rtgSummaryPattern)) retDict['SAMPLE_SUMMARY'] = {} totalDict = {} @@ -103,7 +105,7 @@ def getRtgResults(datasets, pipeline, aligner, caller): retDict['TOTAL_SUMMARY'] = totalDict #get the feature stats - summaryFN = '%s/%s/feature_stats/%s/%s/feature_stats.tsv' % (REPO_DIRECTORY, pipeline, aligner, caller) + summaryFN = '%s/%s/feature_stats/%s/%s/%s/feature_stats.tsv' % (REPO_DIRECTORY, pipeline, reference, aligner, caller) if os.path.exists(summaryFN): featureDict = {} fp = open(summaryFN, 'r') @@ -173,10 +175,11 @@ def loadRtgSummary(fn): resultsDict[h] = val return resultsDict -def getTrainingResults(pipeline, aligner, caller, strict=False): +def getTrainingResults(pipeline, reference, aligner, caller, strict=False): ''' This will retrieve training results for the specified aligner/caller combo @param pipeline - the pipeline subdirectory + @param reference - the reference specified @param aligner - the aligner specified @param caller - the caller specified @param strict - if True, load the strict version of the results @@ -184,9 +187,9 @@ def getTrainingResults(pipeline, aligner, caller, strict=False): ''' #pull the summary results and parse the clinical fragments if not strict: - summaryFN = '%s/%s/model_summaries/%s/%s/model_summary.tsv' % (REPO_DIRECTORY, pipeline, aligner, caller) + summaryFN = '%s/%s/model_summaries/%s/%s/%s/model_summary.tsv' % (REPO_DIRECTORY, pipeline, reference, aligner, caller) else: - summaryFN = '%s/%s/model_summaries/%s/%s/strict_summary.tsv' % (REPO_DIRECTORY, pipeline, aligner, caller) + summaryFN = '%s/%s/model_summaries/%s/%s/%s/strict_summary.tsv' % (REPO_DIRECTORY, pipeline, reference, aligner, caller) clinicalResults = {} fp = open(summaryFN, 'r') @@ -244,7 +247,7 @@ def 
getTrainingResults(pipeline, aligner, caller, strict=False): continue reformKey = VAR_TRANSLATE[vt]+'_'+GT_TRANSLATE[gt] - imageFN = '%s/%s/model_summaries/%s/%s/roc_curves/%s.png' % (REPO_DIRECTORY, pipeline, aligner, caller, reformKey) + imageFN = '%s/%s/model_summaries/%s/%s/%s/roc_curves/%s.png' % (REPO_DIRECTORY, pipeline, reference, aligner, caller, reformKey) if os.path.exists(imageFN): imageDict[reformKey] = imageFN else: @@ -258,7 +261,7 @@ def getTrainingResults(pipeline, aligner, caller, strict=False): } return ret -def getEli5Results(pipeline, aligner, caller): +def getEli5Results(pipeline, reference, aligner, caller): ''' This will retrieve ELI5 results if available @param pipeline - the pipeline subdirectory @@ -266,7 +269,7 @@ def getEli5Results(pipeline, aligner, caller): @param caller - the caller specified @return - a dictionary of metrics for this combo ''' - jsonFN = '%s/%s/eli5_summaries/%s/%s/model_eli5.json' % (REPO_DIRECTORY, pipeline, aligner, caller) + jsonFN = '%s/%s/eli5_summaries/%s/%s/%s/model_eli5.json' % (REPO_DIRECTORY, pipeline, reference, aligner, caller) #catch the situation where we didn't run ELI5 for w/e reason ret = {} @@ -442,15 +445,15 @@ def generateReport(dataDict, prefix): for fullPipe in FULL_PIPES: combos.append(fullPipe) - for aligner, caller in combos: + for reference, aligner, caller in combos: #check for the summary file - summaryFN = '%s/%s/model_summaries/%s/%s/model_summary.tsv' % (REPO_DIRECTORY, args.pipeline, aligner, caller) + summaryFN = '%s/%s/model_summaries/%s/%s/%s/model_summary.tsv' % (REPO_DIRECTORY, args.pipeline, reference, aligner, caller) if not os.path.exists(summaryFN): - print('WARNING: Summary file for %s/%s was not found, re-run pipeline?') + print(f'WARNING: Summary file for {reference}/{aligner}/{caller} was not found, re-run pipeline?') continue #get the results - comboData = getModelResults(METADATA_DICT, args.pipeline, aligner, caller) + comboData = getModelResults(METADATA_DICT, args.pipeline, reference, aligner, caller) #rename comboData['ALIGNER_LABEL'] = ALIGNER_RENAMING.get(aligner, aligner) diff --git a/scripts/PipelineConfig.py b/scripts/PipelineConfig.py index 00a98a2..b2e17e6 100644 --- a/scripts/PipelineConfig.py +++ b/scripts/PipelineConfig.py @@ -14,7 +14,7 @@ SAMPLE_JSONS = [ #'%s/scripts/GIAB_v1.json' % REPO_DIRECTORY #'%s/scripts/GIAB_v2.json' % REPO_DIRECTORY - f'{REPO_DIRECTORY}/scripts/GIAB_pcrfree.json' + f'{REPO_DIRECTORY}/sample_metadata/GIAB_pcrfree.json' ] #root directory for RTG-based analysis which is expected to conform to a standard directory structure, and contain From 6d41e1b8160d5f02d87196e3cc1cb7527cba5a58 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 4 May 2022 12:30:16 -0500 Subject: [PATCH 05/16] first pass at pruning --- scripts/ExtractFeatures.py | 4 ++++ scripts/model_metrics.json | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/ExtractFeatures.py b/scripts/ExtractFeatures.py index 4f8c33d..4bf0ef2 100644 --- a/scripts/ExtractFeatures.py +++ b/scripts/ExtractFeatures.py @@ -219,6 +219,10 @@ def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=Fal else: raise Exception('Unhandled computed measurement: %s-%s' % (k, subk)) + elif k == 'IGNORE': + #this is just a dummy parameter we can ignore + #historically, these were tested and found to be uninformative to the models + pass else: raise Exception('Unexpected metric key: %s' % (k, )) diff --git a/scripts/model_metrics.json b/scripts/model_metrics.json index 
f59cda2..eaaa381 100644 --- a/scripts/model_metrics.json +++ b/scripts/model_metrics.json @@ -38,12 +38,15 @@ "AD0", "AD1", "ADO", "AF0", "AF1", "AFO", "GT", "DP", "GQ", "min(PL)" ], "INFO" : [ - "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", - "FS", "GC", "ML_PROB", "MQ", "MQ0", - "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", "SOR", "STR", "UDP" + "BaseQRankSum", "DP", "Entropy", "GC", "ML_PROB", "MQ", "MQ0", + "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", + "SOR", "UDP" ], "MUNGED" : [ - "QUAL", "NEARBY", "FILTER", "ID" + "DP_DP", "QUAL", "NEARBY", "ID" + ], + "IGNORE" : [ + "MUNGED-FILTER", "INFO-ExcessHet", "INFO-STR", "INFO-ClippingRankSum", "INFO-FS" ] } }, From 29ad64dc6884deff9605b9a81b504e69240109a2 Mon Sep 17 00:00:00 2001 From: holtjma Date: Thu, 5 May 2022 16:15:33 -0500 Subject: [PATCH 06/16] added configuration for feature selection --- scripts/ExtractFeatures.py | 19 +++++++++++---- scripts/RunTrainingPipeline.py | 4 ++++ scripts/TrainModels.py | 30 +++++++++++++++++++++-- scripts/TrainingConfig.py | 44 ++++++++++++++++++++++------------ scripts/model_metrics.json | 13 +++++----- 5 files changed, 81 insertions(+), 29 deletions(-) diff --git a/scripts/ExtractFeatures.py b/scripts/ExtractFeatures.py index 4bf0ef2..bd3e01b 100644 --- a/scripts/ExtractFeatures.py +++ b/scripts/ExtractFeatures.py @@ -43,6 +43,9 @@ ALL_METRICS = {} for k in UNPARSED_METRICS['defined']: ALL_METRICS[k] = UNPARSED_METRICS['defined'][k] + if 'IGNORE' in ALL_METRICS[k]: + #these are metrics we are tracking but ignoring; usually they were tested but uninformative for the models + del ALL_METRICS[k]['IGNORE'] for k in UNPARSED_METRICS['copied']: assert(k not in ALL_METRICS) ALL_METRICS[k] = ALL_METRICS[UNPARSED_METRICS['copied'][k]] @@ -165,6 +168,8 @@ def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=Fal #get from the INFO column # all values here default to a FLOAT interpretation val = float(variant.INFO.get(subk, DEFAULT_MISSING)) + if np.isnan(val): + val = DEFAULT_MISSING annots.append(val) elif k == 'MUNGED': @@ -173,7 +178,9 @@ def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=Fal #call depth / total variant depth; usually ~1.0 if single sample try: dpCall = callStats['DP'] - dpVar = variant.INFO['DP'] + dpVar = variant.INFO.get('DP', DEFAULT_MISSING) + if dpVar == 0 or dpVar == DEFAULT_MISSING: + raise Exception('found the error') dpRat = dpCall / dpVar annots.append(dpRat) except: @@ -219,13 +226,15 @@ def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=Fal else: raise Exception('Unhandled computed measurement: %s-%s' % (k, subk)) - elif k == 'IGNORE': - #this is just a dummy parameter we can ignore - #historically, these were tested and found to be uninformative to the models - pass else: raise Exception('Unexpected metric key: %s' % (k, )) + if np.isnan(annots).sum() > 0 or np.isinf(annots).sum() > 0: + print(variant) + for i, f in enumerate(fields): + print('', i, f, annots[i], sep='\t') + raise Exception('NaN or inf found found, this should not happen') + return annots def gatherVcfMetrics(vcfFN, rawVCF, metrics): diff --git a/scripts/RunTrainingPipeline.py b/scripts/RunTrainingPipeline.py index 2fd0edb..e21b5e4 100644 --- a/scripts/RunTrainingPipeline.py +++ b/scripts/RunTrainingPipeline.py @@ -113,6 +113,7 @@ def sendSlackMessage(channel, message): p.add_argument('-t', '--train-models', dest='train_models', action='store_true', default=False, help='train the 
models using the pipeline structure (default: False)') p.add_argument('-u', '--unlock', dest='unlock', action='store_true', default=False, help='unlock the directory in the event of snakemake failure (default: False)') p.add_argument('-x', '--execute', dest='execute', action='store_true', default=False, help='execute the commands (default: False)') + p.add_argument('-C', '--clean', dest='clean', action='store_true', default=False, help='clean all output files (default: False)') #required main arguments p.add_argument('slids', type=str, help='sample labels (.json, .txt, comma-separated entry)') @@ -163,6 +164,9 @@ def sendSlackMessage(channel, message): buildFrags.append('model_eli5') if somethingToDo: + if args.clean: + buildFrags.append('--delete-all-output') + fullCmd = ' '.join(snakemakeFrags+buildFrags) print('Executing: ') print(fullCmd) diff --git a/scripts/TrainModels.py b/scripts/TrainModels.py index 3ab979e..6b3905d 100644 --- a/scripts/TrainModels.py +++ b/scripts/TrainModels.py @@ -15,6 +15,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.model_selection import LeaveOneGroupOut from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline from sklearn.svm import SVC from sklearn.utils import shuffle from skopt import BayesSearchCV @@ -251,10 +252,35 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, passParams = rangeHyperparams else: passParams = hyperparameters - fullClf, sumRet, sumRocRet = trainClassifier(raw_clf, passParams, final_train_X, final_train_Y, final_train_groups, configuration) + + if ENABLE_FEATURE_SELECTION: + pipeline_clf = Pipeline( + [ + #this is a relatively small and relatively shallow GBC + ("reduce_dim", 'passthrough'), + #now the main classifier + ("classifier", raw_clf) + ] + ) + new_params = { + 'reduce_dim' : FEATURE_SELECTION_MODELS + } + for k in passParams.keys(): + new_key = f'classifier__{k}' + new_params[new_key] = passParams[k] + passParams = new_params + else: + pipeline_clf = raw_clf + + fullClf, sumRet, sumRocRet = trainClassifier(pipeline_clf, passParams, final_train_X, final_train_Y, final_train_groups, configuration) if label == 'GradientBoosting': - print('n_estimators_', fullClf.n_estimators_) + if ENABLE_FEATURE_SELECTION: + print('n_estimators_', fullClf[1].n_estimators_) + else: + print('n_estimators_', fullClf.n_estimators_) + if ENABLE_FEATURE_SELECTION: + print('support_', fullClf[0].support_) #this is the test on the held out test set print('[%s]\tFull testing classifier...' 
% (str(datetime.datetime.now()), )) resultsDict = testClassifier(fullClf, final_train_X, final_train_Y, final_test_X, final_test_Y) diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index b5b5f78..92bfef5 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -2,7 +2,9 @@ from imblearn.ensemble import EasyEnsembleClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble import RandomForestClassifier +from sklearn.feature_selection import RFECV from sklearn.tree import DecisionTreeClassifier from skopt.space import Real, Categorical, Integer from xgboost import XGBClassifier @@ -28,6 +30,25 @@ MANUAL_FS = False MANUAL_FS_LABELS = ['CALL-ADO', 'CALL-AFO'] +#if True, this will pre-pend an automated feature selection step to each pipeline +ENABLE_FEATURE_SELECTION = True +FEATURE_SELECTION_MODELS = [ + RFECV( + estimator=RandomForestClassifier( + random_state=0, class_weight='balanced', n_estimators=50, max_depth=3, max_features='sqrt' + ), + scoring='roc_auc', + cv=5 + ), + RFECV( + estimator=GradientBoostingClassifier( + random_state=0, loss='exponential', n_estimators=50, max_depth=3, max_features='sqrt' + ), + scoring='roc_auc', + cv=5 + ) +] + #if True, mark false positive as true positives and vice versa FLIP_TP = True @@ -53,14 +74,7 @@ ENABLE_EASYENSEMBLE = False #this is an experimental mode in sklearn, it may change rapidly from version to version -ENABLE_HISTGRADIENTBOOST = False -if ENABLE_HISTGRADIENTBOOST: - #make sure we can actually do what we're trying to do - try: - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import HistGradientBoostingClassifier - except: - ENABLE_HISTGRADIENTBOOST = False +ENABLE_HISTGRADIENTBOOST = True #here is where we put what each enable option indicates #now enumerate the models as a tuple ( @@ -83,7 +97,7 @@ { 'random_state' : [0], 'class_weight' : ['balanced'], - 'n_estimators' : [500], #prior tests: 100, 200, 300 + 'n_estimators' : [200], #prior tests: 100, 200, 300, 500 - technically 500 was best, but you really get marginal gains 'max_depth' : [6], #prior tests: 3, 4, 5 'min_samples_split' : [2, 50], 'max_features' : ['sqrt'], @@ -117,10 +131,10 @@ #''' if ENABLE_ADABOOST: + ''' + Generally slower and worse results than other models, disabled after v1. + ''' CLASSIFIERS.append( - ''' - Generally slower and worse results than other models, disabled after v1. 
- ''' #"The most important parameters are base_estimator, n_estimators, and learning_rate" - https://chrisalbon.com/machine_learning/trees_and_forests/adaboost_classifier/ ('AdaBoost', AdaBoostClassifier(random_state=0, algorithm='SAMME.R', learning_rate=1.0, base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=200), { @@ -257,10 +271,10 @@ ('HistGradientBoosting', HistGradientBoostingClassifier(random_state=0), { 'random_state' : [0], - 'learning_rate' : [0.05, 0.1], #high learning rates don't seem to work out very well + 'learning_rate' : [0.05, 0.1, 0.2], #high learning rates don't seem to work out very well 'max_iter' : [1000], - 'max_leaf_nodes' : [31], #don't want to make this too high or it overfits - 'min_samples_leaf' : [200, 2000], #want this to be semi-high to avoid overfitting to a few variants + 'max_leaf_nodes' : [15, 31, 63], #don't want to make this too high or it overfits + 'min_samples_leaf' : [20, 200, 2000], #want this to be semi-high to avoid overfitting to a few variants 'validation_fraction' : [0.1], 'n_iter_no_change' : [20] }, diff --git a/scripts/model_metrics.json b/scripts/model_metrics.json index eaaa381..bc81e53 100644 --- a/scripts/model_metrics.json +++ b/scripts/model_metrics.json @@ -35,18 +35,17 @@ }, "dnascope-1.0-202112.01-PO" : { "CALL" : [ - "AD0", "AD1", "ADO", "AF0", "AF1", "AFO", "GT", "DP", "GQ", "min(PL)" + "AD0", "AD1", "ADO", "AF0", "AF1", + "AFO", "GT", "DP", "GQ", "min(PL)" ], "INFO" : [ - "BaseQRankSum", "DP", "Entropy", "GC", "ML_PROB", "MQ", "MQ0", + "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", + "FS", "GC", "ML_PROB", "MQ", "MQ0", "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", - "SOR", "UDP" + "SOR", "STR", "UDP" ], "MUNGED" : [ - "DP_DP", "QUAL", "NEARBY", "ID" - ], - "IGNORE" : [ - "MUNGED-FILTER", "INFO-ExcessHet", "INFO-STR", "INFO-ClippingRankSum", "INFO-FS" + "DP_DP", "FILTER", "ID", "NEARBY", "QUAL" ] } }, From 72e99f7bb3916f6e652cb3f065e0cd833046b554 Mon Sep 17 00:00:00 2001 From: holtjma Date: Tue, 10 May 2022 08:17:53 -0500 Subject: [PATCH 07/16] final training config set, still need to adjust model selection criteria --- scripts/PipelineConfig.py | 2 +- scripts/TrainingConfig.py | 37 +++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/scripts/PipelineConfig.py b/scripts/PipelineConfig.py index b2e17e6..14a9ca1 100644 --- a/scripts/PipelineConfig.py +++ b/scripts/PipelineConfig.py @@ -47,7 +47,7 @@ ] #pipeline parameters, thread count currently only applies to the actual model training (CV to be precise) -THREADS_PER_PROC = 33 #was 16 in v1 +THREADS_PER_PROC = 48 #was 16 in v1 ############################################################ #Documentation config diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 92bfef5..7469401 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -24,27 +24,27 @@ #if True, only use SUBSET_SIZE variants from each file (mostly for debugging) USE_SUBSET = False -SUBSET_SIZE = 10000 +SUBSET_SIZE = 100000 #if True, manually remove features in MANUAL_FS_LABELS (these are historically unimportant features) MANUAL_FS = False MANUAL_FS_LABELS = ['CALL-ADO', 'CALL-AFO'] #if True, this will pre-pend an automated feature selection step to each pipeline +#WARNING: this can be a time-consuming process, the reduce run-time increase FS_STEP_SIZE and/or FS_MIN_FEATURE_COUNT ENABLE_FEATURE_SELECTION = True +FS_MIN_FEATURE_COUNT = 10 #always keep at least this many features 
+FS_STEP_SIZE = 1 #if an int, the number of features to remove at each step; if a float, the fraction of features to remove at each step FEATURE_SELECTION_MODELS = [ RFECV( - estimator=RandomForestClassifier( - random_state=0, class_weight='balanced', n_estimators=50, max_depth=3, max_features='sqrt' - ), - scoring='roc_auc', - cv=5 - ), - RFECV( + #this is a relatively fast estimator; low tree count, shallow trees, and small sub-sample estimator=GradientBoostingClassifier( - random_state=0, loss='exponential', n_estimators=50, max_depth=3, max_features='sqrt' + random_state=0, loss='exponential', n_estimators=50, max_depth=3, max_features='sqrt', + subsample=0.25 ), - scoring='roc_auc', + scoring='roc_auc', + step=FS_STEP_SIZE, + min_features_to_select=FS_MIN_FEATURE_COUNT, cv=5 ) ] @@ -57,7 +57,7 @@ CLASSIFIERS = [] #Generally slight underperformance, but is a decent backup -ENABLE_RANDOMFOREST = True +ENABLE_RANDOMFOREST = False #performs alright, but always seems to be below GradientBoostingClassifier and slower ENABLE_ADABOOST = False @@ -74,7 +74,7 @@ ENABLE_EASYENSEMBLE = False #this is an experimental mode in sklearn, it may change rapidly from version to version -ENABLE_HISTGRADIENTBOOST = True +ENABLE_HISTGRADIENTBOOST = False #here is where we put what each enable option indicates #now enumerate the models as a tuple ( @@ -101,7 +101,8 @@ 'max_depth' : [6], #prior tests: 3, 4, 5 'min_samples_split' : [2, 50], 'max_features' : ['sqrt'], - 'bootstrap' : [True, False] + 'bootstrap' : [True], + 'max_samples' : [0.25, 0.5] }, { 'random_state' : Categorical([0]), @@ -172,8 +173,8 @@ { 'random_state' : [0], 'n_estimators' : [1000], #prior tests: 100, 200; OBSOLETE: since adding n_iter_no_change, just set to a big number - 'max_depth' : [6], #prior tests: 3, 4 - 'learning_rate' : [0.05, 0.1, 0.5], #prior tests: 0.01, 0.2; from bayes mode, all results were in the 0.04-0.2 range with the occasional "high" rate near 0.5 + 'max_depth' : [3, 6], #prior tests: 3, 4 + 'learning_rate' : [0.05, 0.1, 0.2], #prior tests: 0.01, 0.2; from bayes mode, all results were in the 0.04-0.2 range with the occasional "high" rate near 0.5 - this was very rare though 'loss' : ['exponential'], #prior tests: 'deviance' 'max_features' : ['sqrt'], 'min_samples_split' : [2, 15, 50], #mostly extremes in Bayes most, but adding 15 for middle-ground that was sometimes chosen @@ -271,10 +272,10 @@ ('HistGradientBoosting', HistGradientBoostingClassifier(random_state=0), { 'random_state' : [0], - 'learning_rate' : [0.05, 0.1, 0.2], #high learning rates don't seem to work out very well + 'learning_rate' : [0.05, 0.1], #high learning rates don't seem to work out very well 'max_iter' : [1000], - 'max_leaf_nodes' : [15, 31, 63], #don't want to make this too high or it overfits - 'min_samples_leaf' : [20, 200, 2000], #want this to be semi-high to avoid overfitting to a few variants + 'max_leaf_nodes' : [15, 31], #don't want to make this too high or it overfits + 'min_samples_leaf' : [20, 200], #want this to be semi-high to avoid overfitting to a few variants 'validation_fraction' : [0.1], 'n_iter_no_change' : [20] }, From f3c37fc23be4d69b9cf40a3a32ebf41cbc9b9ca5 Mon Sep 17 00:00:00 2001 From: holtjma Date: Tue, 17 May 2022 08:45:11 -0500 Subject: [PATCH 08/16] changes required to support auto-targeting of global precision --- scripts/EvaluateVariants.py | 53 ++++++++++++++++--- scripts/ExtractELI5Results.py | 13 +++-- scripts/GenerateSupplement.py | 12 +++-- scripts/PrintModelReport.py | 35 ++++++++++--- scripts/TrainModels.py 
| 99 +++++++++++++++++++++++++++++------ scripts/TrainingConfig.py | 18 ++++++- scripts/TrainingPipeline.smk | 7 +-- 7 files changed, 195 insertions(+), 42 deletions(-) diff --git a/scripts/EvaluateVariants.py b/scripts/EvaluateVariants.py index 11d3c4b..e3c576e 100644 --- a/scripts/EvaluateVariants.py +++ b/scripts/EvaluateVariants.py @@ -9,16 +9,42 @@ from ExtractFeatures import ALL_METRICS, getVariantFeatures, GT_TRANSLATE, VAR_TRANSLATE -def getClinicalModel(stats, acceptedRecall, targetRecall): +def getClinicalModel(stats, acceptedRecall, targetRecall, global_precision): ''' @param stats - the stats dictionary from training @param acceptedRecall - the minimum acceptable recall - @param targetRecall - the target we are aiming for - @return - a dictionary with the eval recall, model name, and harmonic mean score + @param targetRecall - the target we are aiming for, must be greater than or equal to accepted + @param global_precision - float value indicating target global precision; if set, + then it will dynamically figure out the target recalls based on the data + @return - a dictionary with the eval recall, model name, and harmonic mean score; more info is added if global_precision is used ''' bestModelTargetRecall = None bestModelName = None bestHM = 0.0 + + if global_precision != None: + #we need to override the accepted and target recall values + #print(stats.keys()) + lookup_key = list(stats.keys())[0] + base_total_tp = stats[lookup_key]['RAW_TOTAL_TP'] + base_total_fp = stats[lookup_key]['RAW_TOTAL_FP'] + base_precision = base_total_tp / (base_total_fp + base_total_tp) + + #figure out how far from the goal we are + delta_precision = global_precision - base_precision + assert(delta_precision > 0) + remainder_precision = 1.0 - base_precision + + #now derive our target recall from that difference + derived_recall = delta_precision / remainder_precision + + #for now, just set both accepted and target to that same value + acceptedRecall = derived_recall + targetRecall = derived_recall + + #check these after we potentially override any passed in values + assert(targetRecall >= acceptedRecall) + for mn in stats.keys(): for tr in stats[mn]['ALL_SUMMARY']: #CM = confusion matrix @@ -40,9 +66,17 @@ def getClinicalModel(stats, acceptedRecall, targetRecall): else: #in clinical, best is harmonic mean of our adjusted recall and our TNR modelTNR = modelCM[0, 0] / (modelCM[0, 0] + modelCM[0, 1]) - adjRecall = (modelRecall - acceptedRecall) / (float(targetRecall) - acceptedRecall) - if adjRecall > 1.0: - adjRecall = 1.0 + if targetRecall == acceptedRecall: + #target and accepted are identical, so this is a binary kill switch that's either 1.0 or 0.0 + if modelRecall >= acceptedRecall: + adjRecall = 1.0 + else: + adjRecall = 0.0 + else: + #otherwise, there's a scale + adjRecall = (modelRecall - acceptedRecall) / (targetRecall - acceptedRecall) + if adjRecall > 1.0: + adjRecall = 1.0 modelHM = 2 * adjRecall * modelTNR / (adjRecall+modelTNR) if modelHM > bestHM: @@ -50,11 +84,16 @@ def getClinicalModel(stats, acceptedRecall, targetRecall): bestModelName = mn bestHM = modelHM - return { + ret = { 'eval_recall' : bestModelTargetRecall, 'model_name' : bestModelName, 'hm_score' : bestHM } + if global_precision != None: + ret['base_precision'] = base_precision + ret['derived_recall'] = derived_recall + + return ret def evaluateVariants(args): ''' diff --git a/scripts/ExtractELI5Results.py b/scripts/ExtractELI5Results.py index 248d482..983d10e 100644 --- a/scripts/ExtractELI5Results.py +++ 
b/scripts/ExtractELI5Results.py @@ -8,13 +8,14 @@ from EvaluateVariants import getClinicalModel from ExtractFeatures import GT_TRANSLATE, VAR_TRANSLATE -def gatherEli5Stats(modelDir, minRecall, targetRecall): +def gatherEli5Stats(modelDir, minRecall, targetRecall, global_precision): ''' This will gather our clinical model statistics in an ingestible format @param modelDir - the model directory @param rocDir - a directory for output ROC curve images @param minRecall - the minimum recall values allowed @param targetRecall - the target recall we want the models to achieve + @param global_precision - if set, this will override other recall values ''' #read in the stats jsonFN = '%s/stats.json' % (modelDir, ) @@ -29,14 +30,15 @@ def gatherEli5Stats(modelDir, minRecall, targetRecall): fp.close() #get the stats now and send 'em on back - return gatherClinicalModelStats(stats, minRecall, targetRecall, models) + return gatherClinicalModelStats(stats, minRecall, targetRecall, global_precision, models) -def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): +def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, global_precision, allModels): ''' This will run eli5 if it can and store the results in a dictionary @param allStats - the full stats dict (all models) @param acceptedRecall - the minimum recall we need @param targetRecall - the target recall we want + @param global_precision - if set, this will override the recall values @param allModels - the actual loaded models @return - a dictionary where keys are the variant/genotype and value has the model name and eli5 outputs ''' @@ -46,7 +48,7 @@ def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): reformKey = VAR_TRANSLATE[int(k.split('_')[0])]+'_'+GT_TRANSLATE[int(k.split('_')[1])] #get the clinical model - clinicalModelDict = getClinicalModel(stats, acceptedRecall, targetRecall) + clinicalModelDict = getClinicalModel(stats, acceptedRecall, targetRecall, global_precision) bestModelName = clinicalModelDict['model_name'] if bestModelName == None: @@ -77,6 +79,7 @@ def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): #optional arguments with default p.add_argument('-m', '--min-recall', dest='min_recall', default=0.99, type=float, help='the minimum recall for clinical applications (default: 0.990)') p.add_argument('-t', '--target-recall', dest='target_recall', default=0.995, type=float, help='the target recall for clinical applications (default: 0.995)') + p.add_argument('-g', '--global-precision', dest='global_precision', default=None, type=float, help='the global precision target; if set, override min/target recalls (default: None)') #required main arguments p.add_argument('model_directory', type=str, help='directory with models and model stats') @@ -86,7 +89,7 @@ def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): args = p.parse_args() #gather the stats - finalStats = gatherEli5Stats(args.model_directory, args.min_recall, args.target_recall) + finalStats = gatherEli5Stats(args.model_directory, args.min_recall, args.target_recall, args.global_precision) fp = open(args.out_json, 'w+') json.dump(finalStats, fp, indent=4, sort_keys=True) diff --git a/scripts/GenerateSupplement.py b/scripts/GenerateSupplement.py index c5611fe..fc967f9 100644 --- a/scripts/GenerateSupplement.py +++ b/scripts/GenerateSupplement.py @@ -199,10 +199,14 @@ def getTrainingResults(pipeline, reference, aligner, caller, strict=False): if 
l.startswith('[clinical_model'): pieces = l.rstrip()[1:-1].split(' ') assert(pieces[0] == 'clinical_model') - assert(pieces[1].startswith('min=')) - assert(pieces[2].startswith('tar=')) - clinicalMinimum = float(pieces[1][4:]) - clinicalTarget = float(pieces[2][4:]) + if pieces[1].startswith('min=') and pieces[2].startswith('tar='): + clinicalMinimum = float(pieces[1][4:]) + clinicalTarget = float(pieces[2][4:]) + elif pieces[1].startswith('target_global_precision='): + clinicalMinimum = 'dynamic' + clinicalTarget = 'dynamic' + else: + raise Exception('unknown clinical_model format') break elif l.startswith('['): vt, tpcount, fpcount = l.rstrip()[1:-1].split(' ') diff --git a/scripts/PrintModelReport.py b/scripts/PrintModelReport.py index e8d4b2a..ada64a1 100644 --- a/scripts/PrintModelReport.py +++ b/scripts/PrintModelReport.py @@ -11,13 +11,15 @@ from EvaluateVariants import getClinicalModel from ExtractFeatures import GT_TRANSLATE, VAR_TRANSLATE -def printAllStats(modelDir, rocDir, minRecall, targetRecall): +def printAllStats(modelDir, rocDir, minRecall, targetRecall, global_precision): ''' This will print out our model statistics in an ingestible format @param modelDir - the model directory @param rocDir - a directory for output ROC curve images @param minRecall - the minimum recall values allowed @param targetRecall - the target recall we want the models to achieve + @param global_precision - float value indicating target global precision; if set, + then it will dynamically figure out the target recalls based on the data ''' #read in the stats jsonFN = '%s/stats.json' % (modelDir, ) @@ -48,7 +50,7 @@ def printAllStats(modelDir, rocDir, minRecall, targetRecall): if rocDir != None: createRocImage(reformKey, rocs[k], rocDir) - imageDict = printClinicalModels(stats, minRecall, targetRecall, models) + imageDict = printClinicalModels(stats, minRecall, targetRecall, models, global_precision) if rocDir != None: createTrainingImage(imageDict, rocDir) @@ -142,24 +144,33 @@ def createTrainingImage(imageDict, rocDir): plt.savefig(outFN) plt.close() -def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels): +def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels, global_precision): header = [ 'variant_type', 'best_model', 'model_eval', 'CV_recall', 'final_recall', 'CV_FPR', 'final_FPR' ] - print('[clinical_model min=%0.4f tar=%0.4f]' % (acceptedRecall, targetRecall)) + if global_precision == None: + print('[clinical_model min=%0.4f tar=%0.4f]' % (acceptedRecall, targetRecall)) + else: + print('[clinical_model target_global_precision=%0.4f]' % (global_precision, )) + header += ['target_recall', 'global_prec'] print(*header, sep='\t') imageDict = {} for k in sorted(allStats.keys()): stats = allStats[k] reformKey = VAR_TRANSLATE[int(k.split('_')[0])]+'_'+GT_TRANSLATE[int(k.split('_')[1])] - clinDict = getClinicalModel(stats, acceptedRecall, targetRecall) + clinDict = getClinicalModel(stats, acceptedRecall, targetRecall, global_precision) bestModelName = clinDict['model_name'] if bestModelName == None: #this is the unfortunate event that NO model passes - print(reformKey, 'None', 'None', '--', '--', '--', '--', sep='\t') + rowVals = [ + reformKey, 'None', 'None', '--', '--', '--', '--' + ] + if global_precision: + rowVals += ['--', '--'] + print(*rowVals, sep='\t') else: bestModelTargetRecall = clinDict['eval_recall'] @@ -182,6 +193,15 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels): 
#str(stats[bestModelName]['LEAVEONEOUT_SUMMARY'][bestModelTargetRecall]['TEST_TPR']), #str(stats[bestModelName]['LEAVEONEOUT_SUMMARY'][bestModelTargetRecall]['TEST_FPR']) ] + if global_precision: + base_precision = clinDict['base_precision'] + derived_recall = clinDict['derived_recall'] + calculated_precision = base_precision + (1 - base_precision) * bestTPR + rowVals += [ + '%0.4f' % (derived_recall, ), + '%0.6f' % (calculated_precision, ) + ] + print(*rowVals, sep='\t') imageDict[reformKey] = ( @@ -203,6 +223,7 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels): p.add_argument('-r', '--roc-dir', dest='roc_dir', default=None, help='a directory to store ROC images (default: None)') p.add_argument('-m', '--min-recall', dest='min_recall', default=0.99, type=float, help='the minimum recall for clinical applications (default: 0.990)') p.add_argument('-t', '--target-recall', dest='target_recall', default=0.995, type=float, help='the target recall for clinical applications (default: 0.995)') + p.add_argument('-g', '--global-precision', dest='global_precision', default=None, type=float, help='the global precision target; if set, override min/target recalls (default: None)') #required main arguments p.add_argument('model_directory', type=str, help='directory with models and model stats') @@ -210,4 +231,4 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels): #parse the arguments args = p.parse_args() - printAllStats(args.model_directory, args.roc_dir, args.min_recall, args.target_recall) \ No newline at end of file + printAllStats(args.model_directory, args.roc_dir, args.min_recall, args.target_recall, args.global_precision) \ No newline at end of file diff --git a/scripts/TrainModels.py b/scripts/TrainModels.py index 6b3905d..83900e2 100644 --- a/scripts/TrainModels.py +++ b/scripts/TrainModels.py @@ -57,6 +57,7 @@ def trainModels(featureDir, slids, outPrefix, splitByType, numProcs): fieldsList = json.load(fp) fp.close() + print(f'[{datetime.datetime.now()}] Full sample list: {samples}') #get data from each sample for sample in samples: #TP first @@ -78,7 +79,7 @@ def trainModels(featureDir, slids, outPrefix, splitByType, numProcs): assert(fpFields == fieldsList) fpVar = np.load(fpFN, 'r') fpList.append(fpVar) - + results = {} trainedModelDict = {} rocDict = {} @@ -121,7 +122,7 @@ def trainModels(featureDir, slids, outPrefix, splitByType, numProcs): fp = open(rocFN, 'w+') json.dump(rocDict, fp, indent=4, sort_keys=True) fp.close() - + print('[%s] All models finished training!' 
% (str(datetime.datetime.now()), )) def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, callType, numProcs): @@ -174,6 +175,10 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, train_groups = [] test_list_X = [] test_list_Y = [] + + #we will need the totals if we auto-calculate the target recalls + raw_total_tp = 0 + raw_total_fp = 0 for i in range(0, len(raw_tpList)): tpVals = raw_tpList[i] @@ -193,6 +198,10 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, tpVals = np.delete(tpVals, removedIndices, 1) fpVals = np.delete(fpVals, removedIndices, 1) + #store the total number we found before any sort of subsetting; this is used to derive overall accuracies if auto-enabled + raw_total_tp += tpVals.shape[0] + raw_total_fp += fpVals.shape[0] + #we aren't doing a full test, so cut the input sizes down if USE_SUBSET: tpVals = tpVals[:SUBSET_SIZE] @@ -242,6 +251,34 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, #TODO: future possible optimization - RandomForest and EasyEnsemble have a "n_jobs" parameters for parallel # fit/predict; is there a way we can utilize that in general? + + print('Raw Total TP, FP:', raw_total_tp, raw_total_fp) + total_obs = raw_total_fp + raw_total_tp + raw_precision = raw_total_tp / total_obs + print('Raw precision:', raw_precision) + + if ENABLE_AUTO_TARGET: + #we need to derive what the recall targets should be + missed_precision = GLOBAL_AUTO_TARGET_PRECISION - raw_precision + + if missed_precision < 0: + #TODO: what should we do here generally? this isn't currently an issue + raise Exception('raw precision is already better than global target, this is currently unhandled') + + total_gap = 1.0 - raw_precision + lower_target_recall = missed_precision / total_gap + delta_gap = (1.0 - lower_target_recall) / AUTO_TARGET_BREAKPOINT_COUNT + + auto_targets = [lower_target_recall + i*delta_gap for i in range(0, AUTO_TARGET_BREAKPOINT_COUNT)] + auto_targets += [0.9999, 1.0000] + + #now reverse sort them + recall_targets = np.array(sorted(auto_targets)[::-1]) + + else: + #manual targetting only + recall_targets = MANUAL_TARGETS + print('Recall targets: ', recall_targets) #go through each model, one at a time for (label, raw_clf, hyperparameters, rangeHyperparams) in CLASSIFIERS: @@ -272,7 +309,10 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, else: pipeline_clf = raw_clf - fullClf, sumRet, sumRocRet = trainClassifier(pipeline_clf, passParams, final_train_X, final_train_Y, final_train_groups, configuration) + fullClf, sumRet, sumRocRet = trainClassifier( + pipeline_clf, passParams, final_train_X, final_train_Y, final_train_groups, + configuration, recall_targets, raw_precision + ) if label == 'GradientBoosting': if ENABLE_FEATURE_SELECTION: @@ -286,11 +326,13 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, resultsDict = testClassifier(fullClf, final_train_X, final_train_Y, final_test_X, final_test_Y) #get results and store everything in the dictionary locations below - allRet, allRocRet = summarizeResults([resultsDict], [final_test_Y]) + allRet, allRocRet = summarizeResults([resultsDict], [final_test_Y], recall_targets, raw_precision) ret[label] = { 'LEAVEONEOUT_SUMMARY' : sumRet, - 'ALL_SUMMARY' : allRet + 'ALL_SUMMARY' : allRet, + 'RAW_TOTAL_TP' : raw_total_tp, + 'RAW_TOTAL_FP' : raw_total_fp } modelRet[label] = { 'FEATURES' : featureLabels, @@ -306,7 +348,7 @@ def 
trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, return ret, modelRet, rocRet -def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, configuration): +def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, configuration, recall_targets, base_precision): ''' This will run a classifier for us @param raw_clf - a classifier instance @@ -315,6 +357,8 @@ def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, co @param train_Y - the classifier values expected @param train_groups - the groups of input values @param configuration - a dictionary containing information on how to do the training + @param recall_targets - the recall targets to evaluate results + @param raw_precision - the base precision prior to any training @return - a trained classifier ''' print('[%s]\tFull training classifier...' % (str(datetime.datetime.now()), )) @@ -358,7 +402,7 @@ def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, co test_Ys.append(train_Y[test_ind]) print('[%s]\tGenerating CV summary...'% (str(datetime.datetime.now()), )) - sumRet, sumRocRet = summarizeResults(results, test_Ys) + sumRet, sumRocRet = summarizeResults(results, test_Ys, recall_targets, base_precision) else: raise Exception('Unexpected mode') @@ -379,6 +423,11 @@ def testClassifier(clf, train_X, train_Y, test_X, test_Y): @param test_X - the training features @param test_Y - the classifier values expected @return - a dictionary of many results + TEST_PRED - the probability of being in class 1 (i.e. predicted false call) on the test data + TEST_ROC - tuple (false_positive rate, true_positive_rate, thresholds) from the roc_curve function on the test data + TEST_ROC_AUC - ROC-AUC from auc(roc_curve(...)) + TRAIN_PRED - the probability of being class 1 on the training data + TRAIN_ROC - tuple (false_positive rate, true_positive_rate, thresholds) from the roc_curve function on the training data ''' ret = {} @@ -399,15 +448,18 @@ def testClassifier(clf, train_X, train_Y, test_X, test_Y): return ret -def summarizeResults(results, test_Ys): +def summarizeResults(results, test_Ys, recallValues, base_precision): ''' This will summarize results for us across multiple leave-one-out runs - @param results - a list of results dictionaries for a classifier - @return - TODO + @param results - a list of results dictionaries for a classifier; this is basically a list of results from testClassifier(...) + @param test_Ys - a list of lists of correct Y-values (e.g. categories) + @param recallValues - the recall values to evaluate the models at + @param base_precision - the base precision of the caller without any ML + @return - tuple (stats, rocs) + stats - a dict where key is a recall value and value is a dictionary of stats for the model with that recall including test/train TPR, FPR, etc. + rocs - a list of test ROCs, one for each in results ''' print('[%s]\tRunning threshold tests...' 
% (str(datetime.datetime.now()), )) - recallValues = np.array([1.0, 0.9999, 0.999, 0.998, 0.997, 0.996, 0.995, 0.99]) - #TODO: if we want to do confidence intervals on errors, look into it here: (For now, I think CV is fine) #https://machinelearningmastery.com/report-classifier-performance-confidence-intervals/ #note for future matt, if you get 0 errors out of N classifications, we would need to assume the NEXT @@ -418,8 +470,13 @@ def summarizeResults(results, test_Ys): # 99% confidence in that interval ret = {} - print('', 'tarTPR', 'train_FPR', 'train_TPR', 'test_FPR', 'test_TPR', 'adjConf', sep='\t') + if ENABLE_AUTO_TARGET: + print('', 'tarTPR', 'train_FPR', 'train_TPR', 'test_FPR', 'test_TPR', 'global_prec', 'adjConf', sep='\t') + else: + print('', 'tarTPR', 'train_FPR', 'train_TPR', 'test_FPR', 'test_TPR', 'adjConf', sep='\t') for minRecall in recallValues: + #for each specified recall level, we want to calculate how well the training/testing does for a bunch of stats: + # training FPR, training TPR, training thresholds, testing FPR, testing TPR, and testing confusion matrix (CM) tarTPR = minRecall trainFprList = [] trainTprList = [] @@ -428,12 +485,16 @@ def summarizeResults(results, test_Ys): testCMList = [] testFprList = [] testTprList = [] - + + #for CV, we will have multiple training/testing sets, so we need to gather them all for mean/std later + #this function is also used for final training, which will only have one set in `results` (i.e. N=1, st. dev.=0) for i, resultsDict in enumerate(results): + #get the training ROC values train_false_positive_rate, train_true_positive_rate, train_thresholds = resultsDict['TRAIN_ROC'] test_Y = test_Ys[i] #first, find the point in the training values that matches our recall requirement + #this will allow us to pick the threshold value matching that recall req. ind = bisect.bisect_left(train_true_positive_rate, minRecall) while train_true_positive_rate[ind] < minRecall: ind += 1 @@ -461,11 +522,19 @@ def summarizeResults(results, test_Ys): ('%0.4f+-%0.4f', (np.mean(trainFprList), np.std(trainFprList))), ('%0.4f+-%0.4f', (np.mean(trainTprList), np.std(trainTprList))), ('%0.4f+-%0.4f', (np.mean(testFprList), np.std(testFprList))), - ('%0.4f+-%0.4f', (np.mean(testTprList), np.std(testTprList))), + ('%0.4f+-%0.4f', (np.mean(testTprList), np.std(testTprList))) + ] + if ENABLE_AUTO_TARGET: + recall = np.mean(testTprList) + printVals += [ + ('%0.6f', base_precision + (1-base_precision) * recall) + ] + printVals += [ ('%s', str(sum(testCMList)).replace('\n', '')), ] print('\t'.join([t[0] % t[1] for t in printVals])) + #save the lists in our output tied to this recall value ret[minRecall] = { 'TRAIN_FPR' : trainFprList, 'TRAIN_TPR' : trainTprList, diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 7469401..5b75a0a 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -1,5 +1,6 @@ from imblearn.ensemble import EasyEnsembleClassifier +import numpy as np from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import HistGradientBoostingClassifier @@ -26,6 +27,21 @@ USE_SUBSET = False SUBSET_SIZE = 100000 +#if True, we will generate an array of targets from the data +ENABLE_AUTO_TARGET = True + +#if auto-targets is enabled, this is the total target precision that is used for deriving the auto-target +# e.g. if based precision is 0.999, then the model needs to recover 0.0009 of the 0.001 that is not captured +# e.g. 
90% derived target recall for the models +GLOBAL_AUTO_TARGET_PRECISION = 0.9999 +AUTO_TARGET_BREAKPOINT_COUNT = 10 #the number of point inbetween the target and 1.00 to calculate, must be > 0 +AUTO_EXTRA_TARGETS = [0.9999, 1.0000] #these are always added + +#if the auto-targets is disabled, these will be used as targets instead +MANUAL_TARGETS = np.array([ + 1.0, 0.9999, 0.999, 0.998, 0.997, 0.996, 0.995, 0.99 +]) + #if True, manually remove features in MANUAL_FS_LABELS (these are historically unimportant features) MANUAL_FS = False MANUAL_FS_LABELS = ['CALL-ADO', 'CALL-AFO'] @@ -34,7 +50,7 @@ #WARNING: this can be a time-consuming process, the reduce run-time increase FS_STEP_SIZE and/or FS_MIN_FEATURE_COUNT ENABLE_FEATURE_SELECTION = True FS_MIN_FEATURE_COUNT = 10 #always keep at least this many features -FS_STEP_SIZE = 1 #if an int, the number of features to remove at each step; if a float, the fraction of features to remove at each step +FS_STEP_SIZE = 0.1 #if an int, the number of features to remove at each step; if a float, the fraction of features to remove at each step FEATURE_SELECTION_MODELS = [ RFECV( #this is a relatively fast estimator; low tree count, shallow trees, and small sub-sample diff --git a/scripts/TrainingPipeline.smk b/scripts/TrainingPipeline.smk index af1c621..d439be7 100644 --- a/scripts/TrainingPipeline.smk +++ b/scripts/TrainingPipeline.smk @@ -219,7 +219,8 @@ rule SummarizeModels: images=directory("{pipeline_dir}/model_summaries/{reference}/{aligner}/{caller}/roc_curves") params: script=MODEL_REPORT_SCRIPT, - prefix="{pipeline_dir}/trained_models/{reference}/{aligner}/{caller}" + prefix="{pipeline_dir}/trained_models/{reference}/{aligner}/{caller}", + global_precision="0.9999" #conda: # CONDA_ENV log: "{pipeline_dir}/logs/model_summaries/{reference}/{aligner}/{caller}.log" @@ -228,6 +229,7 @@ rule SummarizeModels: ''' python3 -u {params.script} \ -r {output.images} \ + -g {params.global_precision} \ {params.prefix} > \ {output.summary} ''' @@ -272,8 +274,7 @@ rule ModelEli5: shell: ''' python3 -u {params.script} \ - -m 0.99 \ - -t 0.995 \ + -g 0.9999 \ {params.prefix} \ {output.eli5} ''' From 8d29be77090d1b3d081f1edda7dcbbc62f0584bd Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 25 May 2022 08:42:28 -0500 Subject: [PATCH 09/16] fixing global precision to previous value --- scripts/PrintModelReport.py | 8 ++++---- scripts/TrainingConfig.py | 2 +- scripts/TrainingPipeline.smk | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/PrintModelReport.py b/scripts/PrintModelReport.py index ada64a1..917ce51 100644 --- a/scripts/PrintModelReport.py +++ b/scripts/PrintModelReport.py @@ -73,10 +73,10 @@ def printModelStats(modelType, stats): 'eval_recall' ] for e in evalList: - header.append(e) - header.append('') - header.append('') - header.append('') + header.append(f'{e}-recall') + header.append('CV_recall') + header.append('FPR') + header.append('CV_FPR') print(*header, sep='\t') for targetRecall in availableRecalls: rowVals = ['%0.4f' % (float(targetRecall), )] diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 5b75a0a..b19df4b 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -33,7 +33,7 @@ #if auto-targets is enabled, this is the total target precision that is used for deriving the auto-target # e.g. if based precision is 0.999, then the model needs to recover 0.0009 of the 0.001 that is not captured # e.g. 
90% derived target recall for the models -GLOBAL_AUTO_TARGET_PRECISION = 0.9999 +GLOBAL_AUTO_TARGET_PRECISION = 0.99996 AUTO_TARGET_BREAKPOINT_COUNT = 10 #the number of point inbetween the target and 1.00 to calculate, must be > 0 AUTO_EXTRA_TARGETS = [0.9999, 1.0000] #these are always added diff --git a/scripts/TrainingPipeline.smk b/scripts/TrainingPipeline.smk index d439be7..5bfe071 100644 --- a/scripts/TrainingPipeline.smk +++ b/scripts/TrainingPipeline.smk @@ -3,6 +3,7 @@ import os from RunTrainingPipeline import parseSlids from PipelineConfig import * +from TrainingConfig import GLOBAL_AUTO_TARGET_PRECISION #derived from repo PIPELINE_DIRECTORY = '%s/pipeline_pcrfree' % REPO_DIRECTORY @@ -220,7 +221,7 @@ rule SummarizeModels: params: script=MODEL_REPORT_SCRIPT, prefix="{pipeline_dir}/trained_models/{reference}/{aligner}/{caller}", - global_precision="0.9999" + global_precision=GLOBAL_AUTO_TARGET_PRECISION #conda: # CONDA_ENV log: "{pipeline_dir}/logs/model_summaries/{reference}/{aligner}/{caller}.log" @@ -266,6 +267,7 @@ rule ModelEli5: eli5="{pipeline_dir}/eli5_summaries/{reference}/{aligner}/{caller}/model_eli5.json" params: script=MODEL_ELI5_SCRIPT, + global_precision=GLOBAL_AUTO_TARGET_PRECISION, prefix="{pipeline_dir}/trained_models/{reference}/{aligner}/{caller}" #conda: # CONDA_ENV @@ -274,7 +276,7 @@ rule ModelEli5: shell: ''' python3 -u {params.script} \ - -g 0.9999 \ + -g {params.global_precision} \ {params.prefix} \ {output.eli5} ''' From 8f70cfc16d566aeee87a3bb3a393edbe6af4f1c2 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 1 Jun 2022 09:54:46 -0500 Subject: [PATCH 10/16] adding RF to recover indel hets --- scripts/TrainingConfig.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index b19df4b..9ce0fbe 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -73,7 +73,7 @@ CLASSIFIERS = [] #Generally slight underperformance, but is a decent backup -ENABLE_RANDOMFOREST = False +ENABLE_RANDOMFOREST = True #performs alright, but always seems to be below GradientBoostingClassifier and slower ENABLE_ADABOOST = False @@ -114,7 +114,7 @@ 'random_state' : [0], 'class_weight' : ['balanced'], 'n_estimators' : [200], #prior tests: 100, 200, 300, 500 - technically 500 was best, but you really get marginal gains - 'max_depth' : [6], #prior tests: 3, 4, 5 + 'max_depth' : [3, 6], #prior tests: 3, 4, 5 'min_samples_split' : [2, 50], 'max_features' : ['sqrt'], 'bootstrap' : [True], From 44dd3b60e498cc160ebbbc959d3cfa47d2094525 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 1 Jun 2022 13:57:17 -0500 Subject: [PATCH 11/16] updating to work with cyvcf2 and codi outputs --- scripts/EvaluateVariants.py | 92 ++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/scripts/EvaluateVariants.py b/scripts/EvaluateVariants.py index e3c576e..1beae67 100644 --- a/scripts/EvaluateVariants.py +++ b/scripts/EvaluateVariants.py @@ -1,5 +1,6 @@ import argparse as ap +import cyvcf2 import csv import json import numpy as np @@ -8,6 +9,7 @@ #import vcf from ExtractFeatures import ALL_METRICS, getVariantFeatures, GT_TRANSLATE, VAR_TRANSLATE +from TrainingConfig import ENABLE_AUTO_TARGET, GLOBAL_AUTO_TARGET_PRECISION def getClinicalModel(stats, acceptedRecall, targetRecall, global_precision): ''' @@ -166,7 +168,7 @@ def runSubType(variantType, args, stats, models, statKey): #make sure our recall is in the list availableRecalls = 
stats[list(stats.keys())[0]]['ALL_SUMMARY'].keys() - if targetRecall not in availableRecalls: + if targetRecall not in availableRecalls and not ENABLE_AUTO_TARGET: raise Exception('Invalid target recall, available options are: %s' % (availableRecalls, )) #figure out which models we will actually be using @@ -191,17 +193,25 @@ def runSubType(variantType, args, stats, models, statKey): evalList = [bestModelName] elif modelName == 'clinical': - #TODO: make this a script parameter instead of hard-coding - #this contains minimum acceptable recals for a given target - targetThresholds = { - '0.995' : 0.99 - } - if targetRecall not in targetThresholds: - raise Exception('"clinical" mode has no defined threshold for target recall "%s"' % targetRecall) - acceptedRecall = targetThresholds[targetRecall] + if ENABLE_AUTO_TARGET: + global_precision = GLOBAL_AUTO_TARGET_PRECISION + + #dummy values + acceptedRecall = 0.0 + targetRecall = 0.0 + else: + global_precision = None + #TODO: make this a script parameter instead of hard-coding + #this contains minimum acceptable recals for a given target + targetThresholds = { + '0.995' : 0.99 + } + if targetRecall not in targetThresholds: + raise Exception('"clinical" mode has no defined threshold for target recall "%s"' % targetRecall) + acceptedRecall = targetThresholds[targetRecall] #get the clinical model - clinicalModelDict = getClinicalModel(stats, acceptedRecall, targetRecall) + clinicalModelDict = getClinicalModel(stats, acceptedRecall, targetRecall, global_precision) bestModelName = clinicalModelDict['model_name'] bestModelTargetRecall = clinicalModelDict['eval_recall'] @@ -252,10 +262,11 @@ def runSubType(variantType, args, stats, models, statKey): allVariants += loadCodicemVariants(args.codicem) #now load the VCF file - vcfReader = vcf.Reader(filename=args.sample_vcf, compressed=True) - rawReader = vcf.Reader(filename=args.sample_vcf, compressed=True) - assert(len(vcfReader.samples) == 1) - chromList = vcfReader.contigs.keys() + vcf_reader = cyvcf2.VCF(args.sample_vcf) + raw_reader = cyvcf2.VCF(args.sample_vcf) + assert(len(vcf_reader.samples) == 1) + sample_index = 0 + chrom_list = vcf_reader.seqnames #go through each variant and extract the features into a shared set varIndex = [] @@ -263,12 +274,12 @@ def runSubType(variantType, args, stats, models, statKey): rawGT = [] varFeatures = [] for i, (chrom, start, end, ref, alt) in enumerate(allVariants): - if (chrom not in chromList and - 'chr'+chrom in chromList): + if (chrom not in chrom_list and + 'chr'+chrom in chrom_list): chrom = 'chr'+chrom - if chrom in chromList: - variantList = [variant for variant in vcfReader.fetch(chrom, start, end)] + if chrom in chrom_list: + variantList = [variant for variant in vcf_reader(f'{chrom}:{start}-{end}')] else: print('WARNING: Chromosome "%s" not found' % (chrom, )) variantList = [] @@ -279,7 +290,7 @@ def runSubType(variantType, args, stats, models, statKey): #now go through each variant and pull out the features for it for variant in variantList: - featureVals = getVariantFeatures(variant, vcfReader.samples[0], fields, rawReader, allowHomRef=True) + featureVals = getVariantFeatures(variant, sample_index, fields, raw_reader, allowHomRef=True) varFeatures.append(featureVals) rawGT.append(featureVals[gtIndex]) @@ -322,7 +333,8 @@ def runSubType(variantType, args, stats, models, statKey): #if it's not found, we DON'T put it in the dictionary list else: for ind in foundVarIndices: - vals = [chrom, start, end, ref, alt, rawVariants[ind], rawGT[ind]] + 
raw_variant_str = generateCyvcf2RecordStr(rawVariants[ind]) + vals = [chrom, start, end, ref, alt, raw_variant_str, rawGT[ind]] modelResultDict = {} resultsFound = False for mn in evalList: @@ -346,7 +358,7 @@ def runSubType(variantType, args, stats, models, statKey): 'end' : end, 'ref' : ref, 'alt' : alt, - 'call_variant' : rawVariants[ind], + 'call_variant' : raw_variant_str, 'call_gt' : rawGT[ind], 'predictions' : modelResultDict } @@ -386,10 +398,11 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): allVariants += loadCodicemVariants(args.codicem) #now load the VCF file - vcfReader = vcf.Reader(filename=args.sample_vcf, compressed=True) - rawReader = vcf.Reader(filename=args.sample_vcf, compressed=True) - assert(len(vcfReader.samples) == 1) - chromList = vcfReader.contigs.keys() + vcf_reader = cyvcf2.VCF(args.sample_vcf) + raw_reader = cyvcf2.VCF(args.sample_vcf) + assert(len(vcf_reader.samples) == 1) + sample_index = 0 + chrom_list = vcf_reader.seqnames #go through each variant and extract the features into a shared set varIndex = [] @@ -397,12 +410,12 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): rawGT = [] varFeatures = [] for i, (chrom, start, end, ref, alt) in enumerate(allVariants): - if (chrom not in chromList and - 'chr'+chrom in chromList): + if (chrom not in chrom_list and + 'chr'+chrom in chrom_list): chrom = 'chr'+chrom - if chrom in chromList: - variantList = [variant for variant in vcfReader.fetch(chrom, start, end)] + if chrom in chrom_list: + variantList = [variant for variant in vcf_reader(f'{chrom}:{start}-{end}')] else: print('WARNING: Chromosome "%s" not found' % (chrom, )) variantList = [] @@ -413,7 +426,7 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): #now go through each variant and pull out the features for it for variant in variantList: - featureVals = getVariantFeatures(variant, vcfReader.samples[0], fields, rawReader, allowHomRef=True) + featureVals = getVariantFeatures(variant, sample_index, fields, raw_reader, allowHomRef=True) varFeatures.append(featureVals) rawGT.append(featureVals[gtIndex]) @@ -436,7 +449,8 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): valList.append(vals) else: for ind in foundVarIndices: - vals = [chrom, start, end, ref, alt, rawVariants[ind], rawGT[ind]] + raw_variant_str = generateCyvcf2RecordStr(rawVariants[ind]) + vals = [chrom, start, end, ref, alt, raw_variant_str, rawGT[ind]] modelResultDict = {} resultsFound = False if filtersEnabled and (acceptedVT != varFeatures[ind][0] or @@ -459,7 +473,7 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): 'end' : end, 'ref' : ref, 'alt' : alt, - 'call_variant' : rawVariants[ind], + 'call_variant' : raw_variant_str, 'call_gt' : rawGT[ind], 'predictions' : modelResultDict } @@ -519,6 +533,18 @@ def parseCLIVariants(csVar): ret.append(var) return ret +def generateCyvcf2RecordStr(variant): + ''' + Simple cyvcf2 variant reformatter, makes it match something similar to vcf + @param variant - a variant from cyvcf2 + @return - a string formatted as a "record" + ''' + chrom = variant.CHROM + pos = variant.POS + ref = variant.REF + alts = variant.ALT + return f'cyvcf2.record(CHROM={chrom}, POS={pos}, REF={ref}, ALT={alts})' + def loadCodicemVariants(csvFN): ''' Loads a Codicem sanger CSV and finds variants that need confirmation @@ -559,4 +585,4 @@ def loadCodicemVariants(csvFN): #parse the arguments args = p.parse_args() - evaluateVariants(args) \ No newline at end of file + evaluateVariants(args) 
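For readers less familiar with cyvcf2, the access pattern introduced in this patch (a region query on the reader plus per-sample `genotypes` and `format` lookups) can be reduced to a short, self-contained sketch. The file path, coordinates, and variable names below are illustrative placeholders, not repository data:

#Minimal sketch of the cyvcf2 access pattern used in EvaluateVariants.py
#(illustrative only; the path and coordinates are placeholders)
import cyvcf2

vcf_reader = cyvcf2.VCF('sample.vcf.gz')
assert len(vcf_reader.samples) == 1
sample_index = 0

chrom, start, end = '1', 1000000, 1001000
#mirror the chr-prefix fallback used above
if chrom not in vcf_reader.seqnames and 'chr' + chrom in vcf_reader.seqnames:
    chrom = 'chr' + chrom

for variant in vcf_reader(f'{chrom}:{start}-{end}'):
    gt = variant.genotypes[sample_index]   #e.g. [0, 1, False] -> two allele indices plus a phasing flag
    ad = variant.format('AD')              #per-sample allele depths, or None if AD is absent
    ad = list(ad[sample_index]) if ad is not None else None
    print(variant.CHROM, variant.POS, variant.REF, variant.ALT, gt, ad)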
From 0c409792cf0bee296183fcf4ba08a9d462325101 Mon Sep 17 00:00:00 2001
From: holtjma
Date: Thu, 2 Jun 2022 16:25:23 -0500
Subject: [PATCH 12/16] README updates

---
 README.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 002409e..4ba7b20 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,12 @@ We have currently configured the pipeline to run using an LSF cluster. If a dif
 2. If running locally, configuration changes to the snakemake command itself may be necessary. These are located in variable `snakemakeFrags`, contact the repo owner or submit a ticket for assistance.
 
 ### Training Configuration (Optional)
-There are several options available that adjust way training of models is performed (e.g. models used, hyperparameters, training method, etc.). These options are available in `TrainingConfig.py`. Details on each will be in a future release.
+There are several options available that adjust the way training of models is performed (e.g. models used, hyperparameters, training method, etc.).
+These options are available in `TrainingConfig.py` and are generally described in-line with each parameter.
+However, some are of critical importance and should be considered before training:
+
+1. `ENABLE_AUTO_TARGET` - If set to `True`, this option enables an alternate method for determining the target recall: it is calculated automatically from the observed precision and the desired global precision (another parameter, `GLOBAL_AUTO_TARGET_PRECISION`). *Global precision* in this context is the combined precision of the upstream pipeline followed by STEVE identifying false positive calls from that pipeline. For example, if the desired global precision is 99% and the upstream pipeline achieves 98% precision by itself, then the models trained by STEVE need to capture 1% out of the 2% false positives to achieve 99% global precision. This means the target recall will be set to 0.5000, indicating that half of all false positives need to be identified to achieve the desired global precision (a worked sketch of this arithmetic follows this patch). This approach allows pipelines that _already_ have a high precision to have lower/easier targets in STEVE while achieving the same global precision. In practice, this allows you to swap the upstream pipeline without needing to recalculate/adjust the target/accepted recall values to account for a more or less precise upstream pipeline.
+2. `ENABLE_FEATURE_SELECTION` - If set to `True`, this option enables feature selection prior to model training. Numerous parameters adjust how exactly the feature selection is performed. In general, enabling feature selection leads to longer training times, but may improve the results by removing unnecessary and/or redundant features using a systematic approach.
 
 ### Setting up conda environment
 Assuming conda or miniconda is installed, use the following two commands to create and then activate a conda environment for the pipeline:
 ```
@@ -53,7 +58,7 @@ conda activate steve
 ```
 
 Assuming the above configuration steps were successful, all that remains is to run the pipeline itself. Here is an example execution of the pipeline used in the paper:
 ```
-python3 scripts/RunTrainingPipeline.py -dest -x ./GIAB_v1.json
+python3 scripts/RunTrainingPipeline.py -dest -x ./sample_metadata/GIAB_v1.json
 ```
 For details on each option, run `python3 scripts/RunTrainingPipeline.py -h`. The following sections describe what the pipeline is actually doing.
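For concreteness, the auto-target derivation described in the README change above (and implemented in trainAllClassifiers earlier in this series) amounts to only a few lines of arithmetic. The sketch below is illustrative only; the function name and example numbers are not repository code:

#Illustrative sketch of the auto-target recall derivation (not repository code)
def derive_recall_targets(raw_precision, global_target, breakpoint_count=10):
    #fraction of the upstream pipeline's false calls that the models must capture
    lower_target_recall = (global_target - raw_precision) / (1.0 - raw_precision)
    #evenly spaced targets between that minimum and 1.0, plus the fixed extra targets
    delta = (1.0 - lower_target_recall) / breakpoint_count
    targets = [lower_target_recall + i * delta for i in range(breakpoint_count)]
    targets += [0.9999, 1.0000]
    return sorted(targets, reverse=True)

#README example: 98% upstream precision with a 99% global goal -> 0.5 target recall
targets = derive_recall_targets(0.98, 0.99)
print(round(min(targets), 4))              # 0.5

#a model reaching recall r on the false calls yields this combined precision,
#matching the 'global_prec' column printed by summarizeResults
r = 0.5
print(round(0.98 + (1.0 - 0.98) * r, 4))   # 0.99

Anchoring the targets to the global precision goal (rather than to fixed recall values) is what lets the same configuration be reused when the upstream pipeline's own precision changes.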
From 2117a81321094938e3e06a817e6a55b0f34f9d80 Mon Sep 17 00:00:00 2001 From: holtjma Date: Thu, 2 Jun 2022 16:27:05 -0500 Subject: [PATCH 13/16] update conda yaml --- scripts/TrainingPipeline.smk | 2 + scripts/conda_config.yaml | 106 ++++++++++++++++++++++++++++++----- 2 files changed, 95 insertions(+), 13 deletions(-) diff --git a/scripts/TrainingPipeline.smk b/scripts/TrainingPipeline.smk index 5bfe071..e4ac6cc 100644 --- a/scripts/TrainingPipeline.smk +++ b/scripts/TrainingPipeline.smk @@ -114,6 +114,8 @@ rule ExtractFeatures: params: script=EXTRACT_SCRIPT, prefix="{pipeline_dir}/features" + resources: + mem_mb=8192 #conda: # CONDA_ENV log: "{pipeline_dir}/logs/features/{reference}/{aligner}/{caller}/{slid}.log" diff --git a/scripts/conda_config.yaml b/scripts/conda_config.yaml index 709c0dd..7dc07cd 100644 --- a/scripts/conda_config.yaml +++ b/scripts/conda_config.yaml @@ -1,15 +1,95 @@ -name: steve +name: steve-3.10 channels: - - conda-forge - - bioconda - - defaults + - conda-forge + - bioconda + - defaults dependencies: - - python=3.7.4 - - eli5=0.10.1 - - imbalanced-learn=0.6.2 - - pysam=0.15.4 - - PyVCF=0.6.8 - - scikit-learn=0.22.2.post1 - - scikit-optimize=0.7.4 - - snakemake=5.11.1 - - xgboost=1.0.2 + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2022.5.18.1=ha878542_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=12.1.0=h8d9b700_16 + - libgomp=12.1.0=h8d9b700_16 + - libnsl=2.0.0=h7f98852_0 + - libuuid=2.32.1=h7f98852_1000 + - libzlib=1.2.12=h166bdaf_0 + - ncurses=6.3=h27087fc_1 + - openssl=3.0.3=h166bdaf_0 + - pip=22.1.2=pyhd8ed1ab_0 + - python=3.10.1=h543edf9_2_cpython + - python_abi=3.10=2_cp310 + - readline=8.1=h46c0cb4_0 + - sqlite=3.38.5=h4ff8645_0 + - tk=8.6.12=h27826a3_0 + - tzdata=2022a=h191b570_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - xz=5.2.5=h516909a_1 + - zlib=1.2.12=h166bdaf_0 + - pip: + - appdirs==1.4.4 + - attrs==21.4.0 + - certifi==2022.5.18.1 + - charset-normalizer==2.0.12 + - click==8.1.3 + - coloredlogs==15.0.1 + - configargparse==1.5.3 + - connection-pool==0.0.3 + - csl-pipeline-config==1.0.9 + - cycler==0.11.0 + - cyvcf2==0.30.15 + - datrie==0.8.2 + - decorator==5.1.1 + - docutils==0.18.1 + - dpath==2.0.6 + - eli5==0.13.0 + - fastjsonschema==2.15.3 + - fonttools==4.33.3 + - gitdb==4.0.9 + - gitpython==3.1.27 + - humanfriendly==10.0 + - idna==3.3 + - imbalanced-learn==0.9.1 + - jinja2==3.1.2 + - joblib==1.1.0 + - jsonschema==4.6.0 + - jupyter-core==4.10.0 + - kiwisolver==1.4.2 + - markupsafe==2.1.1 + - matplotlib==3.5.2 + - nbformat==5.4.0 + - numpy==1.22.4 + - packaging==21.3 + - pillow==9.1.1 + - plac==1.3.5 + - psutil==5.9.1 + - pulp==2.6.0 + - py==1.11.0 + - pyaml==21.10.1 + - pyparsing==3.0.9 + - pyrsistent==0.18.1 + - python-dateutil==2.8.2 + - python-graphviz==0.20 + - pyyaml==6.0 + - ratelimiter==1.2.0.post0 + - requests==2.27.1 + - retry==0.9.2 + - scikit-learn==1.1.1 + - scikit-optimize==0.9.0 + - scipy==1.8.1 + - setuptools==62.3.2 + - six==1.16.0 + - smart-open==6.0.0 + - smmap==5.0.0 + - snakemake==7.8.1 + - stopit==1.1.2 + - tabulate==0.8.9 + - threadpoolctl==3.1.0 + - toposort==1.7 + - traitlets==5.2.2.post1 + - urllib3==1.26.9 + - wrapt==1.14.1 + - xgboost==1.6.1 + - yte==1.4.0 +prefix: /cluster/home/jholt/githubDL/miniconda3/envs/steve-3.10 From af6e6eaff0578dd0c6e95eaefeb7d61c1bf0c86a Mon Sep 17 00:00:00 2001 From: holtjma Date: Tue, 7 Jun 2022 09:23:24 -0500 Subject: [PATCH 14/16] pipeline config removal --- scripts/conda_config.yaml | 1 - 1 file 
changed, 1 deletion(-) diff --git a/scripts/conda_config.yaml b/scripts/conda_config.yaml index 7dc07cd..3df638a 100644 --- a/scripts/conda_config.yaml +++ b/scripts/conda_config.yaml @@ -36,7 +36,6 @@ dependencies: - coloredlogs==15.0.1 - configargparse==1.5.3 - connection-pool==0.0.3 - - csl-pipeline-config==1.0.9 - cycler==0.11.0 - cyvcf2==0.30.15 - datrie==0.8.2 From 3e97832fede01bb48f92070fb7b967c1706a4375 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 20 Jul 2022 14:24:46 -0500 Subject: [PATCH 15/16] final version of training config --- scripts/PipelineConfig.py | 3 ++- scripts/PrintModelReport.py | 3 ++- scripts/TrainingConfig.py | 4 ++-- scripts/model_metrics.json | 5 ++++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/PipelineConfig.py b/scripts/PipelineConfig.py index 14a9ca1..9352359 100644 --- a/scripts/PipelineConfig.py +++ b/scripts/PipelineConfig.py @@ -14,7 +14,8 @@ SAMPLE_JSONS = [ #'%s/scripts/GIAB_v1.json' % REPO_DIRECTORY #'%s/scripts/GIAB_v2.json' % REPO_DIRECTORY - f'{REPO_DIRECTORY}/sample_metadata/GIAB_pcrfree.json' + #f'{REPO_DIRECTORY}/sample_metadata/GIAB_pcrfree.json' + f'{REPO_DIRECTORY}/sample_metadata/GIAB_pcrfree_v4.2.1.json' ] #root directory for RTG-based analysis which is expected to conform to a standard directory structure, and contain diff --git a/scripts/PrintModelReport.py b/scripts/PrintModelReport.py index 917ce51..cf10bcf 100644 --- a/scripts/PrintModelReport.py +++ b/scripts/PrintModelReport.py @@ -173,6 +173,7 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels, globa print(*rowVals, sep='\t') else: bestModelTargetRecall = clinDict['eval_recall'] + str_bestModelTargetRecall = '{0:0.4f}'.format(float(bestModelTargetRecall)) #copy TPR results bestTPR = stats[bestModelName]['ALL_SUMMARY'][bestModelTargetRecall]['TEST_TPR'][0] @@ -185,7 +186,7 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels, globa bestFPR = stats[bestModelName]['ALL_SUMMARY'][bestModelTargetRecall]['TEST_FPR'][0] rowVals = [ - reformKey, bestModelName, bestModelTargetRecall, + reformKey, bestModelName, str_bestModelTargetRecall, '%0.4f+-%0.4f' % (bestTPRAvg, bestTPRStd), '%0.4f' % bestTPR, '%0.4f+-%0.4f' % (bestFPRAvg, bestFPRStd), diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 9ce0fbe..9c73d3b 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -34,7 +34,7 @@ # e.g. if based precision is 0.999, then the model needs to recover 0.0009 of the 0.001 that is not captured # e.g. 
90% derived target recall for the models GLOBAL_AUTO_TARGET_PRECISION = 0.99996 -AUTO_TARGET_BREAKPOINT_COUNT = 10 #the number of point inbetween the target and 1.00 to calculate, must be > 0 +AUTO_TARGET_BREAKPOINT_COUNT = 20 #the number of point inbetween the target and 1.00 to calculate, must be > 0 AUTO_EXTRA_TARGETS = [0.9999, 1.0000] #these are always added #if the auto-targets is disabled, these will be used as targets instead @@ -49,7 +49,7 @@ #if True, this will pre-pend an automated feature selection step to each pipeline #WARNING: this can be a time-consuming process, the reduce run-time increase FS_STEP_SIZE and/or FS_MIN_FEATURE_COUNT ENABLE_FEATURE_SELECTION = True -FS_MIN_FEATURE_COUNT = 10 #always keep at least this many features +FS_MIN_FEATURE_COUNT = 20 #always keep at least this many features FS_STEP_SIZE = 0.1 #if an int, the number of features to remove at each step; if a float, the fraction of features to remove at each step FEATURE_SELECTION_MODELS = [ RFECV( diff --git a/scripts/model_metrics.json b/scripts/model_metrics.json index bc81e53..f001df1 100644 --- a/scripts/model_metrics.json +++ b/scripts/model_metrics.json @@ -39,13 +39,16 @@ "AFO", "GT", "DP", "GQ", "min(PL)" ], "INFO" : [ - "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", + "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", "FS", "GC", "ML_PROB", "MQ", "MQ0", "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", "SOR", "STR", "UDP" ], "MUNGED" : [ "DP_DP", "FILTER", "ID", "NEARBY", "QUAL" + ], + "IGNORE" : [ + ] } }, From 9f8d1908d5bbc6e0d5f23fe33050834a614b7288 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 20 Jul 2022 15:07:28 -0500 Subject: [PATCH 16/16] making clinical the default --- scripts/EvaluateVariants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/EvaluateVariants.py b/scripts/EvaluateVariants.py index 1beae67..0c64492 100644 --- a/scripts/EvaluateVariants.py +++ b/scripts/EvaluateVariants.py @@ -574,7 +574,7 @@ def loadCodicemVariants(csvFN): #optional arguments with default p.add_argument('-c', '--codicem', dest='codicem', default=None, help='a Codicem CSV file with variants to evaluate (default: None)') p.add_argument('-v', '--variants', dest='variants', default=None, help='variant coordinates to evaluate (default: None)') - p.add_argument('-m', '--model', dest='model', default='best', help='the model name to use (default: best)') + p.add_argument('-m', '--model', dest='model', default='clinical', help='the model name to use (default: clinical)') p.add_argument('-r', '--recall', dest='recall', default='0.99', help='the target recall value from training (default: 0.99)') p.add_argument('-o', '--output', dest='outFN', default=None, help='the place to send output to (default: stdout)')