From 64b2d3b13ffeb51be5a5a3310d17b4fb72919e7b Mon Sep 17 00:00:00 2001 From: holtjma Date: Tue, 3 May 2022 10:39:13 -0500 Subject: [PATCH 01/16] changes necessary to get feature extracton on cyvcf2 and training to run --- scripts/ExtractFeatures.py | 113 ++++++++++++++++++++++--------------- scripts/PipelineConfig.py | 18 ++++-- scripts/TrainModels.py | 2 +- scripts/cluster.json | 1 - scripts/model_metrics.json | 13 +++++ 5 files changed, 95 insertions(+), 52 deletions(-) diff --git a/scripts/ExtractFeatures.py b/scripts/ExtractFeatures.py index de8ef02..4f8c33d 100644 --- a/scripts/ExtractFeatures.py +++ b/scripts/ExtractFeatures.py @@ -5,7 +5,7 @@ import numpy as np import os import re -import vcf +import cyvcf2 from PipelineConfig import DATA_DIRECTORY, REPO_DIRECTORY from RunTrainingPipeline import parseSlids @@ -47,43 +47,44 @@ assert(k not in ALL_METRICS) ALL_METRICS[k] = ALL_METRICS[UNPARSED_METRICS['copied'][k]] -def getVariantFeatures(variant, sampleID, fields, rawReader, allowHomRef=False): +def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=False): ''' This function takes a variant from a VCF and a sample ID and extracts the features for it @param variant - the variant (from "vcfReader") - @param sampleID - the sample in the VCF file + @param sample_index - the sample index in the VCF file (0-based) @param fields - the fields to extract from the variant @param rawReader - the raw VCF file @param allowHomRef - if True, this will allow 0/0 calls (otherwise, it throws an error) @return - a list of ordered features ''' annots = [] - - #first, annotate whether it's a SNP or an indel - callStats = variant.genotype(sampleID) - if '.' in callStats['GT']: - gtPieces = ['0', '0'] - else: - gtPieces = re.split('[/|]', callStats['GT']) - if len(gtPieces) != 2: + #first, annotate whether it's a SNP or an indel + gt_pieces = variant.genotypes[sample_index] + if len(gt_pieces) != 3: #TODO: only a problem in strelka, how to handle it? - return None + #return None + raise Exception(f'Unexpected GT field: {gt_pieces}') + + #explicitly store phasing and sort the GT field (this is important for ref/alt ordering) + is_phased = gt_pieces[-1] + gt_pieces = sorted(gt_pieces[0:-1]) if not allowHomRef: - assert(gtPieces[0] != '0' or gtPieces[1] != '0') + assert(gt_pieces[0] != 0 or gt_pieces[1] != 0) #look at the variant call - s1 = (variant.REF if gtPieces[0] == '0' else variant.ALT[int(gtPieces[0])-1]) - s2 = (variant.REF if gtPieces[1] == '0' else variant.ALT[int(gtPieces[1])-1]) + s1 = (variant.REF if gt_pieces[0] == 0 else variant.ALT[int(gt_pieces[0])-1]) + s2 = (variant.REF if gt_pieces[1] == 0 else variant.ALT[int(gt_pieces[1])-1]) + if str(s1) == '' or str(s2) == '': #ignore this one, it's something we can't figure out from a GVCF #TODO: do we feel like ever changing this? 
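        #note: callers need to tolerate this None result (e.g. gatherVcfMetrics now asserts on
        # len(annots) before its None check, so this early return would currently trip that assert
        # with a TypeError rather than being silently skipped)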
return None #pull this out for use by all the AD stat fields - callAD = callStats['AD'] - adSum = sum(callAD) + call_ad = variant.format('AD')[sample_index] + ad_sum = sum(call_ad) for k, subk in fields: if k == 'VAR': @@ -102,53 +103,59 @@ def getVariantFeatures(variant, sampleID, fields, rawReader, allowHomRef=False): #these are call specific measures # all sub-keys default to a FLOAT interpretation if not specifically handled (see "else" clause) if subk == 'GT': - if gtPieces[0] == gtPieces[1]: - if gtPieces[0] == '0': + if gt_pieces[0] == gt_pieces[1]: + if gt_pieces[0] == 0: val = GT_REF_HOM else: val = GT_ALT_HOM - elif gtPieces[0] == '0' or gtPieces[1] == '0': + elif gt_pieces[0] == 0 or gt_pieces[1] == 0: val = GT_REF_HET else: val = GT_HET_HET elif subk == 'AD0': - val = (callAD[int(gtPieces[0])] if adSum > 0 else DEFAULT_MISSING) + val = (call_ad[int(gt_pieces[0])] if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'AD1': - val = (callAD[int(gtPieces[1])] if adSum > 0 else DEFAULT_MISSING) + val = (call_ad[int(gt_pieces[1])] if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'ADO': - if gtPieces[0] == gtPieces[1]: + if gt_pieces[0] == gt_pieces[1]: #homozygous, pull AD once - adUsed = callAD[int(gtPieces[0])] + ad_used = call_ad[int(gt_pieces[0])] else: #heterozygous, get both AD vals - adUsed = callAD[int(gtPieces[0])] + callAD[int(gtPieces[1])] + ad_used = call_ad[int(gt_pieces[0])] + call_ad[int(gt_pieces[1])] #AD-other is the total AD minus the GT AD - val = ((adSum - adUsed) if adSum > 0 else DEFAULT_MISSING) + val = ((ad_sum - ad_used) if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'AF0': - val = (callAD[int(gtPieces[0])] / adSum if adSum > 0 else DEFAULT_MISSING) + val = (call_ad[int(gt_pieces[0])] / ad_sum if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'AF1': - val = (callAD[int(gtPieces[1])] / adSum if adSum > 0 else DEFAULT_MISSING) + val = (call_ad[int(gt_pieces[1])] / ad_sum if ad_sum > 0 else DEFAULT_MISSING) elif subk == 'AFO': - if gtPieces[0] == gtPieces[1]: + if gt_pieces[0] == gt_pieces[1]: #homozygous, pull AD once - adUsed = callAD[int(gtPieces[0])] + adUsed = call_ad[int(gt_pieces[0])] else: #heterozygous, get both AD vals - adUsed = callAD[int(gtPieces[0])] + callAD[int(gtPieces[1])] + adUsed = call_ad[int(gt_pieces[0])] + call_ad[int(gt_pieces[1])] #AD-other is the total AD minus the GT AD - val = ((adSum - adUsed) / adSum if adSum > 0 else DEFAULT_MISSING) + val = ((ad_sum - ad_used) / ad_sum if ad_sum > 0 else DEFAULT_MISSING) + + elif subk == 'min(PL)': + #this is the phred likelihoods, one of which should always be zero + pl_values = list(variant.format('PL')[sample_index]) + pl_values.remove(0) + val = min(pl_values) else: try: - val = float(callStats[subk]) + val = float(variant.format(subk)[sample_index]) except AttributeError as e: #TODO: is this okay for everything? 
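                    #note: with cyvcf2, a missing FORMAT tag is not guaranteed to raise AttributeError
                    # (variant.format(...) may return None, or raise KeyError in some versions), so only
                    # some failure modes fall through to DEFAULT_MISSING below; a more defensive sketch
                    # might look like:
                    #   fmt_vals = variant.format(subk)
                    #   val = float(fmt_vals[sample_index]) if fmt_vals is not None else DEFAULT_MISSING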
val = DEFAULT_MISSING @@ -157,7 +164,7 @@ def getVariantFeatures(variant, sampleID, fields, rawReader, allowHomRef=False): elif k == 'INFO': #get from the INFO column # all values here default to a FLOAT interpretation - val = (float(variant.INFO[subk]) if (subk in variant.INFO) else DEFAULT_MISSING) + val = float(variant.INFO.get(subk, DEFAULT_MISSING)) annots.append(val) elif k == 'MUNGED': @@ -184,17 +191,27 @@ def getVariantFeatures(variant, sampleID, fields, rawReader, allowHomRef=False): #search the raw VCF for nearby calls nearbyFound = 0 BUFFER = 20 - for rawVariant in rawReader.fetch(variant.CHROM, variant.POS-BUFFER, variant.POS+BUFFER): + for rawVariant in rawReader(f'{variant.CHROM}:{variant.POS-BUFFER}-{variant.POS+BUFFER}'): if rawVariant.POS != variant.POS: - rawData = rawVariant.genotype(sampleID) - if rawData['GT'] not in ['./.', '0/0']: + raw_gt = rawVariant.genotypes[sample_index] + assert(len(raw_gt) == 3) + if raw_gt[0] > 0 or raw_gt[1] > 0: nearbyFound += 1 annots.append(nearbyFound) elif subk == 'FILTER': - #store the number of filters applied (PASS doesn't show up in these lists) - annots.append(len(variant.FILTER)) + #store the number of filters applied + if variant.FILTERS: + if 'PASS' in variant.FILTERS: + #PASS can show up here, if it exist we want to remove it from the count + val = len(variant.FILTERS)-1 + else: + #else just count up all the filters + val = len(variant.FILTERS) + else: + val = 0 + annots.append(val) elif subk == 'ID': #this is basically a boolean flag capturing if the variant has an ID annotation @@ -223,13 +240,13 @@ def gatherVcfMetrics(vcfFN, rawVCF, metrics): fields = [] #open up the VCF file - vcfReader = vcf.Reader(filename=vcfFN, compressed=True) - rawReader = vcf.Reader(filename=rawVCF, compressed=True) + vcfReader = cyvcf2.VCF(vcfFN) + rawReader = cyvcf2.VCF(rawVCF) - #get the sample + #get the sample, right now we are enforcing one sample per VCF samples = vcfReader.samples assert(len(samples) == 1) - sampleID = samples[0] + sample_index = 0 #fill out the fields; always include a variant type field fields.append(('VAR', 'TYPE')) @@ -243,7 +260,15 @@ def gatherVcfMetrics(vcfFN, rawVCF, metrics): #now we go through for variant in vcfReader: - annots = getVariantFeatures(variant, sampleID, fields, rawReader) + annots = getVariantFeatures(variant, sample_index, fields, rawReader) + assert(len(annots) == len(fields)) + ''' + print(annots) + for i, f in enumerate(fields): + print(i, f, annots[i], sep='\t') + print() + exit() + ''' if annots != None: ret.append(annots) if len(ret) % 100000 == 0: diff --git a/scripts/PipelineConfig.py b/scripts/PipelineConfig.py index 77428d5..00a98a2 100644 --- a/scripts/PipelineConfig.py +++ b/scripts/PipelineConfig.py @@ -1,15 +1,20 @@ +#this is a module we use for storing all our paths systematically, aka a bunch of strings +from csl_pipeline.config import ( + JHOLT_HOME +) + ############################################################ #Core pipeline config ############################################################ #this is the repo directory -REPO_DIRECTORY = '/cluster/home/jholt/sanger_less_tests' +REPO_DIRECTORY = f'{JHOLT_HOME}/sanger_less_tests' #a dictionary containing metadata for the samples SAMPLE_JSONS = [ #'%s/scripts/GIAB_v1.json' % REPO_DIRECTORY #'%s/scripts/GIAB_v2.json' % REPO_DIRECTORY - '%s/scripts/GIAB_pcrfree.json' % REPO_DIRECTORY + f'{REPO_DIRECTORY}/scripts/GIAB_pcrfree.json' ] #root directory for RTG-based analysis which is expected to conform to a standard directory structure, 
and contain @@ -17,7 +22,7 @@ # {DATA_DIRECTORY}/variant_calls/{aligner}/{caller}/{sample}.vcf.gz - the expected VCF file paths # {DATA_DIRECTORY}/rtg_results/{aligner}/{caller}/{sample} - folder containing standard RTG results, specifically summary.txt, fp.vcf.gz, and tp.vcf.gz #DATA_DIRECTORY = '/cluster/home/jholt/csl_validations/core_pipeline_analysis/pipeline' -DATA_DIRECTORY = '/cluster/home/jholt/CSL_pipeline_benchmark/pipeline' +DATA_DIRECTORY = f'{JHOLT_HOME}/CSL_pipeline_benchmark/pipeline' #this is where you specify the pipelines to run; every REFERENCE, ALIGNER, and VARIANT_CALLER combo will be run together # and each triplet gets it's own model trained for it @@ -37,7 +42,8 @@ FULL_PIPES = [ #('dragen-07.011.352.3.2.8b', 'dragen-07.011.352.3.2.8b'), #this is what we originally trained with #('dragen-07.021.510.3.5.7', 'dragen-07.021.510.3.5.7'), #this is what follow ups were on - ('clinical', 'sentieon-201808.07', 'strelka-2.9.10') + #('clinical', 'sentieon-201808.07', 'strelka-2.9.10') + ('hg38_T2T_masked', 'sentieon-202112.01', 'dnascope-1.0-202112.01-PO') ] #pipeline parameters, thread count currently only applies to the actual model training (CV to be precise) @@ -48,7 +54,7 @@ ############################################################ #only necessary if you plan to generate a data summary (i.e. the Supplementary Document) -LATEX_PATH = '/cluster/home/jholt/texlive/2019/bin/x86_64-linux/pdflatex' +LATEX_PATH = f'{JHOLT_HOME}/texlive/2019/bin/x86_64-linux/pdflatex' ############################################################ #Slack config @@ -59,5 +65,5 @@ #json with sub-dictionary 'urls' where key is a channel name (such as the value in ENABLED_SLACK_CHANNEL) # and value is a Slack endpoint URL -ENABLED_SLACK_URLS = '/cluster/home/jholt/slack_integration/data/slack_urls.json' +ENABLED_SLACK_URLS = f'{JHOLT_HOME}/slack_integration/data/slack_urls.json' ENABLED_SLACK_CHANNEL = "@holtjma" diff --git a/scripts/TrainModels.py b/scripts/TrainModels.py index ac21ef5..3ab979e 100644 --- a/scripts/TrainModels.py +++ b/scripts/TrainModels.py @@ -319,7 +319,7 @@ def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, co ) else: raise Exception('NO_IMPL') - gsClf.fit(train_X, train_Y, train_groups) + gsClf.fit(train_X, train_Y, groups=train_groups) print('[%s]\t\tBest params: %s' % (str(datetime.datetime.now()), gsClf.best_params_)) print('[%s]\tGathering CV results...'% (str(datetime.datetime.now()), )) diff --git a/scripts/cluster.json b/scripts/cluster.json index 38aed1a..6a94506 100644 --- a/scripts/cluster.json +++ b/scripts/cluster.json @@ -1,7 +1,6 @@ { "__default__": { "queue" : "csl", - "logdir" : "/gpfs/gpfs1/home/jholt/sanger_less_tests/pipeline/logs/cluster", "memory" : 8000, "mem_mb" : 8000 } diff --git a/scripts/model_metrics.json b/scripts/model_metrics.json index 473d73d..f59cda2 100644 --- a/scripts/model_metrics.json +++ b/scripts/model_metrics.json @@ -32,6 +32,19 @@ "MUNGED" : [ "QUAL", "NEARBY", "FILTER", "ID" ] + }, + "dnascope-1.0-202112.01-PO" : { + "CALL" : [ + "AD0", "AD1", "ADO", "AF0", "AF1", "AFO", "GT", "DP", "GQ", "min(PL)" + ], + "INFO" : [ + "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", + "FS", "GC", "ML_PROB", "MQ", "MQ0", + "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", "SOR", "STR", "UDP" + ], + "MUNGED" : [ + "QUAL", "NEARBY", "FILTER", "ID" + ] } }, "copied" : { From 19419510eb5bb2d31fe92597c10d469b177b7a3b Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 4 May 2022 07:31:14 -0500 Subject: 
[PATCH 02/16] temp backup --- scripts/TrainingConfig.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 7a3c2c9..b5b5f78 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -25,7 +25,7 @@ SUBSET_SIZE = 10000 #if True, manually remove features in MANUAL_FS_LABELS (these are historically unimportant features) -MANUAL_FS = True +MANUAL_FS = False MANUAL_FS_LABELS = ['CALL-ADO', 'CALL-AFO'] #if True, mark false positive as true positives and vice versa From 7245fd5eacf23bc01e3816d64f9b32eb2978d3c5 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 4 May 2022 09:04:46 -0500 Subject: [PATCH 03/16] removing this fixes stats, this will need to be full patched --- scripts/EvaluateVariants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/EvaluateVariants.py b/scripts/EvaluateVariants.py index e61af3d..11d3c4b 100644 --- a/scripts/EvaluateVariants.py +++ b/scripts/EvaluateVariants.py @@ -5,7 +5,7 @@ import numpy as np import pickle import re -import vcf +#import vcf from ExtractFeatures import ALL_METRICS, getVariantFeatures, GT_TRANSLATE, VAR_TRANSLATE From 8b7e511e396aa97250f032ec9832172d1ae6ef0c Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 4 May 2022 09:44:56 -0500 Subject: [PATCH 04/16] fixes for ref/al/caller setup --- scripts/ExtractELI5Results.py | 2 +- scripts/GenerateSupplement.py | 39 +++++++++++++++++++---------------- scripts/PipelineConfig.py | 2 +- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/scripts/ExtractELI5Results.py b/scripts/ExtractELI5Results.py index 27b325b..248d482 100644 --- a/scripts/ExtractELI5Results.py +++ b/scripts/ExtractELI5Results.py @@ -60,7 +60,7 @@ def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): coreFeatureNames = [tuple(f.split('-')) for f in allModels[k][bestModelName]['FEATURES']] #get the weights and add it to the result - weights = eli5.explain_weights(clf, feature_names=['-'.join(cfn) for cfn in coreFeatureNames]) + weights = eli5.explain_weights(clf, feature_names=['-'.join(cfn) for cfn in coreFeatureNames], top=None) dictForm = eli5.formatters.as_dict.format_as_dict(weights) ret[reformKey] = { 'best_model' : bestModelName, diff --git a/scripts/GenerateSupplement.py b/scripts/GenerateSupplement.py index 1122075..c5611fe 100644 --- a/scripts/GenerateSupplement.py +++ b/scripts/GenerateSupplement.py @@ -33,22 +33,23 @@ def loadMetadata(): md.update(d) return md -def getModelResults(datasets, pipeline, aligner, caller): +def getModelResults(datasets, pipeline, reference, aligner, caller): ''' This will pull ALL the results for an aligner/caller combination and put it in a dictionary to be later consumed by Jinja2. 
@param pipeline - the pipeline subdirectory + @param reference - the reference to get information for @param aligner - the aligner to get information for @param caller - the caller to get information for @return - a dictionary values for use by Jinja2 ''' #get all the RTG-specific results for this aligner/caller combo - rtgResults = getRtgResults(datasets, pipeline, aligner, caller) + rtgResults = getRtgResults(datasets, pipeline, reference, aligner, caller) #get all the training results for this aligner/caller combo - trainingResults = getTrainingResults(pipeline, aligner, caller) - strictResults = getTrainingResults(pipeline, aligner, caller, True) - eli5Results = getEli5Results(pipeline, aligner, caller) + trainingResults = getTrainingResults(pipeline, reference, aligner, caller) + strictResults = getTrainingResults(pipeline, reference, aligner, caller, True) + eli5Results = getEli5Results(pipeline, reference, aligner, caller) retDict = { 'ALIGNER' : aligner, @@ -60,17 +61,18 @@ def getModelResults(datasets, pipeline, aligner, caller): } return retDict -def getRtgResults(datasets, pipeline, aligner, caller): +def getRtgResults(datasets, pipeline, reference, aligner, caller): ''' This will load results specific to the RTG evaluation, things like number of TP, TN, etc. @param pipeline - the pipeline subdirectory (usually "pipeline") + @param reference - the reference to get information for @param aligner - the aligner to get information for @param caller - the caller to get information for @return - a dictionary values for use by Jinja2 ''' retDict = {} - rtgSummaryPattern = '%s/rtg_results/%s/%s/*/summary.txt' % (DATA_DIRECTORY, aligner, caller) + rtgSummaryPattern = '%s/rtg_results/%s/%s/%s/*/summary.txt' % (DATA_DIRECTORY, reference, aligner, caller) rtgMetrics = sorted(glob.glob(rtgSummaryPattern)) retDict['SAMPLE_SUMMARY'] = {} totalDict = {} @@ -103,7 +105,7 @@ def getRtgResults(datasets, pipeline, aligner, caller): retDict['TOTAL_SUMMARY'] = totalDict #get the feature stats - summaryFN = '%s/%s/feature_stats/%s/%s/feature_stats.tsv' % (REPO_DIRECTORY, pipeline, aligner, caller) + summaryFN = '%s/%s/feature_stats/%s/%s/%s/feature_stats.tsv' % (REPO_DIRECTORY, pipeline, reference, aligner, caller) if os.path.exists(summaryFN): featureDict = {} fp = open(summaryFN, 'r') @@ -173,10 +175,11 @@ def loadRtgSummary(fn): resultsDict[h] = val return resultsDict -def getTrainingResults(pipeline, aligner, caller, strict=False): +def getTrainingResults(pipeline, reference, aligner, caller, strict=False): ''' This will retrieve training results for the specified aligner/caller combo @param pipeline - the pipeline subdirectory + @param reference - the reference specified @param aligner - the aligner specified @param caller - the caller specified @param strict - if True, load the strict version of the results @@ -184,9 +187,9 @@ def getTrainingResults(pipeline, aligner, caller, strict=False): ''' #pull the summary results and parse the clinical fragments if not strict: - summaryFN = '%s/%s/model_summaries/%s/%s/model_summary.tsv' % (REPO_DIRECTORY, pipeline, aligner, caller) + summaryFN = '%s/%s/model_summaries/%s/%s/%s/model_summary.tsv' % (REPO_DIRECTORY, pipeline, reference, aligner, caller) else: - summaryFN = '%s/%s/model_summaries/%s/%s/strict_summary.tsv' % (REPO_DIRECTORY, pipeline, aligner, caller) + summaryFN = '%s/%s/model_summaries/%s/%s/%s/strict_summary.tsv' % (REPO_DIRECTORY, pipeline, reference, aligner, caller) clinicalResults = {} fp = open(summaryFN, 'r') @@ -244,7 +247,7 @@ def 
getTrainingResults(pipeline, aligner, caller, strict=False): continue reformKey = VAR_TRANSLATE[vt]+'_'+GT_TRANSLATE[gt] - imageFN = '%s/%s/model_summaries/%s/%s/roc_curves/%s.png' % (REPO_DIRECTORY, pipeline, aligner, caller, reformKey) + imageFN = '%s/%s/model_summaries/%s/%s/%s/roc_curves/%s.png' % (REPO_DIRECTORY, pipeline, reference, aligner, caller, reformKey) if os.path.exists(imageFN): imageDict[reformKey] = imageFN else: @@ -258,7 +261,7 @@ def getTrainingResults(pipeline, aligner, caller, strict=False): } return ret -def getEli5Results(pipeline, aligner, caller): +def getEli5Results(pipeline, reference, aligner, caller): ''' This will retrieve ELI5 results if available @param pipeline - the pipeline subdirectory @@ -266,7 +269,7 @@ def getEli5Results(pipeline, aligner, caller): @param caller - the caller specified @return - a dictionary of metrics for this combo ''' - jsonFN = '%s/%s/eli5_summaries/%s/%s/model_eli5.json' % (REPO_DIRECTORY, pipeline, aligner, caller) + jsonFN = '%s/%s/eli5_summaries/%s/%s/%s/model_eli5.json' % (REPO_DIRECTORY, pipeline, reference, aligner, caller) #catch the situation where we didn't run ELI5 for w/e reason ret = {} @@ -442,15 +445,15 @@ def generateReport(dataDict, prefix): for fullPipe in FULL_PIPES: combos.append(fullPipe) - for aligner, caller in combos: + for reference, aligner, caller in combos: #check for the summary file - summaryFN = '%s/%s/model_summaries/%s/%s/model_summary.tsv' % (REPO_DIRECTORY, args.pipeline, aligner, caller) + summaryFN = '%s/%s/model_summaries/%s/%s/%s/model_summary.tsv' % (REPO_DIRECTORY, args.pipeline, reference, aligner, caller) if not os.path.exists(summaryFN): - print('WARNING: Summary file for %s/%s was not found, re-run pipeline?') + print(f'WARNING: Summary file for {reference}/{aligner}/{caller} was not found, re-run pipeline?') continue #get the results - comboData = getModelResults(METADATA_DICT, args.pipeline, aligner, caller) + comboData = getModelResults(METADATA_DICT, args.pipeline, reference, aligner, caller) #rename comboData['ALIGNER_LABEL'] = ALIGNER_RENAMING.get(aligner, aligner) diff --git a/scripts/PipelineConfig.py b/scripts/PipelineConfig.py index 00a98a2..b2e17e6 100644 --- a/scripts/PipelineConfig.py +++ b/scripts/PipelineConfig.py @@ -14,7 +14,7 @@ SAMPLE_JSONS = [ #'%s/scripts/GIAB_v1.json' % REPO_DIRECTORY #'%s/scripts/GIAB_v2.json' % REPO_DIRECTORY - f'{REPO_DIRECTORY}/scripts/GIAB_pcrfree.json' + f'{REPO_DIRECTORY}/sample_metadata/GIAB_pcrfree.json' ] #root directory for RTG-based analysis which is expected to conform to a standard directory structure, and contain From 6d41e1b8160d5f02d87196e3cc1cb7527cba5a58 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 4 May 2022 12:30:16 -0500 Subject: [PATCH 05/16] first pass at pruning --- scripts/ExtractFeatures.py | 4 ++++ scripts/model_metrics.json | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/ExtractFeatures.py b/scripts/ExtractFeatures.py index 4f8c33d..4bf0ef2 100644 --- a/scripts/ExtractFeatures.py +++ b/scripts/ExtractFeatures.py @@ -219,6 +219,10 @@ def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=Fal else: raise Exception('Unhandled computed measurement: %s-%s' % (k, subk)) + elif k == 'IGNORE': + #this is just a dummy parameter we can ignore + #historically, these were tested and found to be uninformative to the models + pass else: raise Exception('Unexpected metric key: %s' % (k, )) diff --git a/scripts/model_metrics.json b/scripts/model_metrics.json index 
f59cda2..eaaa381 100644 --- a/scripts/model_metrics.json +++ b/scripts/model_metrics.json @@ -38,12 +38,15 @@ "AD0", "AD1", "ADO", "AF0", "AF1", "AFO", "GT", "DP", "GQ", "min(PL)" ], "INFO" : [ - "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", - "FS", "GC", "ML_PROB", "MQ", "MQ0", - "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", "SOR", "STR", "UDP" + "BaseQRankSum", "DP", "Entropy", "GC", "ML_PROB", "MQ", "MQ0", + "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", + "SOR", "UDP" ], "MUNGED" : [ - "QUAL", "NEARBY", "FILTER", "ID" + "DP_DP", "QUAL", "NEARBY", "ID" + ], + "IGNORE" : [ + "MUNGED-FILTER", "INFO-ExcessHet", "INFO-STR", "INFO-ClippingRankSum", "INFO-FS" ] } }, From 29ad64dc6884deff9605b9a81b504e69240109a2 Mon Sep 17 00:00:00 2001 From: holtjma Date: Thu, 5 May 2022 16:15:33 -0500 Subject: [PATCH 06/16] added configuration for feature selection --- scripts/ExtractFeatures.py | 19 +++++++++++---- scripts/RunTrainingPipeline.py | 4 ++++ scripts/TrainModels.py | 30 +++++++++++++++++++++-- scripts/TrainingConfig.py | 44 ++++++++++++++++++++++------------ scripts/model_metrics.json | 13 +++++----- 5 files changed, 81 insertions(+), 29 deletions(-) diff --git a/scripts/ExtractFeatures.py b/scripts/ExtractFeatures.py index 4bf0ef2..bd3e01b 100644 --- a/scripts/ExtractFeatures.py +++ b/scripts/ExtractFeatures.py @@ -43,6 +43,9 @@ ALL_METRICS = {} for k in UNPARSED_METRICS['defined']: ALL_METRICS[k] = UNPARSED_METRICS['defined'][k] + if 'IGNORE' in ALL_METRICS[k]: + #these are metrics we are tracking but ignoring; usually they were tested but uninformative for the models + del ALL_METRICS[k]['IGNORE'] for k in UNPARSED_METRICS['copied']: assert(k not in ALL_METRICS) ALL_METRICS[k] = ALL_METRICS[UNPARSED_METRICS['copied'][k]] @@ -165,6 +168,8 @@ def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=Fal #get from the INFO column # all values here default to a FLOAT interpretation val = float(variant.INFO.get(subk, DEFAULT_MISSING)) + if np.isnan(val): + val = DEFAULT_MISSING annots.append(val) elif k == 'MUNGED': @@ -173,7 +178,9 @@ def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=Fal #call depth / total variant depth; usually ~1.0 if single sample try: dpCall = callStats['DP'] - dpVar = variant.INFO['DP'] + dpVar = variant.INFO.get('DP', DEFAULT_MISSING) + if dpVar == 0 or dpVar == DEFAULT_MISSING: + raise Exception('found the error') dpRat = dpCall / dpVar annots.append(dpRat) except: @@ -219,13 +226,15 @@ def getVariantFeatures(variant, sample_index, fields, rawReader, allowHomRef=Fal else: raise Exception('Unhandled computed measurement: %s-%s' % (k, subk)) - elif k == 'IGNORE': - #this is just a dummy parameter we can ignore - #historically, these were tested and found to be uninformative to the models - pass else: raise Exception('Unexpected metric key: %s' % (k, )) + if np.isnan(annots).sum() > 0 or np.isinf(annots).sum() > 0: + print(variant) + for i, f in enumerate(fields): + print('', i, f, annots[i], sep='\t') + raise Exception('NaN or inf found found, this should not happen') + return annots def gatherVcfMetrics(vcfFN, rawVCF, metrics): diff --git a/scripts/RunTrainingPipeline.py b/scripts/RunTrainingPipeline.py index 2fd0edb..e21b5e4 100644 --- a/scripts/RunTrainingPipeline.py +++ b/scripts/RunTrainingPipeline.py @@ -113,6 +113,7 @@ def sendSlackMessage(channel, message): p.add_argument('-t', '--train-models', dest='train_models', action='store_true', default=False, help='train the 
models using the pipeline structure (default: False)') p.add_argument('-u', '--unlock', dest='unlock', action='store_true', default=False, help='unlock the directory in the event of snakemake failure (default: False)') p.add_argument('-x', '--execute', dest='execute', action='store_true', default=False, help='execute the commands (default: False)') + p.add_argument('-C', '--clean', dest='clean', action='store_true', default=False, help='clean all output files (default: False)') #required main arguments p.add_argument('slids', type=str, help='sample labels (.json, .txt, comma-separated entry)') @@ -163,6 +164,9 @@ def sendSlackMessage(channel, message): buildFrags.append('model_eli5') if somethingToDo: + if args.clean: + buildFrags.append('--delete-all-output') + fullCmd = ' '.join(snakemakeFrags+buildFrags) print('Executing: ') print(fullCmd) diff --git a/scripts/TrainModels.py b/scripts/TrainModels.py index 3ab979e..6b3905d 100644 --- a/scripts/TrainModels.py +++ b/scripts/TrainModels.py @@ -15,6 +15,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.model_selection import LeaveOneGroupOut from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline from sklearn.svm import SVC from sklearn.utils import shuffle from skopt import BayesSearchCV @@ -251,10 +252,35 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, passParams = rangeHyperparams else: passParams = hyperparameters - fullClf, sumRet, sumRocRet = trainClassifier(raw_clf, passParams, final_train_X, final_train_Y, final_train_groups, configuration) + + if ENABLE_FEATURE_SELECTION: + pipeline_clf = Pipeline( + [ + #this is a relatively small and relatively shallow GBC + ("reduce_dim", 'passthrough'), + #now the main classifier + ("classifier", raw_clf) + ] + ) + new_params = { + 'reduce_dim' : FEATURE_SELECTION_MODELS + } + for k in passParams.keys(): + new_key = f'classifier__{k}' + new_params[new_key] = passParams[k] + passParams = new_params + else: + pipeline_clf = raw_clf + + fullClf, sumRet, sumRocRet = trainClassifier(pipeline_clf, passParams, final_train_X, final_train_Y, final_train_groups, configuration) if label == 'GradientBoosting': - print('n_estimators_', fullClf.n_estimators_) + if ENABLE_FEATURE_SELECTION: + print('n_estimators_', fullClf[1].n_estimators_) + else: + print('n_estimators_', fullClf.n_estimators_) + if ENABLE_FEATURE_SELECTION: + print('support_', fullClf[0].support_) #this is the test on the held out test set print('[%s]\tFull testing classifier...' 
% (str(datetime.datetime.now()), )) resultsDict = testClassifier(fullClf, final_train_X, final_train_Y, final_test_X, final_test_Y) diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index b5b5f78..92bfef5 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -2,7 +2,9 @@ from imblearn.ensemble import EasyEnsembleClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble import RandomForestClassifier +from sklearn.feature_selection import RFECV from sklearn.tree import DecisionTreeClassifier from skopt.space import Real, Categorical, Integer from xgboost import XGBClassifier @@ -28,6 +30,25 @@ MANUAL_FS = False MANUAL_FS_LABELS = ['CALL-ADO', 'CALL-AFO'] +#if True, this will pre-pend an automated feature selection step to each pipeline +ENABLE_FEATURE_SELECTION = True +FEATURE_SELECTION_MODELS = [ + RFECV( + estimator=RandomForestClassifier( + random_state=0, class_weight='balanced', n_estimators=50, max_depth=3, max_features='sqrt' + ), + scoring='roc_auc', + cv=5 + ), + RFECV( + estimator=GradientBoostingClassifier( + random_state=0, loss='exponential', n_estimators=50, max_depth=3, max_features='sqrt' + ), + scoring='roc_auc', + cv=5 + ) +] + #if True, mark false positive as true positives and vice versa FLIP_TP = True @@ -53,14 +74,7 @@ ENABLE_EASYENSEMBLE = False #this is an experimental mode in sklearn, it may change rapidly from version to version -ENABLE_HISTGRADIENTBOOST = False -if ENABLE_HISTGRADIENTBOOST: - #make sure we can actually do what we're trying to do - try: - from sklearn.experimental import enable_hist_gradient_boosting - from sklearn.ensemble import HistGradientBoostingClassifier - except: - ENABLE_HISTGRADIENTBOOST = False +ENABLE_HISTGRADIENTBOOST = True #here is where we put what each enable option indicates #now enumerate the models as a tuple ( @@ -83,7 +97,7 @@ { 'random_state' : [0], 'class_weight' : ['balanced'], - 'n_estimators' : [500], #prior tests: 100, 200, 300 + 'n_estimators' : [200], #prior tests: 100, 200, 300, 500 - technically 500 was best, but you really get marginal gains 'max_depth' : [6], #prior tests: 3, 4, 5 'min_samples_split' : [2, 50], 'max_features' : ['sqrt'], @@ -117,10 +131,10 @@ #''' if ENABLE_ADABOOST: + ''' + Generally slower and worse results than other models, disabled after v1. + ''' CLASSIFIERS.append( - ''' - Generally slower and worse results than other models, disabled after v1. 
- ''' #"The most important parameters are base_estimator, n_estimators, and learning_rate" - https://chrisalbon.com/machine_learning/trees_and_forests/adaboost_classifier/ ('AdaBoost', AdaBoostClassifier(random_state=0, algorithm='SAMME.R', learning_rate=1.0, base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=200), { @@ -257,10 +271,10 @@ ('HistGradientBoosting', HistGradientBoostingClassifier(random_state=0), { 'random_state' : [0], - 'learning_rate' : [0.05, 0.1], #high learning rates don't seem to work out very well + 'learning_rate' : [0.05, 0.1, 0.2], #high learning rates don't seem to work out very well 'max_iter' : [1000], - 'max_leaf_nodes' : [31], #don't want to make this too high or it overfits - 'min_samples_leaf' : [200, 2000], #want this to be semi-high to avoid overfitting to a few variants + 'max_leaf_nodes' : [15, 31, 63], #don't want to make this too high or it overfits + 'min_samples_leaf' : [20, 200, 2000], #want this to be semi-high to avoid overfitting to a few variants 'validation_fraction' : [0.1], 'n_iter_no_change' : [20] }, diff --git a/scripts/model_metrics.json b/scripts/model_metrics.json index eaaa381..bc81e53 100644 --- a/scripts/model_metrics.json +++ b/scripts/model_metrics.json @@ -35,18 +35,17 @@ }, "dnascope-1.0-202112.01-PO" : { "CALL" : [ - "AD0", "AD1", "ADO", "AF0", "AF1", "AFO", "GT", "DP", "GQ", "min(PL)" + "AD0", "AD1", "ADO", "AF0", "AF1", + "AFO", "GT", "DP", "GQ", "min(PL)" ], "INFO" : [ - "BaseQRankSum", "DP", "Entropy", "GC", "ML_PROB", "MQ", "MQ0", + "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", + "FS", "GC", "ML_PROB", "MQ", "MQ0", "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", - "SOR", "UDP" + "SOR", "STR", "UDP" ], "MUNGED" : [ - "DP_DP", "QUAL", "NEARBY", "ID" - ], - "IGNORE" : [ - "MUNGED-FILTER", "INFO-ExcessHet", "INFO-STR", "INFO-ClippingRankSum", "INFO-FS" + "DP_DP", "FILTER", "ID", "NEARBY", "QUAL" ] } }, From 72e99f7bb3916f6e652cb3f065e0cd833046b554 Mon Sep 17 00:00:00 2001 From: holtjma Date: Tue, 10 May 2022 08:17:53 -0500 Subject: [PATCH 07/16] final training config set, still need to adjust model selection criteria --- scripts/PipelineConfig.py | 2 +- scripts/TrainingConfig.py | 37 +++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/scripts/PipelineConfig.py b/scripts/PipelineConfig.py index b2e17e6..14a9ca1 100644 --- a/scripts/PipelineConfig.py +++ b/scripts/PipelineConfig.py @@ -47,7 +47,7 @@ ] #pipeline parameters, thread count currently only applies to the actual model training (CV to be precise) -THREADS_PER_PROC = 33 #was 16 in v1 +THREADS_PER_PROC = 48 #was 16 in v1 ############################################################ #Documentation config diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 92bfef5..7469401 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -24,27 +24,27 @@ #if True, only use SUBSET_SIZE variants from each file (mostly for debugging) USE_SUBSET = False -SUBSET_SIZE = 10000 +SUBSET_SIZE = 100000 #if True, manually remove features in MANUAL_FS_LABELS (these are historically unimportant features) MANUAL_FS = False MANUAL_FS_LABELS = ['CALL-ADO', 'CALL-AFO'] #if True, this will pre-pend an automated feature selection step to each pipeline +#WARNING: this can be a time-consuming process, the reduce run-time increase FS_STEP_SIZE and/or FS_MIN_FEATURE_COUNT ENABLE_FEATURE_SELECTION = True +FS_MIN_FEATURE_COUNT = 10 #always keep at least this many features 
+FS_STEP_SIZE = 1 #if an int, the number of features to remove at each step; if a float, the fraction of features to remove at each step FEATURE_SELECTION_MODELS = [ RFECV( - estimator=RandomForestClassifier( - random_state=0, class_weight='balanced', n_estimators=50, max_depth=3, max_features='sqrt' - ), - scoring='roc_auc', - cv=5 - ), - RFECV( + #this is a relatively fast estimator; low tree count, shallow trees, and small sub-sample estimator=GradientBoostingClassifier( - random_state=0, loss='exponential', n_estimators=50, max_depth=3, max_features='sqrt' + random_state=0, loss='exponential', n_estimators=50, max_depth=3, max_features='sqrt', + subsample=0.25 ), - scoring='roc_auc', + scoring='roc_auc', + step=FS_STEP_SIZE, + min_features_to_select=FS_MIN_FEATURE_COUNT, cv=5 ) ] @@ -57,7 +57,7 @@ CLASSIFIERS = [] #Generally slight underperformance, but is a decent backup -ENABLE_RANDOMFOREST = True +ENABLE_RANDOMFOREST = False #performs alright, but always seems to be below GradientBoostingClassifier and slower ENABLE_ADABOOST = False @@ -74,7 +74,7 @@ ENABLE_EASYENSEMBLE = False #this is an experimental mode in sklearn, it may change rapidly from version to version -ENABLE_HISTGRADIENTBOOST = True +ENABLE_HISTGRADIENTBOOST = False #here is where we put what each enable option indicates #now enumerate the models as a tuple ( @@ -101,7 +101,8 @@ 'max_depth' : [6], #prior tests: 3, 4, 5 'min_samples_split' : [2, 50], 'max_features' : ['sqrt'], - 'bootstrap' : [True, False] + 'bootstrap' : [True], + 'max_samples' : [0.25, 0.5] }, { 'random_state' : Categorical([0]), @@ -172,8 +173,8 @@ { 'random_state' : [0], 'n_estimators' : [1000], #prior tests: 100, 200; OBSOLETE: since adding n_iter_no_change, just set to a big number - 'max_depth' : [6], #prior tests: 3, 4 - 'learning_rate' : [0.05, 0.1, 0.5], #prior tests: 0.01, 0.2; from bayes mode, all results were in the 0.04-0.2 range with the occasional "high" rate near 0.5 + 'max_depth' : [3, 6], #prior tests: 3, 4 + 'learning_rate' : [0.05, 0.1, 0.2], #prior tests: 0.01, 0.2; from bayes mode, all results were in the 0.04-0.2 range with the occasional "high" rate near 0.5 - this was very rare though 'loss' : ['exponential'], #prior tests: 'deviance' 'max_features' : ['sqrt'], 'min_samples_split' : [2, 15, 50], #mostly extremes in Bayes most, but adding 15 for middle-ground that was sometimes chosen @@ -271,10 +272,10 @@ ('HistGradientBoosting', HistGradientBoostingClassifier(random_state=0), { 'random_state' : [0], - 'learning_rate' : [0.05, 0.1, 0.2], #high learning rates don't seem to work out very well + 'learning_rate' : [0.05, 0.1], #high learning rates don't seem to work out very well 'max_iter' : [1000], - 'max_leaf_nodes' : [15, 31, 63], #don't want to make this too high or it overfits - 'min_samples_leaf' : [20, 200, 2000], #want this to be semi-high to avoid overfitting to a few variants + 'max_leaf_nodes' : [15, 31], #don't want to make this too high or it overfits + 'min_samples_leaf' : [20, 200], #want this to be semi-high to avoid overfitting to a few variants 'validation_fraction' : [0.1], 'n_iter_no_change' : [20] }, From f3c37fc23be4d69b9cf40a3a32ebf41cbc9b9ca5 Mon Sep 17 00:00:00 2001 From: holtjma Date: Tue, 17 May 2022 08:45:11 -0500 Subject: [PATCH 08/16] changes required to support auto-targeting of global precision --- scripts/EvaluateVariants.py | 53 ++++++++++++++++--- scripts/ExtractELI5Results.py | 13 +++-- scripts/GenerateSupplement.py | 12 +++-- scripts/PrintModelReport.py | 35 ++++++++++--- scripts/TrainModels.py 
| 99 +++++++++++++++++++++++++++++------ scripts/TrainingConfig.py | 18 ++++++- scripts/TrainingPipeline.smk | 7 +-- 7 files changed, 195 insertions(+), 42 deletions(-) diff --git a/scripts/EvaluateVariants.py b/scripts/EvaluateVariants.py index 11d3c4b..e3c576e 100644 --- a/scripts/EvaluateVariants.py +++ b/scripts/EvaluateVariants.py @@ -9,16 +9,42 @@ from ExtractFeatures import ALL_METRICS, getVariantFeatures, GT_TRANSLATE, VAR_TRANSLATE -def getClinicalModel(stats, acceptedRecall, targetRecall): +def getClinicalModel(stats, acceptedRecall, targetRecall, global_precision): ''' @param stats - the stats dictionary from training @param acceptedRecall - the minimum acceptable recall - @param targetRecall - the target we are aiming for - @return - a dictionary with the eval recall, model name, and harmonic mean score + @param targetRecall - the target we are aiming for, must be greater than or equal to accepted + @param global_precision - float value indicating target global precision; if set, + then it will dynamically figure out the target recalls based on the data + @return - a dictionary with the eval recall, model name, and harmonic mean score; more info is added if global_precision is used ''' bestModelTargetRecall = None bestModelName = None bestHM = 0.0 + + if global_precision != None: + #we need to override the accepted and target recall values + #print(stats.keys()) + lookup_key = list(stats.keys())[0] + base_total_tp = stats[lookup_key]['RAW_TOTAL_TP'] + base_total_fp = stats[lookup_key]['RAW_TOTAL_FP'] + base_precision = base_total_tp / (base_total_fp + base_total_tp) + + #figure out how far from the goal we are + delta_precision = global_precision - base_precision + assert(delta_precision > 0) + remainder_precision = 1.0 - base_precision + + #now derive our target recall from that difference + derived_recall = delta_precision / remainder_precision + + #for now, just set both accepted and target to that same value + acceptedRecall = derived_recall + targetRecall = derived_recall + + #check these after we potentially override any passed in values + assert(targetRecall >= acceptedRecall) + for mn in stats.keys(): for tr in stats[mn]['ALL_SUMMARY']: #CM = confusion matrix @@ -40,9 +66,17 @@ def getClinicalModel(stats, acceptedRecall, targetRecall): else: #in clinical, best is harmonic mean of our adjusted recall and our TNR modelTNR = modelCM[0, 0] / (modelCM[0, 0] + modelCM[0, 1]) - adjRecall = (modelRecall - acceptedRecall) / (float(targetRecall) - acceptedRecall) - if adjRecall > 1.0: - adjRecall = 1.0 + if targetRecall == acceptedRecall: + #target and accepted are identical, so this is a binary kill switch that's either 1.0 or 0.0 + if modelRecall >= acceptedRecall: + adjRecall = 1.0 + else: + adjRecall = 0.0 + else: + #otherwise, there's a scale + adjRecall = (modelRecall - acceptedRecall) / (targetRecall - acceptedRecall) + if adjRecall > 1.0: + adjRecall = 1.0 modelHM = 2 * adjRecall * modelTNR / (adjRecall+modelTNR) if modelHM > bestHM: @@ -50,11 +84,16 @@ def getClinicalModel(stats, acceptedRecall, targetRecall): bestModelName = mn bestHM = modelHM - return { + ret = { 'eval_recall' : bestModelTargetRecall, 'model_name' : bestModelName, 'hm_score' : bestHM } + if global_precision != None: + ret['base_precision'] = base_precision + ret['derived_recall'] = derived_recall + + return ret def evaluateVariants(args): ''' diff --git a/scripts/ExtractELI5Results.py b/scripts/ExtractELI5Results.py index 248d482..983d10e 100644 --- a/scripts/ExtractELI5Results.py +++ 
b/scripts/ExtractELI5Results.py @@ -8,13 +8,14 @@ from EvaluateVariants import getClinicalModel from ExtractFeatures import GT_TRANSLATE, VAR_TRANSLATE -def gatherEli5Stats(modelDir, minRecall, targetRecall): +def gatherEli5Stats(modelDir, minRecall, targetRecall, global_precision): ''' This will gather our clinical model statistics in an ingestible format @param modelDir - the model directory @param rocDir - a directory for output ROC curve images @param minRecall - the minimum recall values allowed @param targetRecall - the target recall we want the models to achieve + @param global_precision - if set, this will override other recall values ''' #read in the stats jsonFN = '%s/stats.json' % (modelDir, ) @@ -29,14 +30,15 @@ def gatherEli5Stats(modelDir, minRecall, targetRecall): fp.close() #get the stats now and send 'em on back - return gatherClinicalModelStats(stats, minRecall, targetRecall, models) + return gatherClinicalModelStats(stats, minRecall, targetRecall, global_precision, models) -def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): +def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, global_precision, allModels): ''' This will run eli5 if it can and store the results in a dictionary @param allStats - the full stats dict (all models) @param acceptedRecall - the minimum recall we need @param targetRecall - the target recall we want + @param global_precision - if set, this will override the recall values @param allModels - the actual loaded models @return - a dictionary where keys are the variant/genotype and value has the model name and eli5 outputs ''' @@ -46,7 +48,7 @@ def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): reformKey = VAR_TRANSLATE[int(k.split('_')[0])]+'_'+GT_TRANSLATE[int(k.split('_')[1])] #get the clinical model - clinicalModelDict = getClinicalModel(stats, acceptedRecall, targetRecall) + clinicalModelDict = getClinicalModel(stats, acceptedRecall, targetRecall, global_precision) bestModelName = clinicalModelDict['model_name'] if bestModelName == None: @@ -77,6 +79,7 @@ def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): #optional arguments with default p.add_argument('-m', '--min-recall', dest='min_recall', default=0.99, type=float, help='the minimum recall for clinical applications (default: 0.990)') p.add_argument('-t', '--target-recall', dest='target_recall', default=0.995, type=float, help='the target recall for clinical applications (default: 0.995)') + p.add_argument('-g', '--global-precision', dest='global_precision', default=None, type=float, help='the global precision target; if set, override min/target recalls (default: None)') #required main arguments p.add_argument('model_directory', type=str, help='directory with models and model stats') @@ -86,7 +89,7 @@ def gatherClinicalModelStats(allStats, acceptedRecall, targetRecall, allModels): args = p.parse_args() #gather the stats - finalStats = gatherEli5Stats(args.model_directory, args.min_recall, args.target_recall) + finalStats = gatherEli5Stats(args.model_directory, args.min_recall, args.target_recall, args.global_precision) fp = open(args.out_json, 'w+') json.dump(finalStats, fp, indent=4, sort_keys=True) diff --git a/scripts/GenerateSupplement.py b/scripts/GenerateSupplement.py index c5611fe..fc967f9 100644 --- a/scripts/GenerateSupplement.py +++ b/scripts/GenerateSupplement.py @@ -199,10 +199,14 @@ def getTrainingResults(pipeline, reference, aligner, caller, strict=False): if 
l.startswith('[clinical_model'): pieces = l.rstrip()[1:-1].split(' ') assert(pieces[0] == 'clinical_model') - assert(pieces[1].startswith('min=')) - assert(pieces[2].startswith('tar=')) - clinicalMinimum = float(pieces[1][4:]) - clinicalTarget = float(pieces[2][4:]) + if pieces[1].startswith('min=') and pieces[2].startswith('tar='): + clinicalMinimum = float(pieces[1][4:]) + clinicalTarget = float(pieces[2][4:]) + elif pieces[1].startswith('target_global_precision='): + clinicalMinimum = 'dynamic' + clinicalTarget = 'dynamic' + else: + raise Exception('unknown clinical_model format') break elif l.startswith('['): vt, tpcount, fpcount = l.rstrip()[1:-1].split(' ') diff --git a/scripts/PrintModelReport.py b/scripts/PrintModelReport.py index e8d4b2a..ada64a1 100644 --- a/scripts/PrintModelReport.py +++ b/scripts/PrintModelReport.py @@ -11,13 +11,15 @@ from EvaluateVariants import getClinicalModel from ExtractFeatures import GT_TRANSLATE, VAR_TRANSLATE -def printAllStats(modelDir, rocDir, minRecall, targetRecall): +def printAllStats(modelDir, rocDir, minRecall, targetRecall, global_precision): ''' This will print out our model statistics in an ingestible format @param modelDir - the model directory @param rocDir - a directory for output ROC curve images @param minRecall - the minimum recall values allowed @param targetRecall - the target recall we want the models to achieve + @param global_precision - float value indicating target global precision; if set, + then it will dynamically figure out the target recalls based on the data ''' #read in the stats jsonFN = '%s/stats.json' % (modelDir, ) @@ -48,7 +50,7 @@ def printAllStats(modelDir, rocDir, minRecall, targetRecall): if rocDir != None: createRocImage(reformKey, rocs[k], rocDir) - imageDict = printClinicalModels(stats, minRecall, targetRecall, models) + imageDict = printClinicalModels(stats, minRecall, targetRecall, models, global_precision) if rocDir != None: createTrainingImage(imageDict, rocDir) @@ -142,24 +144,33 @@ def createTrainingImage(imageDict, rocDir): plt.savefig(outFN) plt.close() -def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels): +def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels, global_precision): header = [ 'variant_type', 'best_model', 'model_eval', 'CV_recall', 'final_recall', 'CV_FPR', 'final_FPR' ] - print('[clinical_model min=%0.4f tar=%0.4f]' % (acceptedRecall, targetRecall)) + if global_precision == None: + print('[clinical_model min=%0.4f tar=%0.4f]' % (acceptedRecall, targetRecall)) + else: + print('[clinical_model target_global_precision=%0.4f]' % (global_precision, )) + header += ['target_recall', 'global_prec'] print(*header, sep='\t') imageDict = {} for k in sorted(allStats.keys()): stats = allStats[k] reformKey = VAR_TRANSLATE[int(k.split('_')[0])]+'_'+GT_TRANSLATE[int(k.split('_')[1])] - clinDict = getClinicalModel(stats, acceptedRecall, targetRecall) + clinDict = getClinicalModel(stats, acceptedRecall, targetRecall, global_precision) bestModelName = clinDict['model_name'] if bestModelName == None: #this is the unfortunate event that NO model passes - print(reformKey, 'None', 'None', '--', '--', '--', '--', sep='\t') + rowVals = [ + reformKey, 'None', 'None', '--', '--', '--', '--' + ] + if global_precision: + rowVals += ['--', '--'] + print(*rowVals, sep='\t') else: bestModelTargetRecall = clinDict['eval_recall'] @@ -182,6 +193,15 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels): 
#str(stats[bestModelName]['LEAVEONEOUT_SUMMARY'][bestModelTargetRecall]['TEST_TPR']), #str(stats[bestModelName]['LEAVEONEOUT_SUMMARY'][bestModelTargetRecall]['TEST_FPR']) ] + if global_precision: + base_precision = clinDict['base_precision'] + derived_recall = clinDict['derived_recall'] + calculated_precision = base_precision + (1 - base_precision) * bestTPR + rowVals += [ + '%0.4f' % (derived_recall, ), + '%0.6f' % (calculated_precision, ) + ] + print(*rowVals, sep='\t') imageDict[reformKey] = ( @@ -203,6 +223,7 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels): p.add_argument('-r', '--roc-dir', dest='roc_dir', default=None, help='a directory to store ROC images (default: None)') p.add_argument('-m', '--min-recall', dest='min_recall', default=0.99, type=float, help='the minimum recall for clinical applications (default: 0.990)') p.add_argument('-t', '--target-recall', dest='target_recall', default=0.995, type=float, help='the target recall for clinical applications (default: 0.995)') + p.add_argument('-g', '--global-precision', dest='global_precision', default=None, type=float, help='the global precision target; if set, override min/target recalls (default: None)') #required main arguments p.add_argument('model_directory', type=str, help='directory with models and model stats') @@ -210,4 +231,4 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels): #parse the arguments args = p.parse_args() - printAllStats(args.model_directory, args.roc_dir, args.min_recall, args.target_recall) \ No newline at end of file + printAllStats(args.model_directory, args.roc_dir, args.min_recall, args.target_recall, args.global_precision) \ No newline at end of file diff --git a/scripts/TrainModels.py b/scripts/TrainModels.py index 6b3905d..83900e2 100644 --- a/scripts/TrainModels.py +++ b/scripts/TrainModels.py @@ -57,6 +57,7 @@ def trainModels(featureDir, slids, outPrefix, splitByType, numProcs): fieldsList = json.load(fp) fp.close() + print(f'[{datetime.datetime.now()}] Full sample list: {samples}') #get data from each sample for sample in samples: #TP first @@ -78,7 +79,7 @@ def trainModels(featureDir, slids, outPrefix, splitByType, numProcs): assert(fpFields == fieldsList) fpVar = np.load(fpFN, 'r') fpList.append(fpVar) - + results = {} trainedModelDict = {} rocDict = {} @@ -121,7 +122,7 @@ def trainModels(featureDir, slids, outPrefix, splitByType, numProcs): fp = open(rocFN, 'w+') json.dump(rocDict, fp, indent=4, sort_keys=True) fp.close() - + print('[%s] All models finished training!' 
% (str(datetime.datetime.now()), )) def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, callType, numProcs): @@ -174,6 +175,10 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, train_groups = [] test_list_X = [] test_list_Y = [] + + #we will need the totals if we auto-calculate the target recalls + raw_total_tp = 0 + raw_total_fp = 0 for i in range(0, len(raw_tpList)): tpVals = raw_tpList[i] @@ -193,6 +198,10 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, tpVals = np.delete(tpVals, removedIndices, 1) fpVals = np.delete(fpVals, removedIndices, 1) + #store the total number we found before any sort of subsetting; this is used to derive overall accuracies if auto-enabled + raw_total_tp += tpVals.shape[0] + raw_total_fp += fpVals.shape[0] + #we aren't doing a full test, so cut the input sizes down if USE_SUBSET: tpVals = tpVals[:SUBSET_SIZE] @@ -242,6 +251,34 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, #TODO: future possible optimization - RandomForest and EasyEnsemble have a "n_jobs" parameters for parallel # fit/predict; is there a way we can utilize that in general? + + print('Raw Total TP, FP:', raw_total_tp, raw_total_fp) + total_obs = raw_total_fp + raw_total_tp + raw_precision = raw_total_tp / total_obs + print('Raw precision:', raw_precision) + + if ENABLE_AUTO_TARGET: + #we need to derive what the recall targets should be + missed_precision = GLOBAL_AUTO_TARGET_PRECISION - raw_precision + + if missed_precision < 0: + #TODO: what should we do here generally? this isn't currently an issue + raise Exception('raw precision is already better than global target, this is currently unhandled') + + total_gap = 1.0 - raw_precision + lower_target_recall = missed_precision / total_gap + delta_gap = (1.0 - lower_target_recall) / AUTO_TARGET_BREAKPOINT_COUNT + + auto_targets = [lower_target_recall + i*delta_gap for i in range(0, AUTO_TARGET_BREAKPOINT_COUNT)] + auto_targets += [0.9999, 1.0000] + + #now reverse sort them + recall_targets = np.array(sorted(auto_targets)[::-1]) + + else: + #manual targetting only + recall_targets = MANUAL_TARGETS + print('Recall targets: ', recall_targets) #go through each model, one at a time for (label, raw_clf, hyperparameters, rangeHyperparams) in CLASSIFIERS: @@ -272,7 +309,10 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, else: pipeline_clf = raw_clf - fullClf, sumRet, sumRocRet = trainClassifier(pipeline_clf, passParams, final_train_X, final_train_Y, final_train_groups, configuration) + fullClf, sumRet, sumRocRet = trainClassifier( + pipeline_clf, passParams, final_train_X, final_train_Y, final_train_groups, + configuration, recall_targets, raw_precision + ) if label == 'GradientBoosting': if ENABLE_FEATURE_SELECTION: @@ -286,11 +326,13 @@ def trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, resultsDict = testClassifier(fullClf, final_train_X, final_train_Y, final_test_X, final_test_Y) #get results and store everything in the dictionary locations below - allRet, allRocRet = summarizeResults([resultsDict], [final_test_Y]) + allRet, allRocRet = summarizeResults([resultsDict], [final_test_Y], recall_targets, raw_precision) ret[label] = { 'LEAVEONEOUT_SUMMARY' : sumRet, - 'ALL_SUMMARY' : allRet + 'ALL_SUMMARY' : allRet, + 'RAW_TOTAL_TP' : raw_total_tp, + 'RAW_TOTAL_FP' : raw_total_fp } modelRet[label] = { 'FEATURES' : featureLabels, @@ -306,7 +348,7 @@ def 
trainAllClassifiers(raw_tpList, raw_fpList, raw_featureLabels, variantType, return ret, modelRet, rocRet -def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, configuration): +def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, configuration, recall_targets, base_precision): ''' This will run a classifier for us @param raw_clf - a classifier instance @@ -315,6 +357,8 @@ def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, co @param train_Y - the classifier values expected @param train_groups - the groups of input values @param configuration - a dictionary containing information on how to do the training + @param recall_targets - the recall targets to evaluate results + @param raw_precision - the base precision prior to any training @return - a trained classifier ''' print('[%s]\tFull training classifier...' % (str(datetime.datetime.now()), )) @@ -358,7 +402,7 @@ def trainClassifier(raw_clf, hyperparameters, train_X, train_Y, train_groups, co test_Ys.append(train_Y[test_ind]) print('[%s]\tGenerating CV summary...'% (str(datetime.datetime.now()), )) - sumRet, sumRocRet = summarizeResults(results, test_Ys) + sumRet, sumRocRet = summarizeResults(results, test_Ys, recall_targets, base_precision) else: raise Exception('Unexpected mode') @@ -379,6 +423,11 @@ def testClassifier(clf, train_X, train_Y, test_X, test_Y): @param test_X - the training features @param test_Y - the classifier values expected @return - a dictionary of many results + TEST_PRED - the probability of being in class 1 (i.e. predicted false call) on the test data + TEST_ROC - tuple (false_positive rate, true_positive_rate, thresholds) from the roc_curve function on the test data + TEST_ROC_AUC - ROC-AUC from auc(roc_curve(...)) + TRAIN_PRED - the probability of being class 1 on the training data + TRAIN_ROC - tuple (false_positive rate, true_positive_rate, thresholds) from the roc_curve function on the training data ''' ret = {} @@ -399,15 +448,18 @@ def testClassifier(clf, train_X, train_Y, test_X, test_Y): return ret -def summarizeResults(results, test_Ys): +def summarizeResults(results, test_Ys, recallValues, base_precision): ''' This will summarize results for us across multiple leave-one-out runs - @param results - a list of results dictionaries for a classifier - @return - TODO + @param results - a list of results dictionaries for a classifier; this is basically a list of results from testClassifier(...) + @param test_Ys - a list of lists of correct Y-values (e.g. categories) + @param recallValues - the recall values to evaluate the models at + @param base_precision - the base precision of the caller without any ML + @return - tuple (stats, rocs) + stats - a dict where key is a recall value and value is a dictionary of stats for the model with that recall including test/train TPR, FPR, etc. + rocs - a list of test ROCs, one for each in results ''' print('[%s]\tRunning threshold tests...' 
% (str(datetime.datetime.now()), )) - recallValues = np.array([1.0, 0.9999, 0.999, 0.998, 0.997, 0.996, 0.995, 0.99]) - #TODO: if we want to do confidence intervals on errors, look into it here: (For now, I think CV is fine) #https://machinelearningmastery.com/report-classifier-performance-confidence-intervals/ #note for future matt, if you get 0 errors out of N classifications, we would need to assume the NEXT @@ -418,8 +470,13 @@ def summarizeResults(results, test_Ys): # 99% confidence in that interval ret = {} - print('', 'tarTPR', 'train_FPR', 'train_TPR', 'test_FPR', 'test_TPR', 'adjConf', sep='\t') + if ENABLE_AUTO_TARGET: + print('', 'tarTPR', 'train_FPR', 'train_TPR', 'test_FPR', 'test_TPR', 'global_prec', 'adjConf', sep='\t') + else: + print('', 'tarTPR', 'train_FPR', 'train_TPR', 'test_FPR', 'test_TPR', 'adjConf', sep='\t') for minRecall in recallValues: + #for each specified recall level, we want to calculate how well the training/testing does for a bunch of stats: + # training FPR, training TPR, training thresholds, testing FPR, testing TPR, and testing confusion matrix (CM) tarTPR = minRecall trainFprList = [] trainTprList = [] @@ -428,12 +485,16 @@ def summarizeResults(results, test_Ys): testCMList = [] testFprList = [] testTprList = [] - + + #for CV, we will have multiple training/testing sets, so we need to gather them all for mean/std later + #this function is also used for final training, which will only have one set in `results` (i.e. N=1, st. dev.=0) for i, resultsDict in enumerate(results): + #get the training ROC values train_false_positive_rate, train_true_positive_rate, train_thresholds = resultsDict['TRAIN_ROC'] test_Y = test_Ys[i] #first, find the point in the training values that matches our recall requirement + #this will allow us to pick the threshold value matching that recall req. ind = bisect.bisect_left(train_true_positive_rate, minRecall) while train_true_positive_rate[ind] < minRecall: ind += 1 @@ -461,11 +522,19 @@ def summarizeResults(results, test_Ys): ('%0.4f+-%0.4f', (np.mean(trainFprList), np.std(trainFprList))), ('%0.4f+-%0.4f', (np.mean(trainTprList), np.std(trainTprList))), ('%0.4f+-%0.4f', (np.mean(testFprList), np.std(testFprList))), - ('%0.4f+-%0.4f', (np.mean(testTprList), np.std(testTprList))), + ('%0.4f+-%0.4f', (np.mean(testTprList), np.std(testTprList))) + ] + if ENABLE_AUTO_TARGET: + recall = np.mean(testTprList) + printVals += [ + ('%0.6f', base_precision + (1-base_precision) * recall) + ] + printVals += [ ('%s', str(sum(testCMList)).replace('\n', '')), ] print('\t'.join([t[0] % t[1] for t in printVals])) + #save the lists in our output tied to this recall value ret[minRecall] = { 'TRAIN_FPR' : trainFprList, 'TRAIN_TPR' : trainTprList, diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 7469401..5b75a0a 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -1,5 +1,6 @@ from imblearn.ensemble import EasyEnsembleClassifier +import numpy as np from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import HistGradientBoostingClassifier @@ -26,6 +27,21 @@ USE_SUBSET = False SUBSET_SIZE = 100000 +#if True, we will generate an array of targets from the data +ENABLE_AUTO_TARGET = True + +#if auto-targets is enabled, this is the total target precision that is used for deriving the auto-target +# e.g. if based precision is 0.999, then the model needs to recover 0.0009 of the 0.001 that is not captured +# e.g. 
90% derived target recall for the models +GLOBAL_AUTO_TARGET_PRECISION = 0.9999 +AUTO_TARGET_BREAKPOINT_COUNT = 10 #the number of point inbetween the target and 1.00 to calculate, must be > 0 +AUTO_EXTRA_TARGETS = [0.9999, 1.0000] #these are always added + +#if the auto-targets is disabled, these will be used as targets instead +MANUAL_TARGETS = np.array([ + 1.0, 0.9999, 0.999, 0.998, 0.997, 0.996, 0.995, 0.99 +]) + #if True, manually remove features in MANUAL_FS_LABELS (these are historically unimportant features) MANUAL_FS = False MANUAL_FS_LABELS = ['CALL-ADO', 'CALL-AFO'] @@ -34,7 +50,7 @@ #WARNING: this can be a time-consuming process, the reduce run-time increase FS_STEP_SIZE and/or FS_MIN_FEATURE_COUNT ENABLE_FEATURE_SELECTION = True FS_MIN_FEATURE_COUNT = 10 #always keep at least this many features -FS_STEP_SIZE = 1 #if an int, the number of features to remove at each step; if a float, the fraction of features to remove at each step +FS_STEP_SIZE = 0.1 #if an int, the number of features to remove at each step; if a float, the fraction of features to remove at each step FEATURE_SELECTION_MODELS = [ RFECV( #this is a relatively fast estimator; low tree count, shallow trees, and small sub-sample diff --git a/scripts/TrainingPipeline.smk b/scripts/TrainingPipeline.smk index af1c621..d439be7 100644 --- a/scripts/TrainingPipeline.smk +++ b/scripts/TrainingPipeline.smk @@ -219,7 +219,8 @@ rule SummarizeModels: images=directory("{pipeline_dir}/model_summaries/{reference}/{aligner}/{caller}/roc_curves") params: script=MODEL_REPORT_SCRIPT, - prefix="{pipeline_dir}/trained_models/{reference}/{aligner}/{caller}" + prefix="{pipeline_dir}/trained_models/{reference}/{aligner}/{caller}", + global_precision="0.9999" #conda: # CONDA_ENV log: "{pipeline_dir}/logs/model_summaries/{reference}/{aligner}/{caller}.log" @@ -228,6 +229,7 @@ rule SummarizeModels: ''' python3 -u {params.script} \ -r {output.images} \ + -g {params.global_precision} \ {params.prefix} > \ {output.summary} ''' @@ -272,8 +274,7 @@ rule ModelEli5: shell: ''' python3 -u {params.script} \ - -m 0.99 \ - -t 0.995 \ + -g 0.9999 \ {params.prefix} \ {output.eli5} ''' From 8d29be77090d1b3d081f1edda7dcbbc62f0584bd Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 25 May 2022 08:42:28 -0500 Subject: [PATCH 09/16] fixing global precision to previous value --- scripts/PrintModelReport.py | 8 ++++---- scripts/TrainingConfig.py | 2 +- scripts/TrainingPipeline.smk | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/PrintModelReport.py b/scripts/PrintModelReport.py index ada64a1..917ce51 100644 --- a/scripts/PrintModelReport.py +++ b/scripts/PrintModelReport.py @@ -73,10 +73,10 @@ def printModelStats(modelType, stats): 'eval_recall' ] for e in evalList: - header.append(e) - header.append('') - header.append('') - header.append('') + header.append(f'{e}-recall') + header.append('CV_recall') + header.append('FPR') + header.append('CV_FPR') print(*header, sep='\t') for targetRecall in availableRecalls: rowVals = ['%0.4f' % (float(targetRecall), )] diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 5b75a0a..b19df4b 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -33,7 +33,7 @@ #if auto-targets is enabled, this is the total target precision that is used for deriving the auto-target # e.g. if based precision is 0.999, then the model needs to recover 0.0009 of the 0.001 that is not captured # e.g. 
90% derived target recall for the models -GLOBAL_AUTO_TARGET_PRECISION = 0.9999 +GLOBAL_AUTO_TARGET_PRECISION = 0.99996 AUTO_TARGET_BREAKPOINT_COUNT = 10 #the number of point inbetween the target and 1.00 to calculate, must be > 0 AUTO_EXTRA_TARGETS = [0.9999, 1.0000] #these are always added diff --git a/scripts/TrainingPipeline.smk b/scripts/TrainingPipeline.smk index d439be7..5bfe071 100644 --- a/scripts/TrainingPipeline.smk +++ b/scripts/TrainingPipeline.smk @@ -3,6 +3,7 @@ import os from RunTrainingPipeline import parseSlids from PipelineConfig import * +from TrainingConfig import GLOBAL_AUTO_TARGET_PRECISION #derived from repo PIPELINE_DIRECTORY = '%s/pipeline_pcrfree' % REPO_DIRECTORY @@ -220,7 +221,7 @@ rule SummarizeModels: params: script=MODEL_REPORT_SCRIPT, prefix="{pipeline_dir}/trained_models/{reference}/{aligner}/{caller}", - global_precision="0.9999" + global_precision=GLOBAL_AUTO_TARGET_PRECISION #conda: # CONDA_ENV log: "{pipeline_dir}/logs/model_summaries/{reference}/{aligner}/{caller}.log" @@ -266,6 +267,7 @@ rule ModelEli5: eli5="{pipeline_dir}/eli5_summaries/{reference}/{aligner}/{caller}/model_eli5.json" params: script=MODEL_ELI5_SCRIPT, + global_precision=GLOBAL_AUTO_TARGET_PRECISION, prefix="{pipeline_dir}/trained_models/{reference}/{aligner}/{caller}" #conda: # CONDA_ENV @@ -274,7 +276,7 @@ rule ModelEli5: shell: ''' python3 -u {params.script} \ - -g 0.9999 \ + -g {params.global_precision} \ {params.prefix} \ {output.eli5} ''' From 8f70cfc16d566aeee87a3bb3a393edbe6af4f1c2 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 1 Jun 2022 09:54:46 -0500 Subject: [PATCH 10/16] adding RF to recover indel hets --- scripts/TrainingConfig.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index b19df4b..9ce0fbe 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -73,7 +73,7 @@ CLASSIFIERS = [] #Generally slight underperformance, but is a decent backup -ENABLE_RANDOMFOREST = False +ENABLE_RANDOMFOREST = True #performs alright, but always seems to be below GradientBoostingClassifier and slower ENABLE_ADABOOST = False @@ -114,7 +114,7 @@ 'random_state' : [0], 'class_weight' : ['balanced'], 'n_estimators' : [200], #prior tests: 100, 200, 300, 500 - technically 500 was best, but you really get marginal gains - 'max_depth' : [6], #prior tests: 3, 4, 5 + 'max_depth' : [3, 6], #prior tests: 3, 4, 5 'min_samples_split' : [2, 50], 'max_features' : ['sqrt'], 'bootstrap' : [True], From 44dd3b60e498cc160ebbbc959d3cfa47d2094525 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 1 Jun 2022 13:57:17 -0500 Subject: [PATCH 11/16] updating to work with cyvcf2 and codi outputs --- scripts/EvaluateVariants.py | 92 ++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/scripts/EvaluateVariants.py b/scripts/EvaluateVariants.py index e3c576e..1beae67 100644 --- a/scripts/EvaluateVariants.py +++ b/scripts/EvaluateVariants.py @@ -1,5 +1,6 @@ import argparse as ap +import cyvcf2 import csv import json import numpy as np @@ -8,6 +9,7 @@ #import vcf from ExtractFeatures import ALL_METRICS, getVariantFeatures, GT_TRANSLATE, VAR_TRANSLATE +from TrainingConfig import ENABLE_AUTO_TARGET, GLOBAL_AUTO_TARGET_PRECISION def getClinicalModel(stats, acceptedRecall, targetRecall, global_precision): ''' @@ -166,7 +168,7 @@ def runSubType(variantType, args, stats, models, statKey): #make sure our recall is in the list availableRecalls = 
stats[list(stats.keys())[0]]['ALL_SUMMARY'].keys() - if targetRecall not in availableRecalls: + if targetRecall not in availableRecalls and not ENABLE_AUTO_TARGET: raise Exception('Invalid target recall, available options are: %s' % (availableRecalls, )) #figure out which models we will actually be using @@ -191,17 +193,25 @@ def runSubType(variantType, args, stats, models, statKey): evalList = [bestModelName] elif modelName == 'clinical': - #TODO: make this a script parameter instead of hard-coding - #this contains minimum acceptable recals for a given target - targetThresholds = { - '0.995' : 0.99 - } - if targetRecall not in targetThresholds: - raise Exception('"clinical" mode has no defined threshold for target recall "%s"' % targetRecall) - acceptedRecall = targetThresholds[targetRecall] + if ENABLE_AUTO_TARGET: + global_precision = GLOBAL_AUTO_TARGET_PRECISION + + #dummy values + acceptedRecall = 0.0 + targetRecall = 0.0 + else: + global_precision = None + #TODO: make this a script parameter instead of hard-coding + #this contains minimum acceptable recals for a given target + targetThresholds = { + '0.995' : 0.99 + } + if targetRecall not in targetThresholds: + raise Exception('"clinical" mode has no defined threshold for target recall "%s"' % targetRecall) + acceptedRecall = targetThresholds[targetRecall] #get the clinical model - clinicalModelDict = getClinicalModel(stats, acceptedRecall, targetRecall) + clinicalModelDict = getClinicalModel(stats, acceptedRecall, targetRecall, global_precision) bestModelName = clinicalModelDict['model_name'] bestModelTargetRecall = clinicalModelDict['eval_recall'] @@ -252,10 +262,11 @@ def runSubType(variantType, args, stats, models, statKey): allVariants += loadCodicemVariants(args.codicem) #now load the VCF file - vcfReader = vcf.Reader(filename=args.sample_vcf, compressed=True) - rawReader = vcf.Reader(filename=args.sample_vcf, compressed=True) - assert(len(vcfReader.samples) == 1) - chromList = vcfReader.contigs.keys() + vcf_reader = cyvcf2.VCF(args.sample_vcf) + raw_reader = cyvcf2.VCF(args.sample_vcf) + assert(len(vcf_reader.samples) == 1) + sample_index = 0 + chrom_list = vcf_reader.seqnames #go through each variant and extract the features into a shared set varIndex = [] @@ -263,12 +274,12 @@ def runSubType(variantType, args, stats, models, statKey): rawGT = [] varFeatures = [] for i, (chrom, start, end, ref, alt) in enumerate(allVariants): - if (chrom not in chromList and - 'chr'+chrom in chromList): + if (chrom not in chrom_list and + 'chr'+chrom in chrom_list): chrom = 'chr'+chrom - if chrom in chromList: - variantList = [variant for variant in vcfReader.fetch(chrom, start, end)] + if chrom in chrom_list: + variantList = [variant for variant in vcf_reader(f'{chrom}:{start}-{end}')] else: print('WARNING: Chromosome "%s" not found' % (chrom, )) variantList = [] @@ -279,7 +290,7 @@ def runSubType(variantType, args, stats, models, statKey): #now go through each variant and pull out the features for it for variant in variantList: - featureVals = getVariantFeatures(variant, vcfReader.samples[0], fields, rawReader, allowHomRef=True) + featureVals = getVariantFeatures(variant, sample_index, fields, raw_reader, allowHomRef=True) varFeatures.append(featureVals) rawGT.append(featureVals[gtIndex]) @@ -322,7 +333,8 @@ def runSubType(variantType, args, stats, models, statKey): #if it's not found, we DON'T put it in the dictionary list else: for ind in foundVarIndices: - vals = [chrom, start, end, ref, alt, rawVariants[ind], rawGT[ind]] + 
raw_variant_str = generateCyvcf2RecordStr(rawVariants[ind]) + vals = [chrom, start, end, ref, alt, raw_variant_str, rawGT[ind]] modelResultDict = {} resultsFound = False for mn in evalList: @@ -346,7 +358,7 @@ def runSubType(variantType, args, stats, models, statKey): 'end' : end, 'ref' : ref, 'alt' : alt, - 'call_variant' : rawVariants[ind], + 'call_variant' : raw_variant_str, 'call_gt' : rawGT[ind], 'predictions' : modelResultDict } @@ -386,10 +398,11 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): allVariants += loadCodicemVariants(args.codicem) #now load the VCF file - vcfReader = vcf.Reader(filename=args.sample_vcf, compressed=True) - rawReader = vcf.Reader(filename=args.sample_vcf, compressed=True) - assert(len(vcfReader.samples) == 1) - chromList = vcfReader.contigs.keys() + vcf_reader = cyvcf2.VCF(args.sample_vcf) + raw_reader = cyvcf2.VCF(args.sample_vcf) + assert(len(vcf_reader.samples) == 1) + sample_index = 0 + chrom_list = vcf_reader.seqnames #go through each variant and extract the features into a shared set varIndex = [] @@ -397,12 +410,12 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): rawGT = [] varFeatures = [] for i, (chrom, start, end, ref, alt) in enumerate(allVariants): - if (chrom not in chromList and - 'chr'+chrom in chromList): + if (chrom not in chrom_list and + 'chr'+chrom in chrom_list): chrom = 'chr'+chrom - if chrom in chromList: - variantList = [variant for variant in vcfReader.fetch(chrom, start, end)] + if chrom in chrom_list: + variantList = [variant for variant in vcf_reader(f'{chrom}:{start}-{end}')] else: print('WARNING: Chromosome "%s" not found' % (chrom, )) variantList = [] @@ -413,7 +426,7 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): #now go through each variant and pull out the features for it for variant in variantList: - featureVals = getVariantFeatures(variant, vcfReader.samples[0], fields, rawReader, allowHomRef=True) + featureVals = getVariantFeatures(variant, sample_index, fields, raw_reader, allowHomRef=True) varFeatures.append(featureVals) rawGT.append(featureVals[gtIndex]) @@ -436,7 +449,8 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): valList.append(vals) else: for ind in foundVarIndices: - vals = [chrom, start, end, ref, alt, rawVariants[ind], rawGT[ind]] + raw_variant_str = generateCyvcf2RecordStr(rawVariants[ind]) + vals = [chrom, start, end, ref, alt, raw_variant_str, rawGT[ind]] modelResultDict = {} resultsFound = False if filtersEnabled and (acceptedVT != varFeatures[ind][0] or @@ -459,7 +473,7 @@ def runReferenceCalls(variantType, args, acceptedVT, acceptedGT): 'end' : end, 'ref' : ref, 'alt' : alt, - 'call_variant' : rawVariants[ind], + 'call_variant' : raw_variant_str, 'call_gt' : rawGT[ind], 'predictions' : modelResultDict } @@ -519,6 +533,18 @@ def parseCLIVariants(csVar): ret.append(var) return ret +def generateCyvcf2RecordStr(variant): + ''' + Simple cyvcf2 variant reformatter, makes it match something similar to vcf + @param variant - a variant from cyvcf2 + @return - a string formatted as a "record" + ''' + chrom = variant.CHROM + pos = variant.POS + ref = variant.REF + alts = variant.ALT + return f'cyvcf2.record(CHROM={chrom}, POS={pos}, REF={ref}, ALT={alts})' + def loadCodicemVariants(csvFN): ''' Loads a Codicem sanger CSV and finds variants that need confirmation @@ -559,4 +585,4 @@ def loadCodicemVariants(csvFN): #parse the arguments args = p.parse_args() - evaluateVariants(args) \ No newline at end of file + evaluateVariants(args) 
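For readers less familiar with cyvcf2, the access pattern introduced in this patch (a region query on the reader plus per-sample `genotypes` and `format` lookups) can be reduced to a short, self-contained sketch. The file path, coordinates, and variable names below are illustrative placeholders, not repository data:

#Minimal sketch of the cyvcf2 access pattern used in EvaluateVariants.py
#(illustrative only; the path and coordinates are placeholders)
import cyvcf2

vcf_reader = cyvcf2.VCF('sample.vcf.gz')
assert len(vcf_reader.samples) == 1
sample_index = 0

chrom, start, end = '1', 1000000, 1001000
#mirror the chr-prefix fallback used above
if chrom not in vcf_reader.seqnames and 'chr' + chrom in vcf_reader.seqnames:
    chrom = 'chr' + chrom

for variant in vcf_reader(f'{chrom}:{start}-{end}'):
    gt = variant.genotypes[sample_index]   #e.g. [0, 1, False] -> two allele indices plus a phasing flag
    ad = variant.format('AD')              #per-sample allele depths, or None if AD is absent
    ad = list(ad[sample_index]) if ad is not None else None
    print(variant.CHROM, variant.POS, variant.REF, variant.ALT, gt, ad)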
From 0c409792cf0bee296183fcf4ba08a9d462325101 Mon Sep 17 00:00:00 2001
From: holtjma
Date: Thu, 2 Jun 2022 16:25:23 -0500
Subject: [PATCH 12/16] README updates

---
 README.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 002409e..4ba7b20 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,12 @@ We have currently configured the pipeline to run using an LSF cluster. If a dif
 2. If running locally, configuration changes to the snakemake command itself may be necessary. These are located in variable `snakemakeFrags`, contact the repo owner or submit a ticket for assistance.
 
 ### Training Configuration (Optional)
-There are several options available that adjust way training of models is performed (e.g. models used, hyperparameters, training method, etc.). These options are available in `TrainingConfig.py`. Details on each will be in a future release.
+There are several options available that adjust the way training of models is performed (e.g. models used, hyperparameters, training method, etc.).
+These options are available in `TrainingConfig.py` and are generally described in-line with each parameter.
+However, some are of critical importance and should be considered before training:
+
+1. `ENABLE_AUTO_TARGET` - If set to `True`, this option enables an alternate method for determining the target recall: it is calculated automatically from the observed precision and the desired global precision (another parameter, `GLOBAL_AUTO_TARGET_PRECISION`). *Global precision* in this context is the combined precision of the upstream pipeline followed by STEVE identifying false positive calls from that pipeline. For example, if the desired global precision is 99% and the upstream pipeline achieves 98% precision by itself, then the models trained by STEVE need to capture 1% out of the 2% false positives to achieve 99% global precision. This means the target recall will be set to 0.5000, indicating that half of all false positives need to be identified to achieve the desired global precision (a worked sketch of this arithmetic follows this patch). This approach allows pipelines that _already_ have a high precision to have lower/easier targets in STEVE while achieving the same global precision. In practice, this allows you to swap the upstream pipeline without needing to recalculate/adjust the target/accepted recall values to account for a more or less precise upstream pipeline.
+2. `ENABLE_FEATURE_SELECTION` - If set to `True`, this option enables feature selection prior to model training. Numerous parameters adjust how exactly the feature selection is performed. In general, enabling feature selection leads to longer training times, but may improve the results by removing unnecessary and/or redundant features using a systematic approach.
 
 ### Setting up conda environment
 Assuming conda or miniconda is installed, use the following two commands to create and then activate a conda environment for the pipeline:
 ```
@@ -53,7 +58,7 @@ conda activate steve
 ```
 
 Assuming the above configuration steps were successful, all that remains is to run the pipeline itself. Here is an example execution of the pipeline used in the paper:
 ```
-python3 scripts/RunTrainingPipeline.py -dest -x ./GIAB_v1.json
+python3 scripts/RunTrainingPipeline.py -dest -x ./sample_metadata/GIAB_v1.json
 ```
 For details on each option, run `python3 scripts/RunTrainingPipeline.py -h`. The following sections describe what the pipeline is actually doing.
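For concreteness, the auto-target derivation described in the README change above (and implemented in trainAllClassifiers earlier in this series) amounts to only a few lines of arithmetic. The sketch below is illustrative only; the function name and example numbers are not repository code:

#Illustrative sketch of the auto-target recall derivation (not repository code)
def derive_recall_targets(raw_precision, global_target, breakpoint_count=10):
    #fraction of the upstream pipeline's false calls that the models must capture
    lower_target_recall = (global_target - raw_precision) / (1.0 - raw_precision)
    #evenly spaced targets between that minimum and 1.0, plus the fixed extra targets
    delta = (1.0 - lower_target_recall) / breakpoint_count
    targets = [lower_target_recall + i * delta for i in range(breakpoint_count)]
    targets += [0.9999, 1.0000]
    return sorted(targets, reverse=True)

#README example: 98% upstream precision with a 99% global goal -> 0.5 target recall
targets = derive_recall_targets(0.98, 0.99)
print(round(min(targets), 4))              # 0.5

#a model reaching recall r on the false calls yields this combined precision,
#matching the 'global_prec' column printed by summarizeResults
r = 0.5
print(round(0.98 + (1.0 - 0.98) * r, 4))   # 0.99

Anchoring the targets to the global precision goal (rather than to fixed recall values) is what lets the same configuration be reused when the upstream pipeline's own precision changes.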
From 2117a81321094938e3e06a817e6a55b0f34f9d80 Mon Sep 17 00:00:00 2001 From: holtjma Date: Thu, 2 Jun 2022 16:27:05 -0500 Subject: [PATCH 13/16] update conda yaml --- scripts/TrainingPipeline.smk | 2 + scripts/conda_config.yaml | 106 ++++++++++++++++++++++++++++++----- 2 files changed, 95 insertions(+), 13 deletions(-) diff --git a/scripts/TrainingPipeline.smk b/scripts/TrainingPipeline.smk index 5bfe071..e4ac6cc 100644 --- a/scripts/TrainingPipeline.smk +++ b/scripts/TrainingPipeline.smk @@ -114,6 +114,8 @@ rule ExtractFeatures: params: script=EXTRACT_SCRIPT, prefix="{pipeline_dir}/features" + resources: + mem_mb=8192 #conda: # CONDA_ENV log: "{pipeline_dir}/logs/features/{reference}/{aligner}/{caller}/{slid}.log" diff --git a/scripts/conda_config.yaml b/scripts/conda_config.yaml index 709c0dd..7dc07cd 100644 --- a/scripts/conda_config.yaml +++ b/scripts/conda_config.yaml @@ -1,15 +1,95 @@ -name: steve +name: steve-3.10 channels: - - conda-forge - - bioconda - - defaults + - conda-forge + - bioconda + - defaults dependencies: - - python=3.7.4 - - eli5=0.10.1 - - imbalanced-learn=0.6.2 - - pysam=0.15.4 - - PyVCF=0.6.8 - - scikit-learn=0.22.2.post1 - - scikit-optimize=0.7.4 - - snakemake=5.11.1 - - xgboost=1.0.2 + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2022.5.18.1=ha878542_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=12.1.0=h8d9b700_16 + - libgomp=12.1.0=h8d9b700_16 + - libnsl=2.0.0=h7f98852_0 + - libuuid=2.32.1=h7f98852_1000 + - libzlib=1.2.12=h166bdaf_0 + - ncurses=6.3=h27087fc_1 + - openssl=3.0.3=h166bdaf_0 + - pip=22.1.2=pyhd8ed1ab_0 + - python=3.10.1=h543edf9_2_cpython + - python_abi=3.10=2_cp310 + - readline=8.1=h46c0cb4_0 + - sqlite=3.38.5=h4ff8645_0 + - tk=8.6.12=h27826a3_0 + - tzdata=2022a=h191b570_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - xz=5.2.5=h516909a_1 + - zlib=1.2.12=h166bdaf_0 + - pip: + - appdirs==1.4.4 + - attrs==21.4.0 + - certifi==2022.5.18.1 + - charset-normalizer==2.0.12 + - click==8.1.3 + - coloredlogs==15.0.1 + - configargparse==1.5.3 + - connection-pool==0.0.3 + - csl-pipeline-config==1.0.9 + - cycler==0.11.0 + - cyvcf2==0.30.15 + - datrie==0.8.2 + - decorator==5.1.1 + - docutils==0.18.1 + - dpath==2.0.6 + - eli5==0.13.0 + - fastjsonschema==2.15.3 + - fonttools==4.33.3 + - gitdb==4.0.9 + - gitpython==3.1.27 + - humanfriendly==10.0 + - idna==3.3 + - imbalanced-learn==0.9.1 + - jinja2==3.1.2 + - joblib==1.1.0 + - jsonschema==4.6.0 + - jupyter-core==4.10.0 + - kiwisolver==1.4.2 + - markupsafe==2.1.1 + - matplotlib==3.5.2 + - nbformat==5.4.0 + - numpy==1.22.4 + - packaging==21.3 + - pillow==9.1.1 + - plac==1.3.5 + - psutil==5.9.1 + - pulp==2.6.0 + - py==1.11.0 + - pyaml==21.10.1 + - pyparsing==3.0.9 + - pyrsistent==0.18.1 + - python-dateutil==2.8.2 + - python-graphviz==0.20 + - pyyaml==6.0 + - ratelimiter==1.2.0.post0 + - requests==2.27.1 + - retry==0.9.2 + - scikit-learn==1.1.1 + - scikit-optimize==0.9.0 + - scipy==1.8.1 + - setuptools==62.3.2 + - six==1.16.0 + - smart-open==6.0.0 + - smmap==5.0.0 + - snakemake==7.8.1 + - stopit==1.1.2 + - tabulate==0.8.9 + - threadpoolctl==3.1.0 + - toposort==1.7 + - traitlets==5.2.2.post1 + - urllib3==1.26.9 + - wrapt==1.14.1 + - xgboost==1.6.1 + - yte==1.4.0 +prefix: /cluster/home/jholt/githubDL/miniconda3/envs/steve-3.10 From af6e6eaff0578dd0c6e95eaefeb7d61c1bf0c86a Mon Sep 17 00:00:00 2001 From: holtjma Date: Tue, 7 Jun 2022 09:23:24 -0500 Subject: [PATCH 14/16] pipeline config removal --- scripts/conda_config.yaml | 1 - 1 file 
changed, 1 deletion(-) diff --git a/scripts/conda_config.yaml b/scripts/conda_config.yaml index 7dc07cd..3df638a 100644 --- a/scripts/conda_config.yaml +++ b/scripts/conda_config.yaml @@ -36,7 +36,6 @@ dependencies: - coloredlogs==15.0.1 - configargparse==1.5.3 - connection-pool==0.0.3 - - csl-pipeline-config==1.0.9 - cycler==0.11.0 - cyvcf2==0.30.15 - datrie==0.8.2 From 3e97832fede01bb48f92070fb7b967c1706a4375 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 20 Jul 2022 14:24:46 -0500 Subject: [PATCH 15/16] final version of training config --- scripts/PipelineConfig.py | 3 ++- scripts/PrintModelReport.py | 3 ++- scripts/TrainingConfig.py | 4 ++-- scripts/model_metrics.json | 5 ++++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/PipelineConfig.py b/scripts/PipelineConfig.py index 14a9ca1..9352359 100644 --- a/scripts/PipelineConfig.py +++ b/scripts/PipelineConfig.py @@ -14,7 +14,8 @@ SAMPLE_JSONS = [ #'%s/scripts/GIAB_v1.json' % REPO_DIRECTORY #'%s/scripts/GIAB_v2.json' % REPO_DIRECTORY - f'{REPO_DIRECTORY}/sample_metadata/GIAB_pcrfree.json' + #f'{REPO_DIRECTORY}/sample_metadata/GIAB_pcrfree.json' + f'{REPO_DIRECTORY}/sample_metadata/GIAB_pcrfree_v4.2.1.json' ] #root directory for RTG-based analysis which is expected to conform to a standard directory structure, and contain diff --git a/scripts/PrintModelReport.py b/scripts/PrintModelReport.py index 917ce51..cf10bcf 100644 --- a/scripts/PrintModelReport.py +++ b/scripts/PrintModelReport.py @@ -173,6 +173,7 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels, globa print(*rowVals, sep='\t') else: bestModelTargetRecall = clinDict['eval_recall'] + str_bestModelTargetRecall = '{0:0.4f}'.format(float(bestModelTargetRecall)) #copy TPR results bestTPR = stats[bestModelName]['ALL_SUMMARY'][bestModelTargetRecall]['TEST_TPR'][0] @@ -185,7 +186,7 @@ def printClinicalModels(allStats, acceptedRecall, targetRecall, allModels, globa bestFPR = stats[bestModelName]['ALL_SUMMARY'][bestModelTargetRecall]['TEST_FPR'][0] rowVals = [ - reformKey, bestModelName, bestModelTargetRecall, + reformKey, bestModelName, str_bestModelTargetRecall, '%0.4f+-%0.4f' % (bestTPRAvg, bestTPRStd), '%0.4f' % bestTPR, '%0.4f+-%0.4f' % (bestFPRAvg, bestFPRStd), diff --git a/scripts/TrainingConfig.py b/scripts/TrainingConfig.py index 9ce0fbe..9c73d3b 100644 --- a/scripts/TrainingConfig.py +++ b/scripts/TrainingConfig.py @@ -34,7 +34,7 @@ # e.g. if based precision is 0.999, then the model needs to recover 0.0009 of the 0.001 that is not captured # e.g. 
90% derived target recall for the models GLOBAL_AUTO_TARGET_PRECISION = 0.99996 -AUTO_TARGET_BREAKPOINT_COUNT = 10 #the number of point inbetween the target and 1.00 to calculate, must be > 0 +AUTO_TARGET_BREAKPOINT_COUNT = 20 #the number of point inbetween the target and 1.00 to calculate, must be > 0 AUTO_EXTRA_TARGETS = [0.9999, 1.0000] #these are always added #if the auto-targets is disabled, these will be used as targets instead @@ -49,7 +49,7 @@ #if True, this will pre-pend an automated feature selection step to each pipeline #WARNING: this can be a time-consuming process, the reduce run-time increase FS_STEP_SIZE and/or FS_MIN_FEATURE_COUNT ENABLE_FEATURE_SELECTION = True -FS_MIN_FEATURE_COUNT = 10 #always keep at least this many features +FS_MIN_FEATURE_COUNT = 20 #always keep at least this many features FS_STEP_SIZE = 0.1 #if an int, the number of features to remove at each step; if a float, the fraction of features to remove at each step FEATURE_SELECTION_MODELS = [ RFECV( diff --git a/scripts/model_metrics.json b/scripts/model_metrics.json index bc81e53..f001df1 100644 --- a/scripts/model_metrics.json +++ b/scripts/model_metrics.json @@ -39,13 +39,16 @@ "AFO", "GT", "DP", "GQ", "min(PL)" ], "INFO" : [ - "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", + "BaseQRankSum", "ClippingRankSum", "DP", "Entropy", "ExcessHet", "FS", "GC", "ML_PROB", "MQ", "MQ0", "MQRankSum", "NBQ", "QD", "ReadPosEndDist", "ReadPosRankSum", "SOR", "STR", "UDP" ], "MUNGED" : [ "DP_DP", "FILTER", "ID", "NEARBY", "QUAL" + ], + "IGNORE" : [ + ] } }, From 9f8d1908d5bbc6e0d5f23fe33050834a614b7288 Mon Sep 17 00:00:00 2001 From: holtjma Date: Wed, 20 Jul 2022 15:07:28 -0500 Subject: [PATCH 16/16] making clinical the default --- scripts/EvaluateVariants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/EvaluateVariants.py b/scripts/EvaluateVariants.py index 1beae67..0c64492 100644 --- a/scripts/EvaluateVariants.py +++ b/scripts/EvaluateVariants.py @@ -574,7 +574,7 @@ def loadCodicemVariants(csvFN): #optional arguments with default p.add_argument('-c', '--codicem', dest='codicem', default=None, help='a Codicem CSV file with variants to evaluate (default: None)') p.add_argument('-v', '--variants', dest='variants', default=None, help='variant coordinates to evaluate (default: None)') - p.add_argument('-m', '--model', dest='model', default='best', help='the model name to use (default: best)') + p.add_argument('-m', '--model', dest='model', default='clinical', help='the model name to use (default: clinical)') p.add_argument('-r', '--recall', dest='recall', default='0.99', help='the target recall value from training (default: 0.99)') p.add_argument('-o', '--output', dest='outFN', default=None, help='the place to send output to (default: stdout)')