Merge pull request #2 from HudsonAlpha/multi_reference
Multi reference
holtjma authored Nov 18, 2021
2 parents 1dcc625 + 7c94b51 commit 8261d3e
Showing 11 changed files with 384 additions and 82 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -13,6 +13,7 @@ data/original_data
 pip3_freezes
 pipeline*
 results_b37/
+results
 scripts/.snakemake/
 scripts/CompareRTG.py
 scripts/__pycache__/
6 changes: 3 additions & 3 deletions README.md
@@ -15,9 +15,9 @@ The easiest way to invoke this pipeline is to use the provided conda environment
 Inside `PipelineConfig.py` is a set of information describing where files can be found within your environment. Generally, these values need to be set only once during the initial setup. The following are key variables that should be modified:
 
 #### Required
-1. `DATA_DIRECTORY` - The full path to a directory containing pipeline outputs. The pipeline assumes that variant files are located at `{DATA_DIRECTORY}/variant_calls/{aligner}/{caller}/{sample}.vcf.gz`. Similarly, it assumes that RTG VCFEval outputs are located at `{DATA_DIRECTORY}/rtg_results/{aligner}/{caller}/{sample}`. Both of these are required for the pipeline to run successfully. NOTE: if using a variant caller that is not specified in `model_metrics.json`, additional steps may be required (see section "Feature Extraction" for details).
-2. `ALIGNERS`, `VARIANT_CALLERS` - Each of these is a list of aligners or variant callers that will be used to populate the paths above (`{aligner}` or `{caller}`). Every combination of the two lists will be used.
-3. `FULL_PIPES` - similar to `ALIGNERS` and `VARIANT_CALLERS`, but this one allows for specific tuple pairs (i.e. instead of every combination).
+1. `DATA_DIRECTORY` - The full path to a directory containing pipeline outputs. The pipeline assumes that variant files are located at `{DATA_DIRECTORY}/variant_calls/{reference}/{aligner}/{caller}/{sample}.vcf.gz`. Similarly, it assumes that RTG VCFEval outputs are located at `{DATA_DIRECTORY}/rtg_results/{reference}/{aligner}/{caller}/{sample}`. Both of these are required for the pipeline to run successfully. NOTE: if using a variant caller that is not specified in `model_metrics.json`, additional steps may be required (see section "Feature Extraction" for details).
+2. `REFERENCES`, `ALIGNERS`, `VARIANT_CALLERS` - Each of these is a list of references, aligners, or variant callers that will be used to populate the paths above (e.g. `{reference}/{aligner}/{caller}`). Every combination of the three lists will be used.
+3. `FULL_PIPES` - similar to the above options, but this one allows for specific tuple triplets (i.e. instead of every combination); see the config sketch below.
 
 #### Optional
 1. `THREADS_PER_PROCS` - positive integer, the number of threads to use for training and testing
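For illustration, the three Required variables from the updated README can be sketched as a minimal, hypothetical `PipelineConfig.py` fragment. The directory path and the `grch38`/`bwa-mem-0.7.17-BQSR` labels below are placeholders, not values shipped with this commit; the `FULL_PIPES` triplet mirrors the one enabled in this commit's config.

```python
#hypothetical PipelineConfig.py sketch; directory and label values are placeholders
DATA_DIRECTORY = '/path/to/pipeline'  #expects variant_calls/ and rtg_results/ beneath it

#every combination of these three lists is used to fill {reference}/{aligner}/{caller}
REFERENCES = ['grch38']
ALIGNERS = ['bwa-mem-0.7.17-BQSR']
VARIANT_CALLERS = ['strelka-2.9.10']

#explicit (reference, aligner, caller) triplets, for running specific pipelines
# without enumerating every combination
FULL_PIPES = [
    ('clinical', 'sentieon-201808.07', 'strelka-2.9.10'),
]
```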
File renamed without changes.
File renamed without changes.
File renamed without changes.
28 changes: 15 additions & 13 deletions scripts/ExtractFeatures.py
@@ -252,18 +252,19 @@ def gatherVcfMetrics(vcfFN, rawVCF, metrics):
 
     return ret, fields
 
-def extractFeatures(aligner, caller, samples, outPrefix):
+def extractFeatures(reference, aligner, caller, samples, outPrefix):
     '''
     This will run the feature extraction after identifying the appropriate inputs
+    @param reference - the reference label that was used upstream
     @param aligner - the aligner that was used upstream
     @param caller - the caller that was used upstream
     @param samples - a file or string corresponding to the samples (see parseSlids(...) function)
     @param outPrefix - prefix for the extracted feature output
     @return - None
     '''
     #0 - constants based on input
-    RTG_ROOT = '%s/rtg_results/%s/%s' % (DATA_DIRECTORY, aligner, caller)
-    VCF_ROOT = '%s/variant_calls/%s/%s' % (DATA_DIRECTORY, aligner, caller)
+    RTG_ROOT = '%s/rtg_results/%s/%s/%s' % (DATA_DIRECTORY, reference, aligner, caller)
+    VCF_ROOT = '%s/variant_calls/%s/%s/%s' % (DATA_DIRECTORY, reference, aligner, caller)
     OUTPUT_ROOT = outPrefix
     CLASSIFICATION_TYPES = ['fp', 'tp']
     VCF_METRICS = ALL_METRICS[caller]
@@ -274,8 +275,8 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     #1 - load the samples
     SAMPLES = parseSlids(samples)
 
-    #2 - make sure results exist for each aligner-caller-sample triplet
-    acDir = '%s/%s/%s' % (OUTPUT_ROOT, aligner, caller)
+    #2 - make sure results exist for each reference-aligner-caller-sample quad
+    acDir = '%s/%s/%s/%s' % (OUTPUT_ROOT, reference, aligner, caller)
     if not os.path.exists(acDir):
         os.makedirs(acDir)
 
@@ -296,8 +297,8 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     #3 - reformat results into feature sets as necessary
     for sample in SAMPLES:
         for ct in CLASSIFICATION_TYPES:
-            outNumpyFN = '%s/%s/%s/%s_%s.npy' % (OUTPUT_ROOT, aligner, caller, sample, ct)
-            outCatFN = '%s/%s/%s/%s_%s_fields.json' % (OUTPUT_ROOT, aligner, caller, sample, ct)
+            outNumpyFN = '%s/%s/%s/%s/%s_%s.npy' % (OUTPUT_ROOT, reference, aligner, caller, sample, ct)
+            outCatFN = '%s/%s/%s/%s/%s_%s_fields.json' % (OUTPUT_ROOT, reference, aligner, caller, sample, ct)
             if not os.path.exists(outNumpyFN) or REGENERATE:
                 print('Building "%s"...' % (outNumpyFN, ))
                 rtgVcfFN = '%s/%s/%s.vcf.gz' % (RTG_ROOT, sample, ct)
@@ -319,15 +320,15 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     fpList = []
 
     #load the fields from the first file
-    fp = open('%s/%s/%s/%s_tp_fields.json' % (OUTPUT_ROOT, aligner, caller, SAMPLES[0]))
+    fp = open('%s/%s/%s/%s/%s_tp_fields.json' % (OUTPUT_ROOT, reference, aligner, caller, SAMPLES[0]))
     fieldsList = json.load(fp)
     fp.close()
 
     #get data from each sample
     for sample in SAMPLES:
         #TP first
-        tpFN = '%s/%s/%s/%s_tp.npy' % (OUTPUT_ROOT, aligner, caller, sample)
-        tpOrder = '%s/%s/%s/%s_tp_fields.json' % (OUTPUT_ROOT, aligner, caller, sample)
+        tpFN = '%s/%s/%s/%s/%s_tp.npy' % (OUTPUT_ROOT, reference, aligner, caller, sample)
+        tpOrder = '%s/%s/%s/%s/%s_tp_fields.json' % (OUTPUT_ROOT, reference, aligner, caller, sample)
         fp = open(tpOrder, 'r')
         tpFields = json.load(fp)
         fp.close()
@@ -336,8 +337,8 @@ def extractFeatures(aligner, caller, samples, outPrefix):
         tpList.append(tpVar)
 
         #now false positives
-        fpFN = '%s/%s/%s/%s_fp.npy' % (OUTPUT_ROOT, aligner, caller, sample)
-        fpOrder = '%s/%s/%s/%s_fp_fields.json' % (OUTPUT_ROOT, aligner, caller, sample)
+        fpFN = '%s/%s/%s/%s/%s_fp.npy' % (OUTPUT_ROOT, reference, aligner, caller, sample)
+        fpOrder = '%s/%s/%s/%s/%s_fp_fields.json' % (OUTPUT_ROOT, reference, aligner, caller, sample)
         fp = open(fpOrder, 'r')
         fpFields = json.load(fp)
         fp.close()
@@ -360,6 +361,7 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     #p.add_argument('-d', '--date-subdir', dest='date_subdir', default=None, help='the date subdirectory (default: "hli-YYMMDD")')
 
     #required main arguments
+    p.add_argument('reference', type=str, help='the reference to train on')
     p.add_argument('aligner', type=str, help='the aligner to train on')
     p.add_argument('caller', type=str, help='the variant caller to train on')
     p.add_argument('sample_metadata', type=str, help='a file with sample identifiers')
@@ -368,4 +370,4 @@ def extractFeatures(aligner, caller, samples, outPrefix):
     #parse the arguments
     args = p.parse_args()
 
-    extractFeatures(args.aligner, args.caller, args.sample_metadata, args.output_prefix)
+    extractFeatures(args.reference, args.aligner, args.caller, args.sample_metadata, args.output_prefix)
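To make the path change concrete, here is a small sketch (not part of the commit) of how the new reference-aware directories resolve; the `DATA_DIRECTORY` value and sample identifier are placeholders, while the triplet matches the one in this commit's `FULL_PIPES`:

```python
#sketch only: demonstrates the new {reference}/{aligner}/{caller} layout
DATA_DIRECTORY = '/path/to/pipeline'  #placeholder
reference, aligner, caller = 'clinical', 'sentieon-201808.07', 'strelka-2.9.10'
sample = 'SL000001'                   #placeholder sample identifier

RTG_ROOT = '%s/rtg_results/%s/%s/%s' % (DATA_DIRECTORY, reference, aligner, caller)
VCF_FN = '%s/variant_calls/%s/%s/%s/%s.vcf.gz' % (DATA_DIRECTORY, reference, aligner, caller, sample)

print(RTG_ROOT)  #/path/to/pipeline/rtg_results/clinical/sentieon-201808.07/strelka-2.9.10
print(VCF_FN)    #/path/to/pipeline/variant_calls/clinical/sentieon-201808.07/strelka-2.9.10/SL000001.vcf.gz
```

Correspondingly, the command-line invocation now takes the reference first: `python3 scripts/ExtractFeatures.py <reference> <aligner> <caller> <sample_metadata> <output_prefix>`.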
19 changes: 12 additions & 7 deletions scripts/PipelineConfig.py
@@ -3,7 +3,7 @@
 #Core pipeline config
 ############################################################
 #this is the repo directory
-REPO_DIRECTORY = '/gpfs/gpfs1/home/jholt/sanger_less_tests'
+REPO_DIRECTORY = '/cluster/home/jholt/sanger_less_tests'
 
 #a dictionary containing metadata for the samples
 SAMPLE_JSONS = [
@@ -16,10 +16,15 @@
 # the following files
 # {DATA_DIRECTORY}/variant_calls/{aligner}/{caller}/{sample}.vcf.gz - the expected VCF file paths
 # {DATA_DIRECTORY}/rtg_results/{aligner}/{caller}/{sample} - folder containing standard RTG results, specifically summary.txt, fp.vcf.gz, and tp.vcf.gz
-DATA_DIRECTORY = '/gpfs/gpfs1/home/jholt/csl_validations/core_pipeline_analysis/pipeline'
+#DATA_DIRECTORY = '/cluster/home/jholt/csl_validations/core_pipeline_analysis/pipeline'
+DATA_DIRECTORY = '/cluster/home/jholt/CSL_pipeline_benchmark/pipeline'
 
-#this is where you specify the pipelines to run; every ALIGNER and VARIANT_CALLER combo will be run together
-# pair-wise and used to identify batches of files for training/testing
+#this is where you specify the pipelines to run; every REFERENCE, ALIGNER, and VARIANT_CALLER combo will be run together
+# and each triplet gets its own model trained for it
+REFERENCES = [
+    #'clinical'
+]
+
 ALIGNERS = [
     #'bwa-mem-0.7.17-BQSR'
 ]
@@ -32,7 +37,7 @@
 FULL_PIPES = [
     #('dragen-07.011.352.3.2.8b', 'dragen-07.011.352.3.2.8b'), #this is what we originally trained with
     #('dragen-07.021.510.3.5.7', 'dragen-07.021.510.3.5.7'), #this is what follow ups were on
-    ('clinical_sentieon-201808.07', 'strelka-2.9.10')
+    ('clinical', 'sentieon-201808.07', 'strelka-2.9.10')
 ]
 
 #pipeline parameters, thread count currently only applies to the actual model training (CV to be precise)
@@ -43,7 +48,7 @@
 ############################################################
 
 #only necessary if you plan to generate a data summary (i.e. the Supplementary Document)
-LATEX_PATH = '/gpfs/gpfs1/home/jholt/texlive/2019/bin/x86_64-linux/pdflatex'
+LATEX_PATH = '/cluster/home/jholt/texlive/2019/bin/x86_64-linux/pdflatex'
 
 ############################################################
 #Slack config
@@ -54,5 +59,5 @@
 
 #json with sub-dictionary 'urls' where key is a channel name (such as the value in ENABLED_SLACK_CHANNEL)
 # and value is a Slack endpoint URL
-ENABLED_SLACK_URLS = '/gpfs/gpfs1/home/jholt/slack_integration/data/slack_urls.json'
+ENABLED_SLACK_URLS = '/cluster/home/jholt/slack_integration/data/slack_urls.json'
 ENABLED_SLACK_CHANNEL = "@holtjma"
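For context, a sketch of how downstream code might enumerate the configured pipelines (assumed behavior, not code from this commit): every combination of the three lists plus each explicit `FULL_PIPES` triplet yields one (reference, aligner, caller) pipeline, and each triplet gets its own model.

```python
import itertools

#assumed example values mirroring the config above
REFERENCES = ['clinical']
ALIGNERS = ['bwa-mem-0.7.17-BQSR']
VARIANT_CALLERS = ['strelka-2.9.10']
FULL_PIPES = [('clinical', 'sentieon-201808.07', 'strelka-2.9.10')]

#every cross-product triplet, plus the explicitly listed ones
pipelines = list(itertools.product(REFERENCES, ALIGNERS, VARIANT_CALLERS)) + FULL_PIPES
for reference, aligner, caller in pipelines:
    print('training model for %s/%s/%s' % (reference, aligner, caller))
```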
