From b018a59f835e9fe6279d103c29e944bf0bd0b364 Mon Sep 17 00:00:00 2001 From: davidstew Date: Fri, 18 May 2018 16:04:57 -0700 Subject: [PATCH] Make majority a user provided option (resolves #286) resolves #286 Added snv_majority and indel_majority in a consensus tab under mutation_calling in input_parameters.yaml. If values are not provided, the majority will be dynamically figured out. --- MANUAL.md | 5 +++++ src/protect/common.py | 2 +- src/protect/mutation_calling/common.py | 21 +++++++++++++------- src/protect/pipeline/ProTECT.py | 23 +++++++++++++++++++--- src/protect/pipeline/defaults.yaml | 3 +++ src/protect/pipeline/input_parameters.yaml | 3 +++ 6 files changed, 46 insertions(+), 11 deletions(-) diff --git a/MANUAL.md b/MANUAL.md index 4d536b31..48f88ff4 100644 --- a/MANUAL.md +++ b/MANUAL.md @@ -335,6 +335,11 @@ be substituted with S3 links. Descriptions for creating all files can be found i version: 1.2.0 mutation_calling: + consensus: + indel_majority: -> Number of callers required to accept an indel. + Will dynamically compute the indel majority by default. + snv_majority: -> Number of callers required to accept an snv. + Will dynamically compute the snv majority by default. indexes: chromsosomes: canonical, canonical_chr, chr1, Y -> A list of chromosomes to process. This options overrides the diff --git a/src/protect/common.py b/src/protect/common.py index 17d72b5e..1839a2a5 100644 --- a/src/protect/common.py +++ b/src/protect/common.py @@ -638,7 +638,7 @@ def email_report(job, univ_options): server = smtplib.SMTP('localhost') except socket.error as e: if e.errno == 111: - print('No mail utils on this maachine') + print('No mail utils on this machine') else: print('Unexpected error while attempting to send an email.') print('Could not send email report') diff --git a/src/protect/mutation_calling/common.py b/src/protect/mutation_calling/common.py index 301eaae2..05dc8d55 100644 --- a/src/protect/mutation_calling/common.py +++ b/src/protect/mutation_calling/common.py @@ -51,13 +51,14 @@ def chromosomes_from_fai(genome_fai): return chromosomes -def run_mutation_aggregator(job, mutation_results, univ_options): +def run_mutation_aggregator(job, mutation_results, univ_options, consensus_options): """ Aggregate all the called mutations. :param dict mutation_results: Dict of dicts of the various mutation callers in a per chromosome format :param dict univ_options: Dict of universal options used by almost all tools + :param dict consensus_options: options specific for consensus mutation calling :returns: fsID for the merged mutations file :rtype: toil.fileStore.FileID """ @@ -80,7 +81,7 @@ def run_mutation_aggregator(job, mutation_results, univ_options): if chroms: for chrom in chroms: out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results, - univ_options).rv() + univ_options, consensus_options).rv() merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options) job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient']) return merged_snvs.rv() @@ -89,7 +90,7 @@ def run_mutation_aggregator(job, mutation_results, univ_options): -def merge_perchrom_mutations(job, chrom, mutations, univ_options): +def merge_perchrom_mutations(job, chrom, mutations, univ_options, consensus_options): """ Merge the mutation calls for a single chromosome. @@ -97,6 +98,7 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options): :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of per chromosome job store ids for vcfs as value :param dict univ_options: Dict of universal options used by almost all tools + :param dict consensus_options: options specific for consensus mutation calling :returns fsID for vcf contaning merged calls for the given chromosome :rtype: toil.fileStore.FileID """ @@ -109,13 +111,13 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options): mutations.pop('indels') mutations['strelka_indels'] = mutations['strelka']['indels'] mutations['strelka_snvs'] = mutations['strelka']['snvs'] - vcf_processor = {'snvs': {'mutect': process_mutect_vcf, + vcf_processor = {'snv': {'mutect': process_mutect_vcf, 'muse': process_muse_vcf, 'radia': process_radia_vcf, 'somaticsniper': process_somaticsniper_vcf, 'strelka_snvs': process_strelka_vcf }, - 'indels': {'strelka_indels': process_strelka_vcf + 'indel': {'strelka_indels': process_strelka_vcf } } accepted_hits = defaultdict(dict) @@ -131,7 +133,12 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options): if 'strelka_' + mut_type in perchrom_mutations: perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type] perchrom_mutations.pop('strelka_' + mut_type) - majority = 1 if len(perchrom_mutations) <= 2 else (len(perchrom_mutations) + 1) / 2 + if consensus_options[mut_type + '_majority'] is not None: + majority = consensus_options[mut_type + '_majority'] + elif len(perchrom_mutations) <= 2: + majority = 1 + else: + majority = (len(perchrom_mutations) + 1) / 2 # Read in each file to a dict vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()} all_positions = list(set(itertools.chain(*vcf_lists.values()))) @@ -171,7 +178,7 @@ def read_vcf(vcf_file): if line.startswith('#'): continue line = line.strip().split() - vcf_dict.append((line[0], line[1], line[3], line[4])) + vcf_dict.append((line[0], line[1], line[3].upper(), line[4].upper())) return vcf_dict diff --git a/src/protect/pipeline/ProTECT.py b/src/protect/pipeline/ProTECT.py index 2223f661..b8162fe7 100644 --- a/src/protect/pipeline/ProTECT.py +++ b/src/protect/pipeline/ProTECT.py @@ -434,8 +434,25 @@ def _parse_config_file(job, config_file, max_cores=None): job.fileStore.logToMaster('Obtaining tool inputs') if (sample_without_variants and all(tool_options[k]['run'] is False for k in mutation_caller_list - if k not in ('indexes', 'fusion_inspector'))): + if k not in ('indexes', 'fusion_inspector', 'consensus'))): raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.") + + for mut_type in ('snv', 'indel'): + if tool_options['consensus'][mut_type + '_majority'] is not None: + if not isinstance(tool_options['consensus'][mut_type + '_majority'], int): + raise RuntimeError('Majorities have to be integers. Got %s for %s_majority.' % + (tool_options['consensus'][mut_type + '_majority'], mut_type)) + if mut_type == 'snv': + count = sum(tool_options[k]['run'] is True + for k in ('muse', 'mutect', 'somaticsniper', 'radia', 'strelka')) + else: + count = 1 if tool_options['strelka']['run'] is True else 0 + if tool_options['consensus'][mut_type + '_majority'] > count: + raise RuntimeError('Majority cannot be greater than the number of callers. ' + 'Got number of %s callers = %s majority = %s.' % + (mut_type, count, + tool_options['consensus'][mut_type + '_majority'])) + process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options, mutation_caller_list=mutation_caller_list) job.fileStore.logToMaster('Obtained tool inputs') @@ -687,7 +704,7 @@ def launch_protect(job, patient_data, univ_options, tool_options): bam_files['tumor_rna'].addChild(mutations['radia']) get_mutations = job.wrapJobFn(run_mutation_aggregator, {caller: cjob.rv() for caller, cjob in mutations.items()}, - univ_options, disk='100M', memory='100M', + univ_options, tool_options['consensus'], disk='100M', memory='100M', cores=1).encapsulate() for caller in mutations: mutations[caller].addChild(get_mutations) @@ -777,7 +794,7 @@ def get_all_tool_inputs(job, tools, outer_key='', mutation_caller_list=None): indexes = tools.pop('indexes') indexes['chromosomes'] = parse_chromosome_string(job, indexes['chromosomes']) for mutation_caller in mutation_caller_list: - if mutation_caller == 'indexes': + if mutation_caller in ('indexes', 'consensus'): continue tools[mutation_caller].update(indexes) return tools diff --git a/src/protect/pipeline/defaults.yaml b/src/protect/pipeline/defaults.yaml index 046e043d..bd2bf5de 100644 --- a/src/protect/pipeline/defaults.yaml +++ b/src/protect/pipeline/defaults.yaml @@ -55,6 +55,9 @@ expression_estimation: version: 1.2.20 mutation_calling: + consensus: + indel_majority: + snv_majority: indexes: chromosomes: mutect: diff --git a/src/protect/pipeline/input_parameters.yaml b/src/protect/pipeline/input_parameters.yaml index 2d6d652e..e98182b6 100644 --- a/src/protect/pipeline/input_parameters.yaml +++ b/src/protect/pipeline/input_parameters.yaml @@ -83,6 +83,9 @@ expression_estimation: # version: 1.2.0 mutation_calling: + consensus: + indel_majority: 1 + snv_majority: 2 indexes: chromosomes: canonical_chr, chrM genome_fasta: S3://protect-data/hg38_references/hg38.fa.tar.gz