Skip to content

Commit

Permalink
Refactored epitope computation. Now when using dictionary method diff…
Browse files Browse the repository at this point in the history
…erent functions are called depending on variant effect. Accounting for 3' UTR translation on frameshift and stop_lost variants. Changed config and .nf files to reflect this change.
  • Loading branch information
Akazhiel committed Feb 28, 2023
1 parent d39f054 commit a8f18b2
Show file tree
Hide file tree
Showing 7 changed files with 195 additions and 87 deletions.
101 changes: 47 additions & 54 deletions bin/epitopes.py
Original file line number Diff line number Diff line change
@@ -1,93 +1,86 @@
#!/usr/bin/env python

from varcode import Variant
from Bio.Seq import translate
from Bio.Data import IUPACData
from variant_effect import *
import re


def translate_dna(seq):
return translate(seq, to_stop=True)


def create_epitope_varcode(chrm, start, ref, alt, db, mut_dna, mut_aa, transcript, funcensgene, cDNA_dict, AA_dict):
def create_epitope_varcode(chrm, start, ref, alt, db, mut_dna, mut_aa, transcript, funcensgene, cDNA_dict, AA_dict, three_prime_utr_dict):
# Retrieve variant info
vinfo = Variant(contig=chrm, start=start, ref=ref, alt=alt, ensembl=db, allow_extended_nucleotides=True)
effect = [effect for effect in vinfo.effects() if effect.transcript_id == transcript][0]
effect = vinfo.effect_on_transcript(db.transcript_by_id(transcript))
errors = "Flags:"
wt_mer = ['-', '-']
mut_mer = ['-', '-']
starts = [13, 21]
ends = [12, 20]
pos = -1
three_to_one = IUPACData.protein_letters_3to1_extended
if effect is None:
errors += ' could not infer the effect'
errors += ' Could not infer the effect.'
else:
# Retrieve effect type
protein_mut = effect.short_description
if protein_mut is None:
errors += ' could not retrieve AA mutation'
errors += ' Could not retrieve AA mutation.'
elif not protein_mut.startswith('p.'):
errors += ' invalid mutation {}'.format(protein_mut)
errors += ' Computed with dictionary method.'
errors += ' Invalid mutation {}.'.format(protein_mut)
aa_pos = int(re.findall(r'\d+', mut_aa)[0]) if mut_aa != '' else 0
cDNA_pos = int(re.findall(r'\d+', mut_dna)[0]) if mut_dna != '' else 0
cDNA_pos = cDNA_pos + 1 if cDNA_pos == 1 else cDNA_pos
if 'missense' in funcensgene:
ref_AA, var_AA = [three_to_one[aa] for aa in re.split(r'\d+', mut_aa.lstrip('p.'))]
if aa_pos == 0:
errors += ' can not code for this mutated position'
else:
protein_seq = AA_dict[transcript]
end = [aa_pos + x if aa_pos + x < len(protein_seq) else None for x in ends]
start = [aa_pos - x for x in starts]
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation'
start = [0 if x < 0 else x for x in start]
wt_mer = [protein_seq[x:y] for x, y in zip(start, end)]
mut_mer = [protein_seq[x:aa_pos - 1] + var_AA + protein_seq[aa_pos:y] for x, y in zip(start, end)]
elif 'inframe' in funcensgene:
if 'dup' in mut_dna:
dup_pos = cDNA_pos - 1
dup_base = cDNA_dict[transcript][dup_pos]
elif 'ins' in mut_dna:
pass
elif 'del' in mut_dna:
if aa_pos == 0:
errors += ' Can not code for this mutated position.'
else:
if 'missense' in funcensgene:
errors, wt_mer, mut_mer = missense_variant(starts, ends, wt_mer, mut_mer, errors,
mut_dna, mut_aa, transcript, cDNA_pos,
aa_pos, cDNA_dict, AA_dict)
elif 'frameshift' in funcensgene:
errors, wt_mer, mut_mer = frameshift_variant(ref, starts, ends, wt_mer, mut_mer, errors,
mut_dna, mut_aa, transcript, cDNA_pos,
aa_pos, cDNA_dict, AA_dict, three_prime_utr_dict)
elif 'stop_lost' in funcensgene:
errors += ' Stop lost not yet implemented.'
pass
elif 'frameshift' in funcensgene:
fs = len(ref)
cDNA_seq = cDNA_dict[transcript]
mut_cDNA = cDNA_seq[:cDNA_pos - 1] + cDNA_seq[cDNA_pos + fs - 1:]
start = [aa_pos - x for x in starts]
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation'
start = [0 if x < 0 else x for x in start]
wt_mer = [effect.original_protein_sequence[x:aa_pos + 13] for x in start]
mut_fasta = str(translate_dna(mut_cDNA.replace(' ', '')))
mut_mer = [mut_fasta[x:] for x in start]
# errors, wt_mer, mut_mer = stoplost_variant(starts, ends, wt_mer, mut_mer, errors,
# mut_dna, mut_aa, transcript, cDNA_pos,
# aa_pos, cDNA_dict, AA_dict, three_prime_utr_dict)
elif 'inframe' in funcensgene:
errors, wt_mer, mut_mer = inframe_variant(starts, ends, wt_mer, mut_mer, errors,
mut_dna, mut_aa, transcript, cDNA_pos,
aa_pos, cDNA_dict, AA_dict)
elif protein_mut.startswith('p.X'):
errors += ' mutation occurs in stop codon'
errors += ' Mutation occurs in stop codon.'
else:
# Retrieve pos
pos = effect.aa_mutation_start_offset
protein_seq = effect.original_protein_sequence
if pos is None:
errors += ' could not find the position for this mutation'
errors += ' Could not find the position for this mutation.'
elif pos == 0:
errors += ' mutation occurs in start codon'
errors += ' Mutation occurs in start codon.'
else:
if effect.mutant_protein_sequence is None or effect.original_protein_sequence is None:
errors += ' could not retrieve protein sequence'
errors += ' Could not retrieve protein sequence.'
else:
errors += ' Computed with varcode method.'
# Type of effect
effect_type = type(effect).__name__
if 'Stop' in effect_type:
errors += ' stop mutation'
if 'StopLoss' in effect_type:
start = [pos - x + 1 for x in starts]
end = [pos + x + 1 if pos + x + 1 < len(protein_seq) else None for x in ends]
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation.'
start = [0 if x < 0 else x for x in start]
wt_mer = [effect.original_protein_sequence[x:y] for x, y in zip(start, end)]
mut_mer = [effect.mutant_protein_sequence[x:] for x in start]
elif 'Stop' in effect_type:
errors += ' Stop mutation.'
elif 'FrameShift' in effect_type:
start = [pos - x + 1 for x in starts]
end = [pos + x + 1 if pos + x + 1 < len(protein_seq) else None for x in ends]
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation'
errors += ' Start of sequence is shorter than 12aa from mutation.'
start = [0 if x < 0 else x for x in start]
wt_mer = [effect.original_protein_sequence[x:y] for x, y in zip(start, end)]
mut_mer = [effect.mutant_protein_sequence[x:] for x in start]
Expand All @@ -96,18 +89,18 @@ def create_epitope_varcode(chrm, start, ref, alt, db, mut_dna, mut_aa, transcrip
start = [pos - x + 1 for x in starts]
end = [pos + x + 1 if pos + x + 1 < len(protein_seq) else None for x in ends]
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation'
errors += ' Start of sequence is shorter than 12aa from mutation.'
start = [0 if x < 0 else x for x in start]
wt_mer = [effect.original_protein_sequence[x:y] for x, y in zip(start, end)]
mut_mer = [effect.mutant_protein_sequence[x:y] for x, y in zip(start, end)]
elif 'Insertion' in effect_type:
start = [pos - x + 1 for x in starts]
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation'
errors += ' Start of sequence is shorter than 12aa from mutation.'
start = [0 if x < 0 else x for x in start]
size = int(abs(len(ref) - len(alt)) / 3)
wt_mer = [effect.original_protein_sequence[x:pos+size+13] for x in start]
mut_mer = [effect.mutant_protein_sequence[x:pos+size+13] for x in start]
else:
errors += ' unknown exonic function {}'.format(effect_type)
return pos, errors, wt_mer, mut_mer
errors += ' Unknown exonic function {}.'.format(effect_type)
return errors, wt_mer, mut_mer
30 changes: 21 additions & 9 deletions bin/merge_variants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python
#! /usr/bin/env python

import statistics
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from collections import defaultdict

Expand All @@ -18,6 +17,7 @@ def main(dna_variants,
rna_counts,
cDNA_DICT,
AA_DICT,
UTR_DICT,
tumor_coverage,
tumor_var_depth,
tumor_var_freq,
Expand All @@ -43,12 +43,19 @@ def main(dna_variants,
for line in handle.readlines():
tokens = line.split(":")
AA_seq_dict[tokens[0]] = tokens[1].strip()

cDNA_seq_dict = dict()
with open(cDNA_DICT, "r") as handle:
for line in handle.readlines():
tokens = line.split(":")
cDNA_seq_dict[tokens[0]] = tokens[1].strip()

three_prime_utr_dict = dict()
with open(UTR_DICT, "r") as handle:
for line in handle.readlines():
tokens = line.split(":")
three_prime_utr_dict[tokens[0]] = tokens[1].strip()

variant_dict = defaultdict(list)

if dna_variants and len(dna_variants) > 0 and len(dna_variants) == len(dna_names):
Expand All @@ -65,7 +72,8 @@ def main(dna_variants,
num_callers_indel,
ensembl_version,
cDNA_seq_dict,
AA_seq_dict)
AA_seq_dict,
three_prime_utr_dict)
for variant in variants:
variant_dict[variant.key].append((variant, name))

Expand All @@ -79,7 +87,8 @@ def main(dna_variants,
num_callers_rna,
ensembl_version,
cDNA_seq_dict,
AA_seq_dict)
AA_seq_dict,
three_prime_utr_dict)
for variant in variants:
variant_dict[variant.key].append((variant, name))

Expand All @@ -91,7 +100,7 @@ def main(dna_variants,
print('Loading Gene counts..')
for file, name in zip(rna_counts, rna_names):
counts_table = pd.read_csv(file, sep='\t', skiprows=1)
counts = counts_table.iloc[:, 6].to_numpy()
counts = counts_table.iloc[:, 6].to_numpy()
lengths = counts_table['Length'].to_numpy()
rpb = counts / lengths
counts_table['RPKM'] = (rpb / sum(counts)) * 1e9
Expand All @@ -110,8 +119,8 @@ def main(dna_variants,
'DNA samples (failing)\tNumber of DNA samples (failing)\t' \
'RNA samples (passing)\tNumber of RNA samples (passing)\t' \
'RNA samples (failing)\tNumber of RNA samples (failing)\tEffects\t' \
'cDNA change\tAA change\tEpitope creation flags\tWt Epitope\t' \
'Mut Epitope\tTranscripts\tDNA Callers Sample(Name:NDP;NAD;NVAF;TDP;TAD;TVAF)\t' \
'cDNA change\tAA change\tEpitope creation flags\tWt Epitope 25mer\t' \
'Mut Epitope 25mer\tWt Epitope 41mer\tMut Epitope 41mer\tTranscripts\tDNA Callers Sample(Name:NDP;NAD;NVAF;TDP;TAD;TVAF)\t' \
'RNA Callers Sample(Name:TDP;TAD;TVAF)\tGeneCount info Sample(gene;exp;mean;percentile)\n'

final_file = open('overlap_final.txt', 'w')
Expand Down Expand Up @@ -185,7 +194,7 @@ def main(dna_variants,
';'.join(rna_name_pass), num_rna_pass,
';'.join(rna_name_fail), num_rna_fail,
effect, epitope.dnamut, epitope.aamut, epitope.flags,
epitope.wtseq[0], epitope.mutseq[0], transcripts,
epitope.wtseq[0], epitope.mutseq[0], epitope.wtseq[1], epitope.mutseq[1], transcripts,
dna_callers, rna_callers, ';'.join(gene_locus)])
if num_dna_pass >= 1:
final_file.write(to_write + '\n')
Expand Down Expand Up @@ -218,6 +227,8 @@ def main(dna_variants,
help='Path to a dictionary of transcript IDs to peptide sequences', required=True)
parser.add_argument('--dictcDNA',
help='Path to a dictionary of transcript IDs to DNA sequences', required=True)
parser.add_argument('--dict3prime',
help='Path to a dictionary of transcript IDs to three prime UTR sequences.', required=True)
parser.add_argument('--filter-dna-tumor-cov', type=int, default=10, required=False, dest='tumor_coverage',
help='Filter for DNA variants tumor number of reads (coverage) (DP). Default=10')
parser.add_argument('--filter-dna-tumor-depth', type=int, default=4, required=False, dest='tumor_var_depth',
Expand Down Expand Up @@ -259,6 +270,7 @@ def main(dna_variants,
args.rna_counts,
os.path.abspath(args.dictcDNA),
os.path.abspath(args.dictAA),
os.path.abspath(args.dict3prime),
args.tumor_coverage,
args.tumor_var_depth,
args.tumor_var_freq,
Expand All @@ -271,4 +283,4 @@ def main(dna_variants,
args.tumor_var_depth_rna,
args.tumor_var_freq_rna,
args.num_callers_rna,
args.ensembl_version)
args.ensembl_version)
98 changes: 98 additions & 0 deletions bin/variant_effect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python

from Bio.Seq import translate
from Bio.Data import IUPACData
import re


three_to_one = IUPACData.protein_letters_3to1_extended


def translate_dna(seq):
return translate(seq, to_stop=True)


def missense_variant(starts, ends, wt_mer, mut_mer, errors, mut_dna, mut_aa, transcript, cDNA_pos, aa_pos, cDNA_dict, AA_dict):
ref_AA, var_AA = [three_to_one[aa] for aa in re.split(r'\d+', mut_aa.lstrip('p.')) if len(aa[1])]
protein_seq = AA_dict[transcript]
end = [aa_pos + x if (aa_pos + x) < len(protein_seq) else None for x in ends]
start = [aa_pos - x for x in starts]
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation'
start = [0 if x < 0 else x for x in start]
wt_mer = [protein_seq[x:y] for x, y in zip(start, end)]
mut_mer = [protein_seq[x:aa_pos - 1] + var_AA + protein_seq[aa_pos:y] for x, y in zip(start, end)]
return errors, wt_mer, mut_mer


def inframe_variant(starts, ends, wt_mer, mut_mer, errors, mut_dna, mut_aa, transcript, cDNA_pos, aa_pos, cDNA_dict, AA_dict):
cDNA_seq = cDNA_dict[transcript]
protein_seq = AA_dict[transcript]
end = [aa_pos + x if (aa_pos + x) < len(protein_seq) else None for x in ends]
start = [aa_pos - x for x in starts]
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation'
start = [0 if x < 0 else x for x in start]
wt_mer = [protein_seq[x:y] for x, y in zip(start, end)]
if 'dup' in mut_dna:
dup_pos = list(map(int, re.findall(r'\d+', mut_dna))) if mut_dna != '' else 0
mut_fasta = cDNA_seq[:dup_pos[0] - 1] + cDNA_seq[dup_pos[0] - 1:dup_pos[-1]] + cDNA_seq[dup_pos[0] - 1:]
mut_protein = translate_dna(mut_fasta)
mut_mer = [mut_protein[x:y] for x, y in zip(start, end)]
elif 'ins' in mut_dna:
ins_seq = mut_dna[int(mut_dna.find('ins')) + 3:]
mut_fasta = cDNA_seq[:cDNA_pos] + ins_seq + cDNA_seq[cDNA_pos:]
mut_protein = translate_dna(mut_fasta)
mut_mer = [mut_protein[x:y] for x, y in zip(start, end)]
elif 'del' in mut_dna:
del_pos = list(map(int, re.findall(r'\d+', mut_dna))) if mut_dna != '' else 0
mut_fasta = cDNA_seq[:del_pos[0] - 1] + cDNA_seq[del_pos[1]:]
mut_protein = translate_dna(mut_fasta)
mut_mer = [mut_protein[x:y] for x, y in zip(start, end)]
return errors, wt_mer, mut_mer


def frameshift_variant(ref, starts, ends, wt_mer, mut_mer, errors, mut_dna, mut_aa, transcript, cDNA_pos, aa_pos, cDNA_dict, AA_dict, three_prime_utr_dict):
CDS_seq = cDNA_dict[transcript]
protein_seq = AA_dict[transcript]
try:
utr = three_prime_utr_dict[transcript]
except KeyError:
utr = ''
end = [aa_pos + x if (aa_pos + x) < len(protein_seq) else None for x in ends]
start = [aa_pos - x for x in starts]
cDNA_seq = CDS_seq + utr
if any(ele < 0 for ele in start):
errors += ' Start of sequence is shorter than 12aa from mutation'
start = [0 if x < 0 else x for x in start]
wt_mer = [protein_seq[x:y] for x, y in zip(start, end)]
if 'del' in mut_dna:
fs = len(ref)
mut_cDNA = cDNA_seq[:cDNA_pos - 1] + cDNA_seq[cDNA_pos + fs - 1:]
mut_fasta = str(translate_dna(mut_cDNA.replace(' ', '')))
mut_mer = [mut_fasta[x:] for x in start]
elif 'dup' in mut_dna:
dup_pos = [None, None]
dup_pos = list(map(int, re.findall(r'\d+', mut_dna))) if mut_dna != '' else 0
mut_fasta = cDNA_seq[:dup_pos[0]] + cDNA_seq[dup_pos[0] - 1:dup_pos[-1]] + cDNA_seq[dup_pos[-1]:]
mut_protein = translate_dna(mut_fasta)
mut_mer = [mut_protein[x:] for x in start]
elif 'ins' in mut_dna:
ins_seq = mut_dna[int(mut_dna.find('ins')) + 3:]
mut_fasta = cDNA_seq[:cDNA_pos] + ins_seq + cDNA_seq[cDNA_pos:]
mut_protein = translate_dna(mut_fasta)
mut_mer = [mut_protein[x:y] for x, y in zip(start, end)]
return errors, wt_mer, mut_mer


def stoplost_variant(starts, ends, wt_mer, mut_mer, errors, mut_dna, mut_aa, transcript, cDNA_pos, aa_pos, cDNA_dict, AA_dict, three_prime_utr_dict):
CDS_seq = cDNA_dict[transcript]
protein_seq = AA_dict[transcript]
try:
utr = three_prime_utr_dict[transcript]
except KeyError:
utr = ''
end = [aa_pos + x if (aa_pos + x) < len(protein_seq) else None for x in ends]
start = [aa_pos - x for x in starts]
cDNA_seq = CDS_seq + utr
pass
Loading

0 comments on commit a8f18b2

Please sign in to comment.