Refactored epitope computation. Now when using dictionary method diff…

…erent functions are called depending on variant effect. Accounting for 3' UTR translation on frameshift and stop_lost variants. Changed config and .nf files to reflect this change.
Akazhiel · Feb 28, 2023 · a8f18b2 · a8f18b2
1 parent d39f054
commit a8f18b2
Show file tree

Hide file tree

Showing 7 changed files with 195 additions and 87 deletions.
diff --git a/bin/epitopes.py b/bin/epitopes.py
@@ -1,93 +1,86 @@
 #!/usr/bin/env python
 
 from varcode import Variant
-from Bio.Seq import translate
-from Bio.Data import IUPACData
+from variant_effect import *
 import re
 
 
-def translate_dna(seq):
-    return translate(seq, to_stop=True)
-
-
-def create_epitope_varcode(chrm, start, ref, alt, db, mut_dna, mut_aa, transcript, funcensgene, cDNA_dict, AA_dict):
+def create_epitope_varcode(chrm, start, ref, alt, db, mut_dna, mut_aa, transcript, funcensgene, cDNA_dict, AA_dict, three_prime_utr_dict):
     # Retrieve variant info
     vinfo = Variant(contig=chrm, start=start, ref=ref, alt=alt, ensembl=db, allow_extended_nucleotides=True)
-    effect = [effect for effect in vinfo.effects() if effect.transcript_id == transcript][0]
+    effect = vinfo.effect_on_transcript(db.transcript_by_id(transcript))
     errors = "Flags:"
     wt_mer = ['-', '-']
     mut_mer = ['-', '-']
     starts = [13, 21]
     ends = [12, 20]
     pos = -1
-    three_to_one = IUPACData.protein_letters_3to1_extended
     if effect is None:
-        errors += ' could not infer the effect'
+        errors += ' Could not infer the effect.'
     else:
         # Retrieve effect type
         protein_mut = effect.short_description
         if protein_mut is None:
-            errors += ' could not retrieve AA mutation'
+            errors += ' Could not retrieve AA mutation.'
         elif not protein_mut.startswith('p.'):
-            errors += ' invalid mutation {}'.format(protein_mut)
+            errors += ' Computed with dictionary method.'
+            errors += ' Invalid mutation {}.'.format(protein_mut)
             aa_pos = int(re.findall(r'\d+', mut_aa)[0]) if mut_aa != '' else 0
             cDNA_pos = int(re.findall(r'\d+', mut_dna)[0]) if mut_dna != '' else 0
             cDNA_pos = cDNA_pos + 1 if cDNA_pos == 1 else cDNA_pos
-            if 'missense' in funcensgene:
-                ref_AA, var_AA = [three_to_one[aa] for aa in re.split(r'\d+', mut_aa.lstrip('p.'))]
-                if aa_pos == 0:
-                    errors += ' can not code for this mutated position'
-                else:
-                    protein_seq = AA_dict[transcript]
-                    end = [aa_pos + x if aa_pos + x < len(protein_seq) else None for x in ends]
-                    start = [aa_pos - x for x in starts]
-                    if any(ele < 0 for ele in start):
-                        errors += ' Start of sequence is shorter than 12aa from mutation'
-                        start = [0 if x < 0 else x for x in start]
-                    wt_mer = [protein_seq[x:y] for x, y in zip(start, end)]
-                    mut_mer = [protein_seq[x:aa_pos - 1] + var_AA + protein_seq[aa_pos:y] for x, y in zip(start, end)]
-            elif 'inframe' in funcensgene:
-                if 'dup' in mut_dna:
-                    dup_pos = cDNA_pos - 1
-                    dup_base = cDNA_dict[transcript][dup_pos]
-                elif 'ins' in mut_dna:
-                    pass
-                elif 'del' in mut_dna:
+            if aa_pos == 0:
+                errors += ' Can not code for this mutated position.'
+            else:
+                if 'missense' in funcensgene:
+                    errors, wt_mer, mut_mer = missense_variant(starts, ends, wt_mer, mut_mer, errors,
+                                                                    mut_dna, mut_aa, transcript, cDNA_pos,
+                                                                    aa_pos, cDNA_dict, AA_dict)
+                elif 'frameshift' in funcensgene:
+                    errors, wt_mer, mut_mer = frameshift_variant(ref, starts, ends, wt_mer, mut_mer, errors,
+                                                                    mut_dna, mut_aa, transcript, cDNA_pos,
+                                                                    aa_pos, cDNA_dict, AA_dict, three_prime_utr_dict)
+                elif 'stop_lost' in funcensgene:
+                    errors += ' Stop lost not yet implemented.'
                     pass
-            elif 'frameshift' in funcensgene:
-                fs = len(ref)
-                cDNA_seq = cDNA_dict[transcript]
-                mut_cDNA = cDNA_seq[:cDNA_pos - 1] + cDNA_seq[cDNA_pos + fs - 1:]
-                start = [aa_pos - x for x in starts]
-                if any(ele < 0 for ele in start):
-                    errors += ' Start of sequence is shorter than 12aa from mutation'
-                    start = [0 if x < 0 else x for x in start]
-                wt_mer = [effect.original_protein_sequence[x:aa_pos + 13] for x in start]
-                mut_fasta = str(translate_dna(mut_cDNA.replace(' ', '')))
-                mut_mer = [mut_fasta[x:] for x in start]
+                    # errors, wt_mer, mut_mer = stoplost_variant(starts, ends, wt_mer, mut_mer, errors,
+                    #                                                 mut_dna, mut_aa, transcript, cDNA_pos,
+                    #                                                 aa_pos, cDNA_dict, AA_dict, three_prime_utr_dict)
+                elif 'inframe' in funcensgene:
+                    errors, wt_mer, mut_mer = inframe_variant(starts, ends, wt_mer, mut_mer, errors,
+                                                                mut_dna, mut_aa, transcript, cDNA_pos,
+                                                               aa_pos, cDNA_dict, AA_dict)
         elif protein_mut.startswith('p.X'):
-            errors += ' mutation occurs in stop codon'
+            errors += ' Mutation occurs in stop codon.'
         else:
             # Retrieve pos
             pos = effect.aa_mutation_start_offset
             protein_seq = effect.original_protein_sequence
             if pos is None:
-                errors += ' could not find the position for this mutation'
+                errors += ' Could not find the position for this mutation.'
             elif pos == 0:
-                errors += ' mutation occurs in start codon'
+                errors += ' Mutation occurs in start codon.'
             else:
                 if effect.mutant_protein_sequence is None or effect.original_protein_sequence is None:
-                    errors += ' could not retrieve protein sequence'
+                    errors += ' Could not retrieve protein sequence.'
                 else:
+                    errors += ' Computed with varcode method.'
                     # Type of effect
                     effect_type = type(effect).__name__
-                    if 'Stop' in effect_type:
-                        errors += ' stop mutation'
+                    if 'StopLoss' in effect_type:
+                        start = [pos - x + 1 for x in starts]
+                        end = [pos + x + 1 if pos + x + 1 < len(protein_seq) else None for x in ends]
+                        if any(ele < 0 for ele in start):
+                            errors += ' Start of sequence is shorter than 12aa from mutation.'
+                            start = [0 if x < 0 else x for x in start]
+                        wt_mer = [effect.original_protein_sequence[x:y] for x, y in zip(start, end)]
+                        mut_mer = [effect.mutant_protein_sequence[x:] for x in start]
+                    elif 'Stop' in effect_type:
+                        errors += ' Stop mutation.'
                     elif 'FrameShift' in effect_type:
                         start = [pos - x + 1 for x in starts]
                         end = [pos + x + 1 if pos + x + 1 < len(protein_seq) else None for x in ends]
                         if any(ele < 0 for ele in start):
-                            errors += ' Start of sequence is shorter than 12aa from mutation'
+                            errors += ' Start of sequence is shorter than 12aa from mutation.'
                             start = [0 if x < 0 else x for x in start]
                         wt_mer = [effect.original_protein_sequence[x:y] for x, y in zip(start, end)]
                         mut_mer = [effect.mutant_protein_sequence[x:] for x in start]
@@ -96,18 +89,18 @@ def create_epitope_varcode(chrm, start, ref, alt, db, mut_dna, mut_aa, transcrip
                         start = [pos - x + 1 for x in starts]
                         end = [pos + x + 1 if pos + x + 1 < len(protein_seq) else None for x in ends]
                         if any(ele < 0 for ele in start):
-                            errors += ' Start of sequence is shorter than 12aa from mutation'
+                            errors += ' Start of sequence is shorter than 12aa from mutation.'
                             start = [0 if x < 0 else x for x in start]
                         wt_mer = [effect.original_protein_sequence[x:y] for x, y in zip(start, end)]
                         mut_mer = [effect.mutant_protein_sequence[x:y] for x, y in zip(start, end)]
                     elif 'Insertion' in effect_type:
                         start = [pos - x + 1 for x in starts]
                         if any(ele < 0 for ele in start):
-                            errors += ' Start of sequence is shorter than 12aa from mutation'
+                            errors += ' Start of sequence is shorter than 12aa from mutation.'
                             start = [0 if x < 0 else x for x in start]
                         size = int(abs(len(ref) - len(alt)) / 3)
                         wt_mer = [effect.original_protein_sequence[x:pos+size+13] for x in start]
                         mut_mer = [effect.mutant_protein_sequence[x:pos+size+13] for x in start]
                     else:
-                        errors += ' unknown exonic function {}'.format(effect_type)
-    return pos, errors, wt_mer, mut_mer
+                        errors += ' Unknown exonic function {}.'.format(effect_type)
+    return errors, wt_mer, mut_mer
diff --git a/bin/merge_variants.py b/bin/merge_variants.py
@@ -1,6 +1,5 @@
-#!/usr/bin/env python
+#! /usr/bin/env python
 
-import statistics
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
 from collections import defaultdict
 
@@ -18,6 +17,7 @@ def main(dna_variants,
          rna_counts,
          cDNA_DICT,
          AA_DICT,
+         UTR_DICT,
          tumor_coverage,
          tumor_var_depth,
          tumor_var_freq,
@@ -43,12 +43,19 @@ def main(dna_variants,
         for line in handle.readlines():
             tokens = line.split(":")
             AA_seq_dict[tokens[0]] = tokens[1].strip()
+
     cDNA_seq_dict = dict()
     with open(cDNA_DICT, "r") as handle:
         for line in handle.readlines():
             tokens = line.split(":")
             cDNA_seq_dict[tokens[0]] = tokens[1].strip()
 
+    three_prime_utr_dict = dict()
+    with open(UTR_DICT, "r") as handle:
+        for line in handle.readlines():
+            tokens = line.split(":")
+            three_prime_utr_dict[tokens[0]] = tokens[1].strip()
+
     variant_dict = defaultdict(list)
 
     if dna_variants and len(dna_variants) > 0 and len(dna_variants) == len(dna_names):
@@ -65,7 +72,8 @@ def main(dna_variants,
                                            num_callers_indel,
                                            ensembl_version,
                                            cDNA_seq_dict,
-                                           AA_seq_dict)
+                                           AA_seq_dict,
+                                           three_prime_utr_dict)
             for variant in variants:
                 variant_dict[variant.key].append((variant, name))
 
@@ -79,7 +87,8 @@ def main(dna_variants,
                                            num_callers_rna,
                                            ensembl_version,
                                            cDNA_seq_dict,
-                                           AA_seq_dict)
+                                           AA_seq_dict,
+                                           three_prime_utr_dict)
             for variant in variants:
                 variant_dict[variant.key].append((variant, name))
 
@@ -91,7 +100,7 @@ def main(dna_variants,
         print('Loading Gene counts..')
         for file, name in zip(rna_counts, rna_names):
             counts_table = pd.read_csv(file, sep='\t', skiprows=1)
-            counts = counts_table.iloc[:, 6].to_numpy()     
+            counts = counts_table.iloc[:, 6].to_numpy()
             lengths = counts_table['Length'].to_numpy()
             rpb = counts / lengths
             counts_table['RPKM'] = (rpb / sum(counts)) * 1e9
@@ -110,8 +119,8 @@ def main(dna_variants,
                    'DNA samples (failing)\tNumber of DNA samples (failing)\t' \
                    'RNA samples (passing)\tNumber of RNA samples (passing)\t' \
                    'RNA samples (failing)\tNumber of RNA samples (failing)\tEffects\t' \
-                   'cDNA change\tAA change\tEpitope creation flags\tWt Epitope\t' \
-                   'Mut Epitope\tTranscripts\tDNA Callers Sample(Name:NDP;NAD;NVAF;TDP;TAD;TVAF)\t' \
+                   'cDNA change\tAA change\tEpitope creation flags\tWt Epitope 25mer\t' \
+                   'Mut Epitope 25mer\tWt Epitope 41mer\tMut Epitope 41mer\tTranscripts\tDNA Callers Sample(Name:NDP;NAD;NVAF;TDP;TAD;TVAF)\t' \
                    'RNA Callers Sample(Name:TDP;TAD;TVAF)\tGeneCount info Sample(gene;exp;mean;percentile)\n'
 
     final_file = open('overlap_final.txt', 'w')
@@ -185,7 +194,7 @@ def main(dna_variants,
                                                   ';'.join(rna_name_pass), num_rna_pass,
                                                   ';'.join(rna_name_fail), num_rna_fail,
                                                   effect, epitope.dnamut, epitope.aamut, epitope.flags,
-                                                  epitope.wtseq[0], epitope.mutseq[0], transcripts,
+                                                  epitope.wtseq[0], epitope.mutseq[0], epitope.wtseq[1], epitope.mutseq[1], transcripts,
                                                   dna_callers, rna_callers, ';'.join(gene_locus)])
             if num_dna_pass >= 1:
                 final_file.write(to_write + '\n')
@@ -218,6 +227,8 @@ def main(dna_variants,
                         help='Path to a dictionary of transcript IDs to peptide sequences', required=True)
     parser.add_argument('--dictcDNA',
                         help='Path to a dictionary of transcript IDs to DNA sequences', required=True)
+    parser.add_argument('--dict3prime',
+                        help='Path to a dictionary of transcript IDs to three prime UTR sequences.', required=True)
     parser.add_argument('--filter-dna-tumor-cov', type=int, default=10, required=False, dest='tumor_coverage',
                         help='Filter for DNA variants tumor number of reads (coverage) (DP). Default=10')
     parser.add_argument('--filter-dna-tumor-depth', type=int, default=4, required=False, dest='tumor_var_depth',
@@ -259,6 +270,7 @@ def main(dna_variants,
          args.rna_counts,
          os.path.abspath(args.dictcDNA),
          os.path.abspath(args.dictAA),
+         os.path.abspath(args.dict3prime),
          args.tumor_coverage,
          args.tumor_var_depth,
          args.tumor_var_freq,
@@ -271,4 +283,4 @@ def main(dna_variants,
          args.tumor_var_depth_rna,
          args.tumor_var_freq_rna,
          args.num_callers_rna,
-         args.ensembl_version)
+         args.ensembl_version)
diff --git a/bin/variant_effect.py b/bin/variant_effect.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+
+from Bio.Seq import translate
+from Bio.Data import IUPACData
+import re
+
+
+three_to_one = IUPACData.protein_letters_3to1_extended
+
+
+def translate_dna(seq):
+    return translate(seq, to_stop=True)
+
+
+def missense_variant(starts, ends, wt_mer, mut_mer, errors, mut_dna, mut_aa, transcript, cDNA_pos, aa_pos, cDNA_dict, AA_dict):
+    ref_AA, var_AA = [three_to_one[aa] for aa in re.split(r'\d+', mut_aa.lstrip('p.')) if len(aa[1])]
+    protein_seq = AA_dict[transcript]
+    end = [aa_pos + x if (aa_pos + x) < len(protein_seq) else None for x in ends]
+    start = [aa_pos - x for x in starts]
+    if any(ele < 0 for ele in start):
+        errors += ' Start of sequence is shorter than 12aa from mutation'
+        start = [0 if x < 0 else x for x in start]
+    wt_mer = [protein_seq[x:y] for x, y in zip(start, end)]
+    mut_mer = [protein_seq[x:aa_pos - 1] + var_AA + protein_seq[aa_pos:y] for x, y in zip(start, end)]
+    return errors, wt_mer, mut_mer
+
+
+def inframe_variant(starts, ends, wt_mer, mut_mer, errors, mut_dna, mut_aa, transcript, cDNA_pos, aa_pos, cDNA_dict, AA_dict):
+    cDNA_seq = cDNA_dict[transcript]
+    protein_seq = AA_dict[transcript]
+    end = [aa_pos + x if (aa_pos + x) < len(protein_seq) else None for x in ends]
+    start = [aa_pos - x for x in starts]
+    if any(ele < 0 for ele in start):
+        errors += ' Start of sequence is shorter than 12aa from mutation'
+        start = [0 if x < 0 else x for x in start]
+    wt_mer = [protein_seq[x:y] for x, y in zip(start, end)]
+    if 'dup' in mut_dna:
+        dup_pos = list(map(int, re.findall(r'\d+', mut_dna))) if mut_dna != '' else 0
+        mut_fasta = cDNA_seq[:dup_pos[0] - 1] + cDNA_seq[dup_pos[0] - 1:dup_pos[-1]] + cDNA_seq[dup_pos[0] - 1:]
+        mut_protein = translate_dna(mut_fasta)
+        mut_mer = [mut_protein[x:y] for x, y in zip(start, end)]
+    elif 'ins' in mut_dna:
+        ins_seq = mut_dna[int(mut_dna.find('ins')) + 3:]
+        mut_fasta = cDNA_seq[:cDNA_pos] + ins_seq + cDNA_seq[cDNA_pos:]
+        mut_protein = translate_dna(mut_fasta)
+        mut_mer = [mut_protein[x:y] for x, y in zip(start, end)]
+    elif 'del' in mut_dna:
+        del_pos = list(map(int, re.findall(r'\d+', mut_dna))) if mut_dna != '' else 0
+        mut_fasta = cDNA_seq[:del_pos[0] - 1] + cDNA_seq[del_pos[1]:]
+        mut_protein = translate_dna(mut_fasta)
+        mut_mer = [mut_protein[x:y] for x, y in zip(start, end)]
+    return errors, wt_mer, mut_mer
+
+
+def frameshift_variant(ref, starts, ends, wt_mer, mut_mer, errors, mut_dna, mut_aa, transcript, cDNA_pos, aa_pos, cDNA_dict, AA_dict, three_prime_utr_dict):
+    CDS_seq = cDNA_dict[transcript]
+    protein_seq = AA_dict[transcript]
+    try:
+        utr = three_prime_utr_dict[transcript]
+    except KeyError:
+        utr = ''
+    end = [aa_pos + x if (aa_pos + x) < len(protein_seq) else None for x in ends]
+    start = [aa_pos - x for x in starts]
+    cDNA_seq = CDS_seq + utr
+    if any(ele < 0 for ele in start):
+        errors += ' Start of sequence is shorter than 12aa from mutation'
+        start = [0 if x < 0 else x for x in start]
+    wt_mer = [protein_seq[x:y] for x, y in zip(start, end)]
+    if 'del' in mut_dna:
+        fs = len(ref)
+        mut_cDNA = cDNA_seq[:cDNA_pos - 1] + cDNA_seq[cDNA_pos + fs - 1:]
+        mut_fasta = str(translate_dna(mut_cDNA.replace(' ', '')))
+        mut_mer = [mut_fasta[x:] for x in start]
+    elif 'dup' in mut_dna:
+        dup_pos = [None, None]
+        dup_pos = list(map(int, re.findall(r'\d+', mut_dna))) if mut_dna != '' else 0
+        mut_fasta = cDNA_seq[:dup_pos[0]] + cDNA_seq[dup_pos[0] - 1:dup_pos[-1]] + cDNA_seq[dup_pos[-1]:]
+        mut_protein = translate_dna(mut_fasta)
+        mut_mer = [mut_protein[x:] for x in start]
+    elif 'ins' in mut_dna:
+        ins_seq = mut_dna[int(mut_dna.find('ins')) + 3:]
+        mut_fasta = cDNA_seq[:cDNA_pos] + ins_seq + cDNA_seq[cDNA_pos:]
+        mut_protein = translate_dna(mut_fasta)
+        mut_mer = [mut_protein[x:y] for x, y in zip(start, end)]
+    return errors, wt_mer, mut_mer
+
+
+def stoplost_variant(starts, ends, wt_mer, mut_mer, errors, mut_dna, mut_aa, transcript, cDNA_pos, aa_pos, cDNA_dict, AA_dict, three_prime_utr_dict):
+    CDS_seq = cDNA_dict[transcript]
+    protein_seq = AA_dict[transcript]
+    try:
+        utr = three_prime_utr_dict[transcript]
+    except KeyError:
+        utr = ''
+    end = [aa_pos + x if (aa_pos + x) < len(protein_seq) else None for x in ends]
+    start = [aa_pos - x for x in starts]
+    cDNA_seq = CDS_seq + utr
+    pass