-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcairi.py
408 lines (364 loc) · 21.4 KB
/
cairi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
import re
import os
import sys
from Bio import Seq
from Bio import SeqIO
from Bio import SeqUtils
from Bio.SeqUtils import CodonUsage
import math
from Bio import Data
from Bio import pairwise2
import warnings
from Bio import BiopythonWarning
warnings.filterwarnings('error')
# number of nucleotide per codon ...
CODON_LEN = 3
# Number of extracted CDS coding for ribosomal proteins, that we accept as sufficient...
RIBO_LIMIT = 20
# matched amino acids threshold for cDNA recovery procedure ...
MATCHED_RATIO_PARTIAL = 0.5
MATCHED_RATIO_FULL = 0.8
# Codon dictionary for accounting, just for convenience -> get a deepcopy of it whenever you need to reset
CodonsDict = {'TTT': 0, 'TTC': 0, 'TTA': 0, 'TTG': 0, 'CTT': 0,
'CTC': 0, 'CTA': 0, 'CTG': 0, 'ATT': 0, 'ATC': 0,
'ATA': 0, 'ATG': 0, 'GTT': 0, 'GTC': 0, 'GTA': 0,
'GTG': 0, 'TAT': 0, 'TAC': 0, 'TAA': 0, 'TAG': 0,
'CAT': 0, 'CAC': 0, 'CAA': 0, 'CAG': 0, 'AAT': 0,
'AAC': 0, 'AAA': 0, 'AAG': 0, 'GAT': 0, 'GAC': 0,
'GAA': 0, 'GAG': 0, 'TCT': 0, 'TCC': 0, 'TCA': 0,
'TCG': 0, 'CCT': 0, 'CCC': 0, 'CCA': 0, 'CCG': 0,
'ACT': 0, 'ACC': 0, 'ACA': 0, 'ACG': 0, 'GCT': 0,
'GCC': 0, 'GCA': 0, 'GCG': 0, 'TGT': 0, 'TGC': 0,
'TGA': 0, 'TGG': 0, 'CGT': 0, 'CGC': 0, 'CGA': 0,
'CGG': 0, 'AGT': 0, 'AGC': 0, 'AGA': 0, 'AGG': 0,
'GGT': 0, 'GGC': 0, 'GGA': 0, 'GGG': 0}
def six_frames_translation(seq, genetic_table=11):
"""
six_frames_translation(seq, genetic_table=11)
seq: presumably coding DNA sequence in Seq or string format.
genetic_table: index of the genetic table in use (Bacterial/Archaeal/PlantPlasmid, 11 is the default)
Function returns a dictionary with keys being all 6 possible in-frame cDNA sequences, and
values being corresponding protein translations until first in-frame STOP. Beware cDNA are not
truncated at corresponding STOP codons, so they can be much long than their "to_stop" translations.
"""
anti = seq.reverse_complement()
length = len(seq)
frames = {}
for fid in range(CODON_LEN):
fragment_length = CODON_LEN * ( (length-fid)//CODON_LEN )
frames[str(seq[fid:fid+fragment_length])] = str(seq[fid:fid+fragment_length].translate(table=genetic_table,to_stop=True))
frames[str(anti[fid:fid+fragment_length])] = str(anti[fid:fid+fragment_length].translate(table=genetic_table,to_stop=True))
return frames
def get_putative_cDNA(ref_prot,frames):
"""
get_putative_cDNA(frames)
ref_prot: reference protein sequence with unknown cDNA (cDNA fuzzy location, etc.)
frames: dictionary of 6*cDNA:translation key:value pairs, returned by six_frames_translation
Takes 6-frames dictionary returned by six_frames_translation and looks for the cDNA best matching
the reference protein. Returns a pair (best_cDNA, percent_matched), where percent_matched is
the number of amino acids that matched to the reference protein sequence.
"""
# presumbaly_top_align = pairwise2.align.globalxx(seq1,seq2)[0]
# s1_aln, s2_aln, score, begin, end = presumbaly_top_align
frame_align = ( (cDNA, pairwise2.align.globalxx(str(ref_prot),translation)[0][2] ) for cDNA,translation in frames.iteritems() if translation)
# frames' translations aligned to the reference protein: iter of (cDNA, SCORE_aln)-pairs is returned ...
# returning cDNA with the best alignment score ...
best_cDNA, prot_score = max(frame_align,key=lambda x: x[1])
#
print prot_score, len(ref_prot)
#
# prot_score is simply the number of matches, in case we use the "dumb" globalxx aligner
# see BioPython pairwise2 description for details http://biopython.org/DIST/docs/api/Bio.pairwise2-module.html
# let's return the fraction of amino acids that we were able to align,
# so that we could do an advised decision on whether that recovered cDNA is usable or not ...
return (best_cDNA, prot_score/len(ref_prot))
###################################################################################
# CAI related functions ...
ribosomal_check = re.compile('ribosomal protein')
def extract_ribo_genes(genbank,nuc_seq):
"""
extract_ribo_genes(genbank,nuc_seq)
genbank: SeqRecord object with the annotated genome
nuc_seq: Seq object with the nucleotide sequence of the genome in 'genbank'
It goes through the genbank's list of features and extracts genes coding for ribosomal proteins:
their feature id (in a given genbank), nucleotide and protein sequences and finally their acceptance status.
"""
data = {"fid":[],"nucs":[],"protein":[],"table":[],"status":[]}
for fid,feature in enumerate(genbank.features):
feat_quals = feature.qualifiers
qual_keys = feat_quals.keys()
if (feature.type == 'CDS')and('translation' in qual_keys)and('product' in qual_keys):
if ribosomal_check.search(feat_quals['product'][0]):
# this is the SUPER STRICT check for suitable ribo-genes ...
# extracting coding DNA sequences ...
extracted_nucs = feature.extract(nuc_seq)
# extracting available protein sequence ...
extracted_protein = feat_quals['translation'][0]
# try to translate it back using some genetic code, which one ?
genetic_table = feat_quals['transl_table'][0] if ('transl_table' in qual_keys) else 11
# genetic_table = 11 is the Bacterial, Archaeal and Plant Plastid Code, so it'll be our default, just in case ...
#
# fill in the dict, no matter what:
data['fid'].append(fid)
data['nucs'].append(extracted_nucs)
data['protein'].append(extracted_protein)
data['table'].append(genetic_table)
#
try:
translation = extracted_nucs.translate(table=genetic_table,cds=True)
# catch the translational error exception ...
except Data.CodonTable.TranslationError:
# if gene is not true-CDS and it cannot be translated we'll automatically reject it!
data['status'].append('reject')
# and move on to the next gene in the loop ...
continue
# even if it is a true-CDS, we'll check if it translates back to what is in the feature qualifiers ..
if str(translation)==feat_quals['translation'][0]:
# OK! extracted gene translates to the the provided feature qualifier ...
data['status'].append('accept')
else:
# rejecting the gene if it does not match with the feature qualifier translation ...
data['status'].append('reject')
return data
# this returns a dictionary of with assorted info on the CDS-es from a given genbank ...
def extract_genes_features(genbank,nuc_seq):
"""
extract_genes_features(genbank,nuc_seq)
genbank: SeqRecord object with the annotated genome
nuc_seq: Seq object with the nucleotide sequence of the genome in 'genbank'
It goes through the genbank's list of features and extracts CDS-es with translation and product description.
their feature id (in a given genbank), nucleotide and protein sequences and finally their acceptance status.
"""
data = {"GenomicID":[],"fid":[],"pid":[],"cDNA":[],"protein":[],"product":[],"table":[],"status":[]}
#
for fid,feature in enumerate(genbank.features):
feat_quals = feature.qualifiers
qual_keys = feat_quals.keys()
if (feature.type == 'CDS')and('translation' in qual_keys)and('product' in qual_keys):
# we'll be extracting only nice CDS-es, ones that can be translated back to the translation present in features ...
# extract coding nucleotide sequence:
extracted_nucs = feature.extract(nuc_seq)
# extracting available protein sequence ...
extracted_protein = feat_quals['translation'][0]
# try to translate it back using some genetic code, which one ?
genetic_table = feat_quals['transl_table'][0] if ('transl_table' in qual_keys) else 11
# genetic_table = 11 is the Bacterial, Archaeal and Plant Plastid Code, so it'll be our default, just in case ...
#
# fill in the dict, no matter what:
data['GenomicID'].append(genbank.id)
data['fid'].append(fid)
data['pid'].append(feat_quals['protein_id'][0])
data['protein'].append(extracted_protein)
data['product'].append(feat_quals['product'][0].replace(',',' '))
data['table'].append(genetic_table)
#
# trying clean CDS translation first ...
try:
# print "try CDS"
translation = extracted_nucs.translate(table=genetic_table,cds=True)
# we'll be catching TranslationError exceptions here ...
except Data.CodonTable.TranslationError:
# print "try to translate ..."
# if extracted_nucs is not true-CDS, try manual translation thing ...
try:
translation = extracted_nucs.translate(table=genetic_table).strip('*')
except BiopythonWarning:
print "Partial codon detected @ genbank %s feature %d, trimming nucleotide sequence ..."%(genbank.id,int(fid))
nucs_len = len(extracted_nucs)
extracted_nucs = extracted_nucs[:-(nucs_len%CODON_LEN)]
translation = extracted_nucs.translate(table=genetic_table).strip('*')
# we are not catching anything here, it's kinda dangerous though ...
finally:
# best situation, translation matches exactly ...
if (str(translation) == extracted_protein):
# print "MATCH!"
# OK! extracted extracted_nucs translates to the the provided feature qualifier ...
data['cDNA'].append(str(extracted_nucs))
data['status'].append("accepted")
# try comparing 2 sequences as if they were aligned ...
elif sum((a==b) for a,b in zip(str(translation),extracted_protein)) >= MATCHED_RATIO_FULL*len(extracted_protein):
# print "ALMOST MATCH ..."
# OK! extracted extracted_nucs translates to the the provided feature qualifier
# ALMOST exactly ...
data['cDNA'].append(str(extracted_nucs))
data['status'].append("accepted")
# translations are matched >= 80% ...
# examples: TGA is a stop codon in 11's table, yet it can code for U (Selenocysteine)
elif pairwise2.align.globalxx(str(translation),extracted_protein,score_only=True) >= MATCHED_RATIO_FULL*len(extracted_protein):
# print "ALMOST ALIGNED ..."
# OK! extracted extracted_nucs translates to the the provided feature qualifier
# ALMOST exactly ...
data['status '].append("accepted")
data['cDNA'].append(str(extracted_nucs))
# try something else ...
else:
# print "recovery ..."
#LAST RESORT ...
# before rejecting cDNA completely, let's try recovering cDNA manually ...
print "trying to recover cDNA @ genbank %s feature %d ..."%(genbank.id,int(fid))
extracted_cDNA_trans = six_frames_translation(extracted_nucs, genetic_table=genetic_table)
putative_cDNA, matched_aa = get_putative_cDNA(extracted_protein,extracted_cDNA_trans)
# compare matched_aa with MATCHED_RATIO_PATRIAL:
# it was needed just once so far, when the start location was fuzzy for the strand=1 DNA ...
# so the extracted_nucleotides were not in frame! ...
if (matched_aa >= MATCHED_RATIO_PARTIAL):
# print "recovered!!!!!!!!!!!"
data['cDNA'].append(str(extracted_nucs))
data['status'].append("recovered")
else:
# print "total reject ..."
# final rejection ...
data['cDNA'].append(str(extracted_nucs))
data['status'].append('rejected')
# finally clause over ...
# if CDS,translation,product clause over ...
# features loop over ...
return data
#
def count_codons(dna_seq_list):
"""
count_codons(dna_seq_list)
dna_seq_list: list(or other iterable) of DNA sequences(Seq object or strings)
It goes over all sequences and count appearances of all codons.
Sequences whose length is not proportional to 3(nucleotides per codon) are skipped!
Codons containing anything but 'ATGC' are skipped as well.
All sequences must be in frame, so that first 3 letters must constitute an actual codon (user responsibility).
"""
codons_counter = {'TTT': 0, 'TTC': 0, 'TTA': 0, 'TTG': 0, 'CTT': 0,
'CTC': 0, 'CTA': 0, 'CTG': 0, 'ATT': 0, 'ATC': 0,
'ATA': 0, 'ATG': 0, 'GTT': 0, 'GTC': 0, 'GTA': 0,
'GTG': 0, 'TAT': 0, 'TAC': 0, 'TAA': 0, 'TAG': 0,
'CAT': 0, 'CAC': 0, 'CAA': 0, 'CAG': 0, 'AAT': 0,
'AAC': 0, 'AAA': 0, 'AAG': 0, 'GAT': 0, 'GAC': 0,
'GAA': 0, 'GAG': 0, 'TCT': 0, 'TCC': 0, 'TCA': 0,
'TCG': 0, 'CCT': 0, 'CCC': 0, 'CCA': 0, 'CCG': 0,
'ACT': 0, 'ACC': 0, 'ACA': 0, 'ACG': 0, 'GCT': 0,
'GCC': 0, 'GCA': 0, 'GCG': 0, 'TGT': 0, 'TGC': 0,
'TGA': 0, 'TGG': 0, 'CGT': 0, 'CGC': 0, 'CGA': 0,
'CGG': 0, 'AGT': 0, 'AGC': 0, 'AGA': 0, 'AGG': 0,
'GGT': 0, 'GGC': 0, 'GGA': 0, 'GGG': 0}
# cod_dict must be zeroed out beforehand, that's a potential issue ...
# codons = [dna_seq[i:i+CODON_LEN] for i in range(0,len(dna_seq),CODON_LEN)]
for gene_sequence in dna_seq_list:
gene_sequence = str(gene_sequence)
len_gene_sequence = len(gene_sequence)
if len_gene_sequence%CODON_LEN:
print >> sys.stderr,"(Counting codons in count_codons) gene lengths is not ~3! Skipping it!!!"
else:
for codon in [gene_sequence[i:i+CODON_LEN] for i in range(0,len_gene_sequence,CODON_LEN)]:
# count codons that we care about ...
if codon in codons_counter.keys():
codons_counter[codon] += 1 # count ...
else:
# raise TypeError("Illegal codon %s in genes" % codon)
print >> sys.stderr, "(Counting codons in count_codons) Illegal codon %s in genes! Skipping it!" % codon
#
#
return codons_counter
# CodonUsage.SynonymousCodons
def generate_codon_index(cod_dict,genetic_table=11):
"""
generate_codon_index(cod_dict,genetic_table=11)
cod_dict: dictionary with codons as keys and number of corresponding codon appearance as the value
genetic_table: index of the genetic table in use (Bacterial/Archaeal/PlantPlasmid, 11 is the default)
Function generates codon index, based on the input codon usage data (presumably from ribosomal proteins,
or other highly expressed genes). One can modify the genetic table in use.
"""
index = {}
# Let's create the index dictionary for the specific genetic table provided ...
# unambiguous DNA deals with 61 codons made of
genetic_code = Data.CodonTable.unambiguous_dna_by_id[genetic_table]
SynonymousCodons = dict([(aa,[]) for aa in genetic_code.protein_alphabet.letters])
# SynonymousCodons['STOP'] = genetic_code.stop_codons # STOP codons are excluded from analysis ...
for codon,aa in genetic_code.forward_table.iteritems():
SynonymousCodons[aa].append(codon)
# now exclude amino acids coded by a single codons: M,W for standard genetic code ...
unambiguous_aacids = [aa for aa,codons in SynonymousCodons.iteritems() if len(codons)<2]
for aa in unambiguous_aacids:
# if there is no ambiguity, there is no optimality ...
_ = SynonymousCodons.pop(aa)
#
# now to calculate the index we first need to sum the number of times
# synonymous codons were used all together.
# here is the place for potential improvement, because CodonUsage.SynonymousCodons implies the standard genetic code,
# while table=4 gives up 1 STOP codon in favor of a W(triptophane) one! While, table=11 uses the standart code, with the
# expanded range of start codons ...
for aa in SynonymousCodons:
SynCodons = SynonymousCodons[aa]
###################################
# total number of codons coding for a given amino acid in the cod_dict ...
total = sum([ cod_dict[scodon] for scodon in SynCodons ])
if not total:
print >> sys.stderr, "ACHTUNG!!! this is unprecedented!!!"
print >> sys.stderr, "Codons coding for %s amino acid are not encountered in ribo-genes of this organism!" % aa
print >> sys.stderr, "Codons corresponding to %s will appear as Illegal for this organism!" % aa
continue
else:
###################################
# calculate the RSCU value for each of the codons
# RSCU values are CodonCount/((1/num of synonymous codons) * sum of all synonymous codons)
# Relative Synonymous Codon Usage (RSCU)...
# (sum of all synonymous codons/num of synonymous codons) is expected number of this-type-of-codon assuming no codon bias!
no_bias_usage = float(total)/len(SynCodons)
RSCU = [ cod_dict[scodon]/no_bias_usage for scodon in SynCodons ]
###################################
# now generate the index W=RCSUi/RCSUmax:
RSCU_max = max(RSCU)
for scodon,RSCUi in zip(SynCodons,RSCU):
index[scodon] = RSCUi/RSCU_max
####################################
# We have excluded stop codons and those that are singletons, but mind that:
# index is alway 1, when amino acid is coded by a single codon ...
# for trans_table 11 that would be 'ATG', 'TGG' coding for Met and W(Trypt.) respectively
return index
def cai_for_gene(dna_seq,index):
"""
cai_for_gene(dna_seq,index)
dna_seq: Seq object or Python string with the in-frame coding sequence
index: Codon usage bias calculated fro presumably highly expressed CDSs, excluding singletons and STOP codons
This function calculate CAI for the provided coding sequence.
"""
dna_seq = str(dna_seq)
cai_value, cai_length, illegal_codons_count = 0.0, 0, 0
for codon in [dna_seq[i:i+CODON_LEN] for i in range(0,len(dna_seq),CODON_LEN)]:
if codon in index:
# STOP codons and singletons are already excluded ...
if index[codon]>0.0:
cai_value += math.log(index[codon])
cai_length += 1
# illegal codons are those, that contain something but 'ATGC' ...
elif sum( [ (nuc in 'ATGC') for nuc in codon ] )<3:
illegal_codons_count += 1
print >> sys.stderr, "skipping illegal codon in sequence: %s" % codon
# this line is from BioPython:
# raise TypeError("illegal codon in sequence: %s.\n%s" % (codon, index))
# we'll be skipping illegal codons instead ...
if illegal_codons_count:
print >> sys.stderr, "%d illegal codons were skipped in the %d nucs gene! %s" % (illegal_codons_count,len(dna_seq),dna_seq)
return math.exp(cai_value / (cai_length - 1.0))
###################################################################################
# #
# def count_codons_old(dna_seq_list,cod_dict):
# """
# count_codons_old(dna_seq_list,cod_dict)
# dna_seq_list: list(or other iterable) of DNA sequences(Seq object or strings)
# cod_dict: dictionary with the valid codons as the keys.
# It goes over all sequences and count appearances of all codons.
# Sequences whose length is not proportional to 3(nucleotides per codon) are skipped!
# Codons containing anything but 'ATGC' are skipped as well.
# All sequences must be in frame, so that first 3 letters must constitute an actual codon (user responsibility).
# """
# # cod_dict must be zeroed out beforehand, that's a potential issue ...
# # codons = [dna_seq[i:i+CODON_LEN] for i in range(0,len(dna_seq),CODON_LEN)]
# for gene_sequence in dna_seq_list:
# gene_sequence = str(gene_sequence)
# len_gene_sequence = len(gene_sequence)
# if len_gene_sequence%CODON_LEN:
# print >> sys.stderr,"(Counting codons in count_codons) gene lengths is not ~3! Skipping it!!!"
# else:
# for codon in [gene_sequence[i:i+CODON_LEN] for i in range(0,len_gene_sequence,CODON_LEN)]:
# if codon in cod_dict.keys():
# cod_dict[codon] += 1
# else:
# # raise TypeError("Illegal codon %s in genes" % codon)
# print >> sys.stderr, "(Counting codons in count_codons) Illegal codon %s in genes! Skipping it!" % codon