1 import math
2 from CodonUsageIndices import SharpEcoliIndex
3 from Bio import SeqIO
4
5 CodonsDict = {'TTT':0, 'TTC':0, 'TTA':0, 'TTG':0, 'CTT':0,
6 'CTC':0, 'CTA':0, 'CTG':0, 'ATT':0, 'ATC':0,
7 'ATA':0, 'ATG':0, 'GTT':0, 'GTC':0, 'GTA':0,
8 'GTG':0, 'TAT':0, 'TAC':0, 'TAA':0, 'TAG':0,
9 'CAT':0, 'CAC':0, 'CAA':0, 'CAG':0, 'AAT':0,
10 'AAC':0, 'AAA':0, 'AAG':0, 'GAT':0, 'GAC':0,
11 'GAA':0, 'GAG':0, 'TCT':0, 'TCC':0, 'TCA':0,
12 'TCG':0, 'CCT':0, 'CCC':0, 'CCA':0, 'CCG':0,
13 'ACT':0, 'ACC':0, 'ACA':0, 'ACG':0, 'GCT':0,
14 'GCC':0, 'GCA':0, 'GCG':0, 'TGT':0, 'TGC':0,
15 'TGA':0, 'TGG':0, 'CGT':0, 'CGC':0, 'CGA':0,
16 'CGG':0, 'AGT':0, 'AGC':0, 'AGA':0, 'AGG':0,
17 'GGT':0, 'GGC':0, 'GGA':0, 'GGG':0}
18
19
20
21 SynonymousCodons = {'CYS': ['TGT', 'TGC'], 'ASP': ['GAT', 'GAC'],
22 'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'],
23 'GLN': ['CAA', 'CAG'], 'MET': ['ATG'], 'ASN': ['AAC', 'AAT'],
24 'PRO': ['CCT', 'CCG', 'CCA', 'CCC'], 'LYS': ['AAG', 'AAA'],
25 'STOP': ['TAG', 'TGA', 'TAA'], 'THR': ['ACC', 'ACA', 'ACG', 'ACT'],
26 'PHE': ['TTT', 'TTC'], 'ALA': ['GCA', 'GCC', 'GCG', 'GCT'],
27 'GLY': ['GGT', 'GGG', 'GGA', 'GGC'], 'ILE': ['ATC', 'ATA', 'ATT'],
28 'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'], 'HIS': ['CAT', 'CAC'],
29 'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'], 'TRP': ['TGG'],
30 'VAL': ['GTA', 'GTC', 'GTG', 'GTT'], 'GLU': ['GAG', 'GAA'], 'TYR': ['TAT', 'TAC']}
31
32
34 """A codon adaptaion index (CAI) implementation.
35
36 This class implements the codon adaptaion index (CAI) described by Sharp and
37 Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).
38
39 methods:
40
41 set_cai_index(Index):
42
43 This method sets-up an index to be used when calculating CAI for a gene.
44 Just pass a dictionary similar to the SharpEcoliIndex in CodonUsageIndices
45 module.
46
47 generate_index(FastaFile):
48
49 This method takes a location of a FastaFile and generates an index. This
50 index can later be used to calculate CAI of a gene.
51
52 cai_for_gene(DNAsequence):
53
54 This method uses the Index (either the one you set or the one you generated)
55 and returns the CAI for the DNA sequence.
56
57 print_index():
58 This method prints out the index you used.
59
60 NOTE - This implementation does not currently cope with alternative genetic
61 codes, only the synonymous codons in the standard table are considered.
62 """
64 self.index = {}
65 self.codon_count={}
66
67
70
72 """Generate a codon usage index from a FASTA file of CDS sequences.
73
74 This method takes a location of a Fasta file containing CDS sequences
75 (which must all have a whole number of codons) and generates a codon
76 usage index. This index can later be used to calculate CAI of a gene.
77 """
78
79 if self.index != {} or self.codon_count!={}:
80 raise ValueError("an index has already been set or a codon count has been done. cannot overwrite either.")
81
82 self._count_codons(FastaFile)
83
84
85
86 for AA in SynonymousCodons.keys():
87 Sum=0.0
88 RCSU=[]
89
90 for codon in SynonymousCodons[AA]:
91 Sum += self.codon_count[codon]
92
93 for codon in SynonymousCodons[AA]:
94 RCSU.append(self.codon_count[codon]/((1.0/len(SynonymousCodons[AA]))*Sum))
95
96 RCSUmax = max(RCSU)
97 for i in range(len(SynonymousCodons[AA])):
98 self.index[SynonymousCodons[AA][i]]= RCSU[i]/RCSUmax
99
100
102 """Calculate the CAI (float) for the provided DNA sequence (string).
103
104 This method uses the Index (either the one you set or the one you generated)
105 and returns the CAI for the DNA sequence.
106 """
107 caiValue = 0
108 LengthForCai = 0
109
110 if self.index=={}:
111 self.set_cai_index(SharpEcoliIndex)
112
113 if DNAsequence.islower():
114 DNAsequence = DNAsequence.upper()
115 for i in range (0,len(DNAsequence),3):
116 codon = DNAsequence[i:i+3]
117 if codon in self.index:
118 if codon!='ATG' and codon!= 'TGG':
119 caiValue += math.log(self.index[codon])
120 LengthForCai += 1
121 elif codon not in ['TGA','TAA', 'TAG']:
122 raise TypeError("illegal codon in sequence: %s.\n%s" % (codon, self.index))
123 return math.exp(caiValue*(1.0/(LengthForCai-1)))
124
126 handle = open(FastaFile, 'r')
127
128
129 self.codon_count = CodonsDict.copy()
130
131
132 for cur_record in SeqIO.parse(handle, "fasta") :
133
134 if str(cur_record.seq).islower():
135 DNAsequence = str(cur_record.seq).upper()
136 else:
137 DNAsequence = str(cur_record.seq)
138 for i in range(0,len(DNAsequence),3):
139 codon = DNAsequence[i:i+3]
140 if codon in self.codon_count:
141 self.codon_count[codon] += 1
142 else:
143 raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.id))
144 handle.close()
145
146
148 """This method prints out the index you used."""
149 X=self.index.keys()
150 X.sort()
151 for i in X:
152 print "%s\t%.3f" %(i, self.index[i])
153