Package Bio :: Package SeqUtils :: Module CodonUsage
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.CodonUsage

  1  import math 
  2  from CodonUsageIndices import SharpEcoliIndex 
  3  from Bio import Fasta 
  4   
  5  CodonsDict = {'TTT':0, 'TTC':0, 'TTA':0, 'TTG':0, 'CTT':0,  
  6  'CTC':0, 'CTA':0, 'CTG':0, 'ATT':0, 'ATC':0,  
  7  'ATA':0, 'ATG':0, 'GTT':0, 'GTC':0, 'GTA':0,  
  8  'GTG':0, 'TAT':0, 'TAC':0, 'TAA':0, 'TAG':0,  
  9  'CAT':0, 'CAC':0, 'CAA':0, 'CAG':0, 'AAT':0,  
 10  'AAC':0, 'AAA':0, 'AAG':0, 'GAT':0, 'GAC':0,  
 11  'GAA':0, 'GAG':0, 'TCT':0, 'TCC':0, 'TCA':0,  
 12  'TCG':0, 'CCT':0, 'CCC':0, 'CCA':0, 'CCG':0,  
 13  'ACT':0, 'ACC':0, 'ACA':0, 'ACG':0, 'GCT':0,  
 14  'GCC':0, 'GCA':0, 'GCG':0, 'TGT':0, 'TGC':0,  
 15  'TGA':0, 'TGG':0, 'CGT':0, 'CGC':0, 'CGA':0,  
 16  'CGG':0, 'AGT':0, 'AGC':0, 'AGA':0, 'AGG':0,  
 17  'GGT':0, 'GGC':0, 'GGA':0, 'GGG':0} 
 18   
 19   
 20  # this dictionary is used to know which codons encode the same AA. 
 21  SynonymousCodons = {'CYS': ['TGT', 'TGC'], 'ASP': ['GAT', 'GAC'], 
 22  'SER': ['TCT', 'TCG', 'TCA', 'TCC', 'AGC', 'AGT'], 
 23  'GLN': ['CAA', 'CAG'], 'MET': ['ATG'], 'ASN': ['AAC', 'AAT'], 
 24  'PRO': ['CCT', 'CCG', 'CCA', 'CCC'], 'LYS': ['AAG', 'AAA'], 
 25  'STOP': ['TAG', 'TGA', 'TAA'], 'THR': ['ACC', 'ACA', 'ACG', 'ACT'], 
 26  'PHE': ['TTT', 'TTC'], 'ALA': ['GCA', 'GCC', 'GCG', 'GCT'], 
 27  'GLY': ['GGT', 'GGG', 'GGA', 'GGC'], 'ILE': ['ATC', 'ATA', 'ATT'], 
 28  'LEU': ['TTA', 'TTG', 'CTC', 'CTT', 'CTG', 'CTA'], 'HIS': ['CAT', 'CAC'], 
 29  'ARG': ['CGA', 'CGC', 'CGG', 'CGT', 'AGG', 'AGA'], 'TRP': ['TGG'], 
 30  'VAL': ['GTA', 'GTC', 'GTG', 'GTT'], 'GLU': ['GAG', 'GAA'], 'TYR': ['TAT', 'TAC']} 
 31   
 32   
33 -class CodonAdaptationIndex:
34 """ 35 36 This class implements the codon adaptaion index (CAI) described by Sharp and 37 Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95). 38 39 methods: 40 41 set_cai_index(Index): 42 43 This mehtod sets-up an index to be used when calculating CAI for a gene. 44 Just pass a dictionary similar to the SharpEcoliIndex in CodonUsageIndices 45 module. 46 47 generate_index(FastaFile): 48 49 This method takes a location of a FastaFile and generates an index. This 50 index can later be used to calculate CAI of a gene. 51 52 cai_for_gene(DNAsequence): 53 54 This mehtod uses the Index (either the one you set or the one you generated) 55 and returns the CAI for the DNA sequence. 56 57 print_index(): 58 This method prints out the index you used. 59 60 """
61 - def __init__(self):
62 self.index = {} 63 self.codon_count={}
64 65 # use this method with predefined CAI index
66 - def set_cai_index(self, Index):
67 self.index = Index
68
69 - def generate_index(self, FastaFile):
70 # first make sure i am not overwriting an existing index: 71 if self.index != {} or self.codon_count!={}: 72 raise Error("an index has already been set or a codon count has been done. cannot overwrite either.") 73 # count codon occurances in the file. 74 self._count_codons(FastaFile) 75 76 # now to calculate the index we first need to sum the number of times 77 # synonymous codons were used all together. 78 for AA in SynonymousCodons.keys(): 79 Sum=0.0 80 RCSU=[] # RCSU values are equal to CodonCount/((1/num of synonymous codons) * sum of all synonymous codons) 81 82 for codon in SynonymousCodons[AA]: 83 Sum += self.codon_count[codon] 84 # calculate the RSCU value for each of the codons 85 for codon in SynonymousCodons[AA]: 86 RCSU.append(self.codon_count[codon]/((1.0/len(SynonymousCodons[AA]))*Sum)) 87 # now generate the index W=RCSUi/RCSUmax: 88 RCSUmax = max(RCSU) 89 for i in range(len(SynonymousCodons[AA])): 90 self.index[SynonymousCodons[AA][i]]= RCSU[i]/RCSUmax
91 92
93 - def cai_for_gene(self, DNAsequence):
94 caiValue = 0 95 LengthForCai = 0 96 # if no index is set or generated, the default SharpEcoliIndex will be used. 97 if self.index=={}: 98 self.set_cai_index(SharpEcoliIndex) 99 100 if DNAsequence.islower(): 101 DNAsequence = DNAsequence.upper() 102 for i in range (0,len(DNAsequence),3): 103 codon = DNAsequence[i:i+3] 104 if self.index.has_key(codon): 105 if codon!='ATG' and codon!= 'TGG': #these two codons are always one, exclude them. 106 caiValue += math.log(self.index[codon]) 107 LengthForCai += 1 108 elif codon not in ['TGA','TAA', 'TAG']: # some indices you will use may not include stop codons. 109 raise TypeError("illegal codon in sequence: %s.\n%s" % (codon, self.index)) 110 return math.exp(caiValue*(1.0/(LengthForCai-1)))
111
112 - def _count_codons(self, FastaFile):
113 InputFile = open(FastaFile, 'r') 114 # set up the fasta parser 115 parser = Fasta.RecordParser() 116 iterator = Fasta.Iterator(InputFile, parser) 117 cur_record = iterator.next() 118 119 # make the codon dictionary local 120 self.codon_count = CodonsDict.copy() 121 122 123 # iterate over sequence and count all the codons in the FastaFile. 124 while cur_record: 125 # make sure the sequence is lower case 126 if cur_record.sequence.islower(): 127 DNAsequence = cur_record.sequence.upper() 128 else: 129 DNAsequence = cur_record.sequence 130 for i in range(0,len(DNAsequence),3): 131 codon = DNAsequence[i:i+3] 132 if self.codon_count.has_key(codon): 133 self.codon_count[codon] += 1 134 else: 135 raise TypeError("illegal codon %s in gene: %s" % (codon, cur_record.title)) 136 137 cur_record = iterator.next() 138 InputFile.close()
139 140 # this just gives the index when the objects is printed.
141 - def print_index (self):
142 X=self.index.keys() 143 X.sort() 144 for i in X: 145 print "%s\t%.3f" %(i, self.index[i])
146