Package Bio :: Package Alphabet
[hide private]
[frames] | no frames]

Source Code for Package Bio.Alphabet

# Copyright 2000-2002 by Andrew Dalke.
# Revisions copyright 2007-2008 by Peter Cock.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Alphabets used in Seq objects etc to declare sequence type and letters.

This is used by sequences which contain a finite number of similar words.
"""

class Alphabet:
    """Generic base alphabet: no fixed word size and no fixed letters."""

    size = None     # no fixed size for words
    letters = None  # no fixed alphabet; implement as a list-like interface

    def __repr__(self):
        """Return the name of the instance's class followed by "()"."""
        return self.__class__.__name__ + "()"

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy only, and does not check the letters property.
        This isn't ideal, and doesn't seem to work as intended
        with the AlphabetEncoder classes."""
        # True when other is an instance of this alphabet's class or of
        # one of its subclasses.
        return isinstance(other, self.__class__)


generic_alphabet = Alphabet()

class SingleLetterAlphabet(Alphabet):
    """Alphabet whose words are a single letter long (size is 1)."""

    size = 1
    letters = None  # string of all letters in the alphabet


single_letter_alphabet = SingleLetterAlphabet()

########### Protein

class ProteinAlphabet(SingleLetterAlphabet):
    """Marker class for protein alphabets; adds nothing to its parent."""
    pass


generic_protein = ProteinAlphabet()

########### DNA
class NucleotideAlphabet(SingleLetterAlphabet):
    """Marker class for nucleotide alphabets; adds nothing to its parent."""
    pass


generic_nucleotide = NucleotideAlphabet()

class DNAAlphabet(NucleotideAlphabet):
    """Marker class for DNA alphabets; adds nothing to its parent."""
    pass


generic_dna = DNAAlphabet()


########### RNA

class RNAAlphabet(NucleotideAlphabet):
    """Marker class for RNA alphabets; adds nothing to its parent."""
    pass


generic_rna = RNAAlphabet()



########### Other per-sequence encodings

class SecondaryStructure(SingleLetterAlphabet):
    """Single-letter alphabet made of the four letters H, S, T and C."""

    letters = "HSTC"

class ThreeLetterProtein(Alphabet):
    """Alphabet of three-letter words; letters is a list, not a string."""

    size = 3
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
        "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
        ]

###### Non per-sequence modifications

# (These are Decorator classes)

class AlphabetEncoder:
    """Decorator that wraps an alphabet and adds extra letters to it.

    Attributes that are not found on the encoder itself are looked up on
    the wrapped alphabet (see __getattr__).
    """

    def __init__(self, alphabet, new_letters):
        """Wrap alphabet and record the letters being added.

        letters becomes the wrapped alphabet's letters followed by
        new_letters, or None when the wrapped alphabet has no letters.
        """
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Delegate lookup of missing attributes to the wrapped alphabet.

        Raises AttributeError for double-underscore names and when the
        wrapped alphabet itself has not been set yet.
        """
        # Special (dunder) names are never delegated.
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        # __getattr__ only runs when normal lookup failed, so being asked
        # for "alphabet" means it is not set (e.g. an instance created
        # without __init__).  Reading self.alphabet below would re-enter
        # this method without end, so fail cleanly instead.
        if key == "alphabet":
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return ClassName(wrapped_alphabet, new_letters)."""
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False."""
        return False

class Gapped(AlphabetEncoder):
    """Alphabet decorator that adds a gap character to the wrapped alphabet."""

    def __init__(self, alphabet, gap_char="-"):
        """Wrap alphabet, adding gap_char as its one new letter."""
        AlphabetEncoder.__init__(self, alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy, and attempts to check the gap character.  This fails
        if the other alphabet does not have a gap character!

        Raises AttributeError if other has no gap_char attribute, or,
        when the gap characters match, no alphabet attribute.
        """
        return other.gap_char == self.gap_char and \
               self.alphabet.contains(other.alphabet)

class HasStopCodon(AlphabetEncoder):
    """Alphabet decorator that adds a stop symbol to the wrapped alphabet."""

    def __init__(self, alphabet, stop_symbol="*"):
        """Wrap alphabet, adding stop_symbol as its one new letter."""
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def __cmp__(self, other):
        """Three-way compare on the wrapped alphabet, then the stop symbol."""
        # NOTE(review): __cmp__ is the Python 2 comparison hook and cmp()
        # is presumably the Python 2 builtin - nothing in this module
        # defines it.  Python 3 never calls __cmp__ and has no cmp()
        # builtin; confirm the target interpreter before relying on this
        # for equality or ordering.
        x = cmp(self.alphabet, other.alphabet)
        if x == 0:
            return cmp(self.stop_symbol, other.stop_symbol)
        return x

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy, and attempts to check the stop symbol.  This fails
        if the other alphabet does not have a stop symbol!

        Raises AttributeError if other has no stop_symbol attribute, or,
        when the stop symbols match, no alphabet attribute.
        """
        return other.stop_symbol == self.stop_symbol and \
               self.alphabet.contains(other.alphabet)

def _get_base_alphabet(alphabet):
    """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).

    Unwraps nested AlphabetEncoder objects until something that is not an
    AlphabetEncoder is reached, and returns that object.

    Raises AssertionError if the innermost object is not an Alphabet.
    """
    a = alphabet
    while isinstance(a, AlphabetEncoder):
        a = a.alphabet
    # Kept as an assert so the exception type seen by callers is
    # unchanged; note that this check is skipped when Python runs with -O.
    assert isinstance(a, Alphabet), \
           "Invalid alphabet found, %s" % repr(a)
    return a

def _consensus_base_alphabet(alphabets):
    """Returns a common but often generic base alphabet object (PRIVATE).

    This throws away any AlphabetEncoder information, e.g. Gapped alphabets.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!

    Returns generic_alphabet when given no alphabets at all.
    """
    common = None
    for alpha in alphabets:
        a = _get_base_alphabet(alpha)
        if common is None:
            # First alphabet seen
            common = a
        elif common == a:
            pass
        elif isinstance(a, common.__class__):
            # a is the same class as, or a subclass of, the current
            # consensus - keep the consensus
            pass
        elif isinstance(common, a.__class__):
            # The consensus is a subclass of a's class - switch to a
            common = a
        elif isinstance(a, NucleotideAlphabet) \
        and isinstance(common, NucleotideAlphabet):
            #e.g. Give a mix of RNA and DNA alphabets
            common = generic_nucleotide
        elif isinstance(a, SingleLetterAlphabet) \
        and isinstance(common, SingleLetterAlphabet):
            #This is a pretty big mis-match!
            common = single_letter_alphabet
        else:
            #We have a major mis-match... take the easy way out!
            return generic_alphabet
    if common is None:
        #Given NO alphabets!
        return generic_alphabet
    return common

def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!

    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders.  This WILL raise an exception (ValueError) if
    more than one gap character or stop symbol is present.

    alphabets may be any iterable, including a one-shot iterator.
    """
    # The alphabets are walked several times below (and once more inside
    # _consensus_base_alphabet), so materialise them first; otherwise a
    # generator argument would be exhausted after the first pass.
    alphabets = list(alphabets)
    base = _consensus_base_alphabet(alphabets)

    # First pass: settle the gap character and the stop symbol.
    gap = None
    stop = None
    for alpha in alphabets:
        #Gaps...
        if not hasattr(alpha, "gap_char"):
            pass
        elif gap is None:
            gap = alpha.gap_char
        elif gap == alpha.gap_char:
            pass
        else:
            raise ValueError("More than one gap character present")
        #Stops...
        if not hasattr(alpha, "stop_symbol"):
            pass
        elif stop is None:
            stop = alpha.stop_symbol
        elif stop == alpha.stop_symbol:
            pass
        else:
            raise ValueError("More than one stop symbol present")

    # Second pass: gather the other new letters.  This runs only after the
    # gap and stop are final, so a letter equal to a gap or stop declared
    # by a *later* alphabet is excluded as well, instead of being kept and
    # then duplicated by the Gapped/HasStopCodon wrapper added below.
    new_letters = ""
    for alpha in alphabets:
        #New letters...
        if hasattr(alpha, "new_letters"):
            for letter in alpha.new_letters:
                if letter not in new_letters \
                and letter != gap and letter != stop:
                    new_letters += letter

    alpha = base
    if new_letters:
        alpha = AlphabetEncoder(alpha, new_letters)
    if gap:
        alpha = Gapped(alpha, gap_char=gap)
    if stop:
        alpha = HasStopCodon(alpha, stop_symbol=stop)
    return alpha

233 -def _check_type_compatible(alphabets) :
234 """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE). 235 236 This relies on the Alphabet subclassing hierarchy. It does not 237 check things like gap characters or stop symbols.""" 238 dna, rna, nucl, protein = False, False, False, False 239 for alpha in alphabets : 240 a = _get_base_alphabet(alpha) 241 if isinstance(a, DNAAlphabet) : 242 dna = True 243 nucl = True 244 if rna or protein : return False 245 elif isinstance(a, RNAAlphabet) : 246 rna = True 247 nucl = True 248 if dna or protein : return False 249 elif isinstance(a, NucleotideAlphabet) : 250 nucl = True 251 if protein : return False 252 elif isinstance(a, ProteinAlphabet) : 253 protein = True 254 if nucl : return False 255 return True
256