Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  """Extract information from alignment objects. 
  2   
  3  In order to try and avoid huge alignment objects with tons of functions, 
  4  functions which return summary type information about alignments should 
  5  be put into classes in this module. 
  6   
  7  classes: 
  8  o SummaryInfo 
  9  o PSSM 
 10  """ 
 11   
 12  # standard library 
 13  import math 
 14  import sys 
 15   
 16  # biopython modules 
 17  from Bio import Alphabet 
 18  from Bio.Alphabet import IUPAC 
 19  from Bio.Seq import Seq 
 20  from Bio.SubsMat import FreqTable 
 21   
 22  # Expected random distributions for 20-letter protein, and 
 23  # for 4-letter nucleotide alphabets 
 24  Protein20Random = 0.05 
 25  Nucleotide4Random = 0.25 
26 -class SummaryInfo:
27 """Calculate summary info about the alignment. 28 29 This class should be used to caclculate information summarizing the 30 results of an alignment. This may either be straight consensus info 31 or more complicated things. 32 """
33 - def __init__(self, alignment):
34 """Initialize with the alignment to calculate information on. 35 ic_vector attribute. A dictionary. Keys: column numbers. Values: 36 """ 37 self.alignment = alignment 38 self.ic_vector = {}
39
40 - def dumb_consensus(self, threshold = .7, ambiguous = "X", 41 consensus_alpha = None, require_multiple = 0):
42 """Output a fast consensus sequence of the alignment. 43 44 This doesn't do anything fancy at all. It will just go through the 45 sequence residue by residue and count up the number of each type 46 of residue (ie. A or G or T or C for DNA) in all sequences in the 47 alignment. If the percentage of the most common residue type is 48 greater then the passed threshold, then we will add that residue type, 49 otherwise an ambiguous character will be added. 50 51 This could be made a lot fancier (ie. to take a substitution matrix 52 into account), but it just meant for a quick and dirty consensus. 53 54 Arguments: 55 o threshold - The threshold value that is required to add a particular 56 atom. 57 o ambiguous - The ambiguous character to be added when the threshold is 58 not reached. 59 o consensus_alpha - The alphabet to return for the consensus sequence. 60 If this is None, then we will try to guess the alphabet. 61 o require_multiple - If set as 1, this will require that more than 62 1 sequence be part of an alignment to put it in the consensus (ie. 63 not just 1 sequence and gaps). 64 """ 65 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 66 consensus = '' 67 68 # find the length of the consensus we are creating 69 con_len = self.alignment.get_alignment_length() 70 71 # go through each seq item 72 for n in range(con_len): 73 # keep track of the counts of the different atoms we get 74 atom_dict = {} 75 num_atoms = 0 76 77 for record in self.alignment._records: 78 # make sure we haven't run past the end of any sequences 79 # if they are of different lengths 80 if n < len(record.seq): 81 if record.seq[n] != '-' and record.seq[n] != '.': 82 if record.seq[n] not in atom_dict.keys(): 83 atom_dict[record.seq[n]] = 1 84 else: 85 atom_dict[record.seq[n]] = \ 86 atom_dict[record.seq[n]] + 1 87 88 num_atoms = num_atoms + 1 89 90 max_atoms = [] 91 max_size = 0 92 93 for atom in atom_dict.keys(): 94 if atom_dict[atom] > max_size: 95 max_atoms = [atom] 96 max_size = atom_dict[atom] 97 elif atom_dict[atom] == max_size: 98 max_atoms.append(atom) 99 100 if require_multiple and num_atoms == 1: 101 consensus = consensus + ambiguous 102 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 103 >= threshold): 104 consensus = consensus + max_atoms[0] 105 else: 106 consensus = consensus + ambiguous 107 108 # we need to guess a consensus alphabet if one isn't specified 109 if consensus_alpha is None: 110 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 111 112 return Seq(consensus, consensus_alpha)
113
114 - def gap_consensus(self, threshold = .7, ambiguous = "X", 115 consensus_alpha = None, require_multiple = 0):
116 """Same as dumb_consensus(), but allows gap on the output. 117 118 Things to do: Let the user define that with only one gap, the result 119 character in consensus is gap. Let the user select gap character, now 120 it takes the same is input. 121 """ 122 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 123 consensus = '' 124 125 # find the length of the consensus we are creating 126 con_len = self.alignment.get_alignment_length() 127 128 # go through each seq item 129 for n in range(con_len): 130 # keep track of the counts of the different atoms we get 131 atom_dict = {} 132 num_atoms = 0 133 134 for record in self.alignment._records: 135 # make sure we haven't run past the end of any sequences 136 # if they are of different lengths 137 if n < len(record.seq): 138 if record.seq[n] not in atom_dict.keys(): 139 atom_dict[record.seq[n]] = 1 140 else: 141 atom_dict[record.seq[n]] = \ 142 atom_dict[record.seq[n]] + 1 143 144 num_atoms = num_atoms + 1 145 146 max_atoms = [] 147 max_size = 0 148 149 for atom in atom_dict.keys(): 150 if atom_dict[atom] > max_size: 151 max_atoms = [atom] 152 max_size = atom_dict[atom] 153 elif atom_dict[atom] == max_size: 154 max_atoms.append(atom) 155 156 if require_multiple and num_atoms == 1: 157 consensus = consensus + ambiguous 158 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 159 >= threshold): 160 consensus = consensus + max_atoms[0] 161 else: 162 consensus = consensus + ambiguous 163 164 # we need to guess a consensus alphabet if one isn't specified 165 if consensus_alpha is None: 166 #TODO - Should we make this into a Gapped alphabet? 167 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 168 169 return Seq(consensus, consensus_alpha)
170
171 - def _guess_consensus_alphabet(self, ambiguous):
172 """Pick an (ungapped) alphabet for an alignment consesus sequence. 173 174 This just looks at the sequences we have, checks their type, and 175 returns as appropriate type which seems to make sense with the 176 sequences we've got. 177 """ 178 #Start with the (un-gapped version of) the alignment alphabet 179 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 180 181 #Now check its compatible with all the rest of the sequences 182 for record in self.alignment : 183 #Get the (un-gapped version of) the sequence's alphabet 184 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 185 if not isinstance(alt, a.__class__) : 186 raise ValueError \ 187 ("Alignment contains a sequence with an incompatible alphabet.") 188 189 #Check the ambiguous character we are going to use in the consensus 190 #is in the alphabet's list of valid letters (if defined). 191 if hasattr(a, "letters") and a.letters is not None \ 192 and ambiguous not in a.letters : 193 #We'll need to pick a more generic alphabet... 194 if isinstance(a, IUPAC.IUPACUnambiguousDNA) : 195 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters : 196 a = IUPAC.IUPACUnambiguousDNA() 197 else : 198 a = Alphabet.generic_dna 199 elif isinstance(a, IUPAC.IUPACUnambiguousRNA) : 200 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters : 201 a = IUPAC.IUPACUnambiguousRNA() 202 else : 203 a = Alphabet.generic_rna 204 elif isinstance(a, IUPAC.IUPACProtein) : 205 if ambiguous in IUPAC.ExtendedIUPACProtein().letters : 206 a = IUPAC.ExtendedIUPACProtein() 207 else : 208 a = Alphabet.generic_protein 209 else : 210 a = Alphabet.single_letter_alphabet 211 return a
212
213 - def replacement_dictionary(self, skip_chars = []):
214 """Generate a replacement dictionary to plug into a substitution matrix 215 216 This should look at an alignment, and be able to generate the number 217 of substitutions of different residues for each other in the 218 aligned object. 219 220 Will then return a dictionary with this information: 221 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 222 223 This also treats weighted sequences. The following example shows how 224 we calculate the replacement dictionary. Given the following 225 multiple sequence alignments: 226 227 GTATC 0.5 228 AT--C 0.8 229 CTGTC 1.0 230 231 For the first column we have: 232 ('A', 'G') : 0.5 * 0.8 = 0.4 233 ('C', 'G') : 0.5 * 1.0 = 0.5 234 ('A', 'C') : 0.8 * 1.0 = 0.8 235 236 We then continue this for all of the columns in the alignment, summing 237 the information for each substitution in each column, until we end 238 up with the replacement dictionary. 239 240 Arguments: 241 o skip_chars - A list of characters to skip when creating the dictionary. 242 For instance, you might have Xs (screened stuff) or Ns, and not want 243 to include the ambiguity characters in the dictionary. 244 """ 245 # get a starting dictionary based on the alphabet of the alignment 246 rep_dict, skip_items = self._get_base_replacements(skip_chars) 247 248 # iterate through each record 249 for rec_num1 in range(len(self.alignment._records)): 250 # iterate through each record from one beyond the current record 251 # to the end of the list of records 252 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)): 253 # for each pair of records, compare the sequences and add 254 # the pertinent info to the dictionary 255 rep_dict = self._pair_replacement( 256 self.alignment._records[rec_num1].seq, 257 self.alignment._records[rec_num2].seq, 258 self.alignment._records[rec_num1].annotations.get('weight',1), 259 self.alignment._records[rec_num2].annotations.get('weight',1), 260 rep_dict, skip_items) 261 262 return rep_dict
263
264 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 265 start_dict, ignore_chars):
266 """Compare two sequences and generate info on the replacements seen. 267 268 Arguments: 269 o seq1, seq2 - The two sequences to compare. 270 o weight1, weight2 - The relative weights of seq1 and seq2. 271 o start_dict - The dictionary containing the starting replacement 272 info that we will modify. 273 o ignore_chars - A list of characters to ignore when calculating 274 replacements (ie. '-'). 275 276 Returns: 277 o A replacment dictionary which is modified from initial_dict with 278 the information from the sequence comparison. 279 """ 280 # loop through each residue in the sequences 281 for residue_num in range(len(seq1)): 282 residue1 = seq1[residue_num] 283 try: 284 residue2 = seq2[residue_num] 285 # if seq2 is shorter, then we just stop looking at replacements 286 # and return the information 287 except IndexError: 288 return start_dict 289 290 # if the two residues are characters we want to count 291 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 292 try: 293 # add info about the replacement to the dictionary, 294 # modified by the sequence weights 295 start_dict[(residue1, residue2)] = \ 296 start_dict[(residue1, residue2)] + \ 297 weight1 * weight2 298 299 # if we get a key error, then we've got a problem with alphabets 300 except KeyError: 301 raise ValueError("Residues %s, %s not found in alphabet %s" 302 % (residue1, residue2, 303 self.alignment._alphabet)) 304 305 return start_dict
306 307
308 - def _get_all_letters(self):
309 """Returns a string containing the expected letters in the alignment.""" 310 all_letters = self.alignment._alphabet.letters 311 if all_letters is None \ 312 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \ 313 and all_letters == self.alignment._alphabet.gap_char): 314 #We are dealing with a generic alphabet class where the 315 #letters are not defined! We must build a list of the 316 #letters used... 317 set_letters = set() 318 for record in self.alignment : 319 #Note the built in set does not have a union_update 320 #which was provided by the sets module's Set 321 set_letters = set_letters.union(record.seq) 322 list_letters = list(set_letters) 323 list_letters.sort() 324 all_letters = "".join(list_letters) 325 return all_letters
326
327 - def _get_base_replacements(self, skip_items = []):
328 """Get a zeroed dictonary of all possible letter combinations. 329 330 This looks at the type of alphabet and gets the letters for it. 331 It then creates a dictionary with all possible combinations of these 332 letters as keys (ie. ('A', 'G')) and sets the values as zero. 333 334 Returns: 335 o The base dictionary created 336 o A list of alphabet items to skip when filling the dictionary.Right 337 now the only thing I can imagine in this list is gap characters, but 338 maybe X's or something else might be useful later. This will also 339 include any characters that are specified to be skipped. 340 """ 341 base_dictionary = {} 342 all_letters = self._get_all_letters() 343 344 # if we have a gapped alphabet we need to find the gap character 345 # and drop it out 346 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 347 skip_items.append(self.alignment._alphabet.gap_char) 348 all_letters = all_letters.replace(self.alignment._alphabet.gap_char,'') 349 350 # now create the dictionary 351 for first_letter in all_letters: 352 for second_letter in all_letters: 353 if (first_letter not in skip_items and 354 second_letter not in skip_items): 355 base_dictionary[(first_letter, second_letter)] = 0 356 357 return base_dictionary, skip_items
358 359
360 - def pos_specific_score_matrix(self, axis_seq = None, 361 chars_to_ignore = []):
362 """Create a position specific score matrix object for the alignment. 363 364 This creates a position specific score matrix (pssm) which is an 365 alternative method to look at a consensus sequence. 366 367 Arguments: 368 o chars_to_ignore - A listing of all characters not to include in 369 the pssm. If the alignment alphabet declares a gap character, 370 then it will be excluded automatically. 371 o axis_seq - An optional argument specifying the sequence to 372 put on the axis of the PSSM. This should be a Seq object. If nothing 373 is specified, the consensus sequence, calculated with default 374 parameters, will be used. 375 376 Returns: 377 o A PSSM (position specific score matrix) object. 378 """ 379 # determine all of the letters we have to deal with 380 all_letters = self._get_all_letters() 381 assert all_letters 382 383 if not isinstance(chars_to_ignore, list) : 384 raise TypeError("chars_to_ignore should be a list.") 385 386 # if we have a gap char, add it to stuff to ignore 387 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 388 chars_to_ignore.append(self.alignment._alphabet.gap_char) 389 390 for char in chars_to_ignore: 391 all_letters = all_letters.replace(char, '') 392 393 if axis_seq: 394 left_seq = axis_seq 395 assert len(axis_seq) == self.alignment.get_alignment_length() 396 else: 397 left_seq = self.dumb_consensus() 398 399 pssm_info = [] 400 # now start looping through all of the sequences and getting info 401 for residue_num in range(len(left_seq)): 402 score_dict = self._get_base_letters(all_letters) 403 for record in self.alignment._records: 404 try: 405 this_residue = record.seq[residue_num] 406 # if we hit an index error we've run out of sequence and 407 # should not add new residues 408 except IndexError: 409 this_residue = None 410 411 if this_residue and this_residue not in chars_to_ignore: 412 weight = record.annotations.get('weight', 1) 413 try: 414 score_dict[this_residue] += weight 415 # if we get a KeyError then we have an alphabet problem 416 except KeyError: 417 raise ValueError("Residue %s not found in alphabet %s" 418 % (this_residue, 419 self.alignment._alphabet)) 420 421 pssm_info.append((left_seq[residue_num], 422 score_dict)) 423 424 425 return PSSM(pssm_info)
426
427 - def _get_base_letters(self, letters):
428 """Create a zeroed dictionary with all of the specified letters. 429 """ 430 base_info = {} 431 for letter in letters: 432 base_info[letter] = 0 433 434 return base_info
435
436 - def information_content(self, start = 0, 437 end = None, 438 e_freq_table = None, log_base = 2, 439 chars_to_ignore = []):
440 """Calculate the information content for each residue along an alignment. 441 442 Arguments: 443 o start, end - The starting an ending points to calculate the 444 information content. These points should be relative to the first 445 sequence in the alignment, starting at zero (ie. even if the 'real' 446 first position in the seq is 203 in the initial sequence, for 447 the info content, we need to use zero). This defaults to the entire 448 length of the first sequence. 449 o e_freq_table - A FreqTable object specifying the expected frequencies 450 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 451 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 452 included, since these should not have expected frequencies. 453 o log_base - The base of the logathrim to use in calculating the 454 information content. This defaults to 2 so the info is in bits. 455 o chars_to_ignore - A listing of characterw which should be ignored 456 in calculating the info content. 457 458 Returns: 459 o A number representing the info content for the specified region. 460 461 Please see the Biopython manual for more information on how information 462 content is calculated. 463 """ 464 # if no end was specified, then we default to the end of the sequence 465 if end is None: 466 end = len(self.alignment._records[0].seq) 467 468 if start < 0 or end > len(self.alignment._records[0].seq): 469 raise ValueError \ 470 ("Start (%s) and end (%s) are not in the range %s to %s" 471 % (start, end, 0, len(self.alignment._records[0].seq))) 472 # determine random expected frequencies, if necessary 473 random_expected = None 474 if not e_freq_table: 475 #TODO - What about ambiguous alphabets? 476 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 477 if isinstance(base_alpha, Alphabet.ProteinAlphabet) : 478 random_expected = Protein20Random 479 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet) : 480 random_expected = Nucleotide4Random 481 else : 482 errstr = "Error in alphabet: not Nucleotide or Protein, " 483 errstr += "supply expected frequencies" 484 raise ValueError(errstr) 485 del base_alpha 486 elif not isinstance(e_freq_table, FreqTable.FreqTable) : 487 raise ValueError("e_freq_table should be a FreqTable object") 488 489 490 # determine all of the letters we have to deal with 491 all_letters = self._get_all_letters() 492 for char in chars_to_ignore: 493 all_letters = all_letters.replace(char, '') 494 495 info_content = {} 496 for residue_num in range(start, end): 497 freq_dict = self._get_letter_freqs(residue_num, 498 self.alignment._records, 499 all_letters, chars_to_ignore) 500 # print freq_dict, 501 column_score = self._get_column_info_content(freq_dict, 502 e_freq_table, 503 log_base, 504 random_expected) 505 506 info_content[residue_num] = column_score 507 # sum up the score 508 total_info = 0 509 for column_info in info_content.values(): 510 total_info = total_info + column_info 511 # fill in the ic_vector member: holds IC for each column 512 for i in info_content.keys(): 513 self.ic_vector[i] = info_content[i] 514 return total_info
515
516 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
517 """Determine the frequency of specific letters in the alignment. 518 519 Arguments: 520 o residue_num - The number of the column we are getting frequencies 521 from. 522 o all_records - All of the SeqRecords in the alignment. 523 o letters - The letters we are interested in getting the frequency 524 for. 525 o to_ignore - Letters we are specifically supposed to ignore. 526 527 This will calculate the frequencies of each of the specified letters 528 in the alignment at the given frequency, and return this as a 529 dictionary where the keys are the letters and the values are the 530 frequencies. 531 """ 532 freq_info = self._get_base_letters(letters) 533 534 total_count = 0 535 # collect the count info into the dictionary for all the records 536 for record in all_records: 537 try: 538 if record.seq[residue_num] not in to_ignore: 539 weight = record.annotations.get('weight',1) 540 freq_info[record.seq[residue_num]] += weight 541 total_count += weight 542 # getting a key error means we've got a problem with the alphabet 543 except KeyError: 544 raise ValueError("Residue %s not found in alphabet %s" 545 % (record.seq[residue_num], 546 self.alignment._alphabet)) 547 548 if total_count == 0 : 549 # This column must be entirely ignored characters 550 for letter in freq_info.keys(): 551 assert freq_info[letter] == 0 552 #TODO - Map this to NA or NaN? 553 else : 554 # now convert the counts into frequencies 555 for letter in freq_info.keys(): 556 freq_info[letter] = freq_info[letter] / total_count 557 558 return freq_info
559
560 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 561 random_expected):
562 """Calculate the information content for a column. 563 564 Arguments: 565 o obs_freq - The frequencies observed for each letter in the column. 566 o e_freq_table - An optional argument specifying the expected 567 frequencies for each letter. This is a SubsMat.FreqTable instance. 568 o log_base - The base of the logathrim to use in calculating the 569 info content. 570 """ 571 try : 572 gap_char = self.alignment._alphabet.gap_char 573 except AttributeError : 574 #The alphabet doesn't declare a gap - there could be none 575 #in the sequence... or just a vague alphabet. 576 gap_char = "-" #Safe? 577 578 if e_freq_table: 579 if not isinstance(e_freq_table, FreqTable.FreqTable) : 580 raise ValueError("e_freq_table should be a FreqTable object") 581 # check the expected freq information to make sure it is good 582 for key in obs_freq.keys(): 583 if (key != gap_char and key not in e_freq_table): 584 raise ValueError("Expected frequency letters %s" + 585 " do not match observed %s" 586 % (e_freq_table.keys(), obs_freq.keys() - 587 [gap_char])) 588 589 total_info = 0.0 590 591 for letter in obs_freq.keys(): 592 inner_log = 0.0 593 # if we have expected frequencies, modify the log value by them 594 # gap characters do not have expected frequencies, so they 595 # should just be the observed frequency. 596 if letter != gap_char: 597 if e_freq_table: 598 inner_log = obs_freq[letter] / e_freq_table[letter] 599 else: 600 inner_log = obs_freq[letter] / random_expected 601 # if the observed frequency is zero, we don't add any info to the 602 # total information content 603 if inner_log > 0: 604 letter_info = (obs_freq[letter] * 605 math.log(inner_log) / math.log(log_base)) 606 total_info = total_info + letter_info 607 return total_info
608
609 - def get_column(self,col):
610 return self.alignment.get_column(col)
611
612 -class PSSM:
613 """Represent a position specific score matrix. 614 615 This class is meant to make it easy to access the info within a PSSM 616 and also make it easy to print out the information in a nice table. 617 618 Let's say you had an alignment like this: 619 GTATC 620 AT--C 621 CTGTC 622 623 The position specific score matrix (when printed) looks like: 624 625 G A T C 626 G 1 1 0 1 627 T 0 0 3 0 628 A 1 1 0 0 629 T 0 0 2 0 630 C 0 0 0 3 631 632 You can access a single element of the PSSM using the following: 633 634 your_pssm[sequence_number][residue_count_name] 635 636 For instance, to get the 'T' residue for the second element in the 637 above alignment you would need to do: 638 639 your_pssm[1]['T'] 640 """
641 - def __init__(self, pssm):
642 """Initialize with pssm data to represent. 643 644 The pssm passed should be a list with the following structure: 645 646 list[0] - The letter of the residue being represented (for instance, 647 from the example above, the first few list[0]s would be GTAT... 648 list[1] - A dictionary with the letter substitutions and counts. 649 """ 650 self.pssm = pssm
651
652 - def __getitem__(self, pos):
653 return self.pssm[pos][1]
654
655 - def __str__(self):
656 out = " " 657 all_residues = self.pssm[0][1].keys() 658 all_residues.sort() 659 660 # first print out the top header 661 for res in all_residues: 662 out = out + " %s" % res 663 out = out + "\n" 664 665 # for each item, write out the substitutions 666 for item in self.pssm: 667 out = out + "%s " % item[0] 668 for res in all_residues: 669 out = out + " %.1f" % item[1][res] 670 671 out = out + "\n" 672 return out
673
674 - def get_residue(self, pos):
675 """Return the residue letter at the specified position. 676 """ 677 return self.pssm[pos][0]
678 679 692 693 if __name__ == "__main__" : 694 print "Quick test" 695 from Bio import AlignIO 696 from Bio.Align.Generic import Alignment 697 698 filename = "../../Tests/GFF/multi.fna" 699 format = "fasta" 700 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25}, 701 FreqTable.FREQ, 702 IUPAC.unambiguous_dna) 703 704 alignment = AlignIO.read(open(filename), format) 705 for record in alignment : 706 print record.seq.tostring() 707 print "="*alignment.get_alignment_length() 708 709 summary = SummaryInfo(alignment) 710 consensus = summary.dumb_consensus(ambiguous="N") 711 print consensus 712 consensus = summary.gap_consensus(ambiguous="N") 713 print consensus 714 print 715 print summary.pos_specific_score_matrix(chars_to_ignore=['-'], 716 axis_seq=consensus) 717 print 718 #Have a generic alphabet, without a declared gap char, so must tell 719 #provide the frequencies and chars to ignore explicitly. 720 print summary.information_content(e_freq_table=expected, 721 chars_to_ignore=['-']) 722 print 723 print "Trying a protein sequence with gaps and stops" 724 725 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*") 726 a = Alignment(alpha) 727 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-") 728 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*") 729 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*") 730 print a 731 print "="*a.get_alignment_length() 732 733 s = SummaryInfo(a) 734 c = s.dumb_consensus(ambiguous="X") 735 print c 736 c = s.gap_consensus(ambiguous="X") 737 print c 738 print 739 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c) 740 741 print s.information_content(chars_to_ignore=['-', '*']) 742 743 744 print "Done" 745