1 """Extract information from alignment objects.
2
3 In order to try and avoid huge alignment objects with tons of functions,
4 functions which return summary type information about alignments should
5 be put into classes in this module.
6
7 classes:
8 o SummaryInfo
9 o PSSM
10 """
11
12
13 import math
14 import sys
15
16
17 from Bio import Alphabet
18 from Bio.Alphabet import IUPAC
19 from Bio.Seq import Seq
20 from Bio.SubsMat import FreqTable
21
22
23
24 Protein20Random = 0.05
25 Nucleotide4Random = 0.25
27 """Calculate summary info about the alignment.
28
29 This class should be used to caclculate information summarizing the
30 results of an alignment. This may either be straight consensus info
31 or more complicated things.
32 """
34 """Initialize with the alignment to calculate information on.
35 ic_vector attribute. A dictionary. Keys: column numbers. Values:
36 """
37 self.alignment = alignment
38 self.ic_vector = {}
39
40 - def dumb_consensus(self, threshold = .7, ambiguous = "X",
41 consensus_alpha = None, require_multiple = 0):
42 """Output a fast consensus sequence of the alignment.
43
44 This doesn't do anything fancy at all. It will just go through the
45 sequence residue by residue and count up the number of each type
46 of residue (ie. A or G or T or C for DNA) in all sequences in the
47 alignment. If the percentage of the most common residue type is
48 greater then the passed threshold, then we will add that residue type,
49 otherwise an ambiguous character will be added.
50
51 This could be made a lot fancier (ie. to take a substitution matrix
52 into account), but it just meant for a quick and dirty consensus.
53
54 Arguments:
55 o threshold - The threshold value that is required to add a particular
56 atom.
57 o ambiguous - The ambiguous character to be added when the threshold is
58 not reached.
59 o consensus_alpha - The alphabet to return for the consensus sequence.
60 If this is None, then we will try to guess the alphabet.
61 o require_multiple - If set as 1, this will require that more than
62 1 sequence be part of an alignment to put it in the consensus (ie.
63 not just 1 sequence and gaps).
64 """
65
66 consensus = ''
67
68
69 con_len = self.alignment.get_alignment_length()
70
71
72 for n in range(con_len):
73
74 atom_dict = {}
75 num_atoms = 0
76
77 for record in self.alignment._records:
78
79
80 if n < len(record.seq):
81 if record.seq[n] != '-' and record.seq[n] != '.':
82 if record.seq[n] not in atom_dict.keys():
83 atom_dict[record.seq[n]] = 1
84 else:
85 atom_dict[record.seq[n]] = \
86 atom_dict[record.seq[n]] + 1
87
88 num_atoms = num_atoms + 1
89
90 max_atoms = []
91 max_size = 0
92
93 for atom in atom_dict.keys():
94 if atom_dict[atom] > max_size:
95 max_atoms = [atom]
96 max_size = atom_dict[atom]
97 elif atom_dict[atom] == max_size:
98 max_atoms.append(atom)
99
100 if require_multiple and num_atoms == 1:
101 consensus = consensus + ambiguous
102 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
103 >= threshold):
104 consensus = consensus + max_atoms[0]
105 else:
106 consensus = consensus + ambiguous
107
108
109 if consensus_alpha is None:
110 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
111
112 return Seq(consensus, consensus_alpha)
113
114 - def gap_consensus(self, threshold = .7, ambiguous = "X",
115 consensus_alpha = None, require_multiple = 0):
116 """Same as dumb_consensus(), but allows gap on the output.
117
118 Things to do: Let the user define that with only one gap, the result
119 character in consensus is gap. Let the user select gap character, now
120 it takes the same is input.
121 """
122
123 consensus = ''
124
125
126 con_len = self.alignment.get_alignment_length()
127
128
129 for n in range(con_len):
130
131 atom_dict = {}
132 num_atoms = 0
133
134 for record in self.alignment._records:
135
136
137 if n < len(record.seq):
138 if record.seq[n] not in atom_dict.keys():
139 atom_dict[record.seq[n]] = 1
140 else:
141 atom_dict[record.seq[n]] = \
142 atom_dict[record.seq[n]] + 1
143
144 num_atoms = num_atoms + 1
145
146 max_atoms = []
147 max_size = 0
148
149 for atom in atom_dict.keys():
150 if atom_dict[atom] > max_size:
151 max_atoms = [atom]
152 max_size = atom_dict[atom]
153 elif atom_dict[atom] == max_size:
154 max_atoms.append(atom)
155
156 if require_multiple and num_atoms == 1:
157 consensus = consensus + ambiguous
158 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms))
159 >= threshold):
160 consensus = consensus + max_atoms[0]
161 else:
162 consensus = consensus + ambiguous
163
164
165 if consensus_alpha is None:
166
167 consensus_alpha = self._guess_consensus_alphabet(ambiguous)
168
169 return Seq(consensus, consensus_alpha)
170
212
214 """Generate a replacement dictionary to plug into a substitution matrix
215
216 This should look at an alignment, and be able to generate the number
217 of substitutions of different residues for each other in the
218 aligned object.
219
220 Will then return a dictionary with this information:
221 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....}
222
223 This also treats weighted sequences. The following example shows how
224 we calculate the replacement dictionary. Given the following
225 multiple sequence alignments:
226
227 GTATC 0.5
228 AT--C 0.8
229 CTGTC 1.0
230
231 For the first column we have:
232 ('A', 'G') : 0.5 * 0.8 = 0.4
233 ('C', 'G') : 0.5 * 1.0 = 0.5
234 ('A', 'C') : 0.8 * 1.0 = 0.8
235
236 We then continue this for all of the columns in the alignment, summing
237 the information for each substitution in each column, until we end
238 up with the replacement dictionary.
239
240 Arguments:
241 o skip_chars - A list of characters to skip when creating the dictionary.
242 For instance, you might have Xs (screened stuff) or Ns, and not want
243 to include the ambiguity characters in the dictionary.
244 """
245
246 rep_dict, skip_items = self._get_base_replacements(skip_chars)
247
248
249 for rec_num1 in range(len(self.alignment._records)):
250
251
252 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)):
253
254
255 rep_dict = self._pair_replacement(
256 self.alignment._records[rec_num1].seq,
257 self.alignment._records[rec_num2].seq,
258 self.alignment._records[rec_num1].annotations.get('weight',1),
259 self.alignment._records[rec_num2].annotations.get('weight',1),
260 rep_dict, skip_items)
261
262 return rep_dict
263
264 - def _pair_replacement(self, seq1, seq2, weight1, weight2,
265 start_dict, ignore_chars):
266 """Compare two sequences and generate info on the replacements seen.
267
268 Arguments:
269 o seq1, seq2 - The two sequences to compare.
270 o weight1, weight2 - The relative weights of seq1 and seq2.
271 o start_dict - The dictionary containing the starting replacement
272 info that we will modify.
273 o ignore_chars - A list of characters to ignore when calculating
274 replacements (ie. '-').
275
276 Returns:
277 o A replacment dictionary which is modified from initial_dict with
278 the information from the sequence comparison.
279 """
280
281 for residue_num in range(len(seq1)):
282 residue1 = seq1[residue_num]
283 try:
284 residue2 = seq2[residue_num]
285
286
287 except IndexError:
288 return start_dict
289
290
291 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars):
292 try:
293
294
295 start_dict[(residue1, residue2)] = \
296 start_dict[(residue1, residue2)] + \
297 weight1 * weight2
298
299
300 except KeyError:
301 raise ValueError("Residues %s, %s not found in alphabet %s"
302 % (residue1, residue2,
303 self.alignment._alphabet))
304
305 return start_dict
306
307
309 """Returns a string containing the expected letters in the alignment."""
310 all_letters = self.alignment._alphabet.letters
311 if all_letters is None \
312 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \
313 and all_letters == self.alignment._alphabet.gap_char):
314
315
316
317 set_letters = set()
318 for record in self.alignment :
319
320
321 set_letters = set_letters.union(record.seq)
322 list_letters = list(set_letters)
323 list_letters.sort()
324 all_letters = "".join(list_letters)
325 return all_letters
326
328 """Get a zeroed dictonary of all possible letter combinations.
329
330 This looks at the type of alphabet and gets the letters for it.
331 It then creates a dictionary with all possible combinations of these
332 letters as keys (ie. ('A', 'G')) and sets the values as zero.
333
334 Returns:
335 o The base dictionary created
336 o A list of alphabet items to skip when filling the dictionary.Right
337 now the only thing I can imagine in this list is gap characters, but
338 maybe X's or something else might be useful later. This will also
339 include any characters that are specified to be skipped.
340 """
341 base_dictionary = {}
342 all_letters = self._get_all_letters()
343
344
345
346 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
347 skip_items.append(self.alignment._alphabet.gap_char)
348 all_letters = all_letters.replace(self.alignment._alphabet.gap_char,'')
349
350
351 for first_letter in all_letters:
352 for second_letter in all_letters:
353 if (first_letter not in skip_items and
354 second_letter not in skip_items):
355 base_dictionary[(first_letter, second_letter)] = 0
356
357 return base_dictionary, skip_items
358
359
362 """Create a position specific score matrix object for the alignment.
363
364 This creates a position specific score matrix (pssm) which is an
365 alternative method to look at a consensus sequence.
366
367 Arguments:
368 o chars_to_ignore - A listing of all characters not to include in
369 the pssm. If the alignment alphabet declares a gap character,
370 then it will be excluded automatically.
371 o axis_seq - An optional argument specifying the sequence to
372 put on the axis of the PSSM. This should be a Seq object. If nothing
373 is specified, the consensus sequence, calculated with default
374 parameters, will be used.
375
376 Returns:
377 o A PSSM (position specific score matrix) object.
378 """
379
380 all_letters = self._get_all_letters()
381 assert all_letters
382
383 if not isinstance(chars_to_ignore, list) :
384 raise TypeError("chars_to_ignore should be a list.")
385
386
387 if isinstance(self.alignment._alphabet, Alphabet.Gapped):
388 chars_to_ignore.append(self.alignment._alphabet.gap_char)
389
390 for char in chars_to_ignore:
391 all_letters = all_letters.replace(char, '')
392
393 if axis_seq:
394 left_seq = axis_seq
395 assert len(axis_seq) == self.alignment.get_alignment_length()
396 else:
397 left_seq = self.dumb_consensus()
398
399 pssm_info = []
400
401 for residue_num in range(len(left_seq)):
402 score_dict = self._get_base_letters(all_letters)
403 for record in self.alignment._records:
404 try:
405 this_residue = record.seq[residue_num]
406
407
408 except IndexError:
409 this_residue = None
410
411 if this_residue and this_residue not in chars_to_ignore:
412 weight = record.annotations.get('weight', 1)
413 try:
414 score_dict[this_residue] += weight
415
416 except KeyError:
417 raise ValueError("Residue %s not found in alphabet %s"
418 % (this_residue,
419 self.alignment._alphabet))
420
421 pssm_info.append((left_seq[residue_num],
422 score_dict))
423
424
425 return PSSM(pssm_info)
426
428 """Create a zeroed dictionary with all of the specified letters.
429 """
430 base_info = {}
431 for letter in letters:
432 base_info[letter] = 0
433
434 return base_info
435
436 - def information_content(self, start = 0,
437 end = None,
438 e_freq_table = None, log_base = 2,
439 chars_to_ignore = []):
440 """Calculate the information content for each residue along an alignment.
441
442 Arguments:
443 o start, end - The starting an ending points to calculate the
444 information content. These points should be relative to the first
445 sequence in the alignment, starting at zero (ie. even if the 'real'
446 first position in the seq is 203 in the initial sequence, for
447 the info content, we need to use zero). This defaults to the entire
448 length of the first sequence.
449 o e_freq_table - A FreqTable object specifying the expected frequencies
450 for each letter in the alphabet we are using (e.g. {'G' : 0.4,
451 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
452 included, since these should not have expected frequencies.
453 o log_base - The base of the logathrim to use in calculating the
454 information content. This defaults to 2 so the info is in bits.
455 o chars_to_ignore - A listing of characterw which should be ignored
456 in calculating the info content.
457
458 Returns:
459 o A number representing the info content for the specified region.
460
461 Please see the Biopython manual for more information on how information
462 content is calculated.
463 """
464
465 if end is None:
466 end = len(self.alignment._records[0].seq)
467
468 if start < 0 or end > len(self.alignment._records[0].seq):
469 raise ValueError \
470 ("Start (%s) and end (%s) are not in the range %s to %s"
471 % (start, end, 0, len(self.alignment._records[0].seq)))
472
473 random_expected = None
474 if not e_freq_table:
475
476 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet)
477 if isinstance(base_alpha, Alphabet.ProteinAlphabet) :
478 random_expected = Protein20Random
479 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet) :
480 random_expected = Nucleotide4Random
481 else :
482 errstr = "Error in alphabet: not Nucleotide or Protein, "
483 errstr += "supply expected frequencies"
484 raise ValueError(errstr)
485 del base_alpha
486 elif not isinstance(e_freq_table, FreqTable.FreqTable) :
487 raise ValueError("e_freq_table should be a FreqTable object")
488
489
490
491 all_letters = self._get_all_letters()
492 for char in chars_to_ignore:
493 all_letters = all_letters.replace(char, '')
494
495 info_content = {}
496 for residue_num in range(start, end):
497 freq_dict = self._get_letter_freqs(residue_num,
498 self.alignment._records,
499 all_letters, chars_to_ignore)
500
501 column_score = self._get_column_info_content(freq_dict,
502 e_freq_table,
503 log_base,
504 random_expected)
505
506 info_content[residue_num] = column_score
507
508 total_info = 0
509 for column_info in info_content.values():
510 total_info = total_info + column_info
511
512 for i in info_content.keys():
513 self.ic_vector[i] = info_content[i]
514 return total_info
515
517 """Determine the frequency of specific letters in the alignment.
518
519 Arguments:
520 o residue_num - The number of the column we are getting frequencies
521 from.
522 o all_records - All of the SeqRecords in the alignment.
523 o letters - The letters we are interested in getting the frequency
524 for.
525 o to_ignore - Letters we are specifically supposed to ignore.
526
527 This will calculate the frequencies of each of the specified letters
528 in the alignment at the given frequency, and return this as a
529 dictionary where the keys are the letters and the values are the
530 frequencies.
531 """
532 freq_info = self._get_base_letters(letters)
533
534 total_count = 0
535
536 for record in all_records:
537 try:
538 if record.seq[residue_num] not in to_ignore:
539 weight = record.annotations.get('weight',1)
540 freq_info[record.seq[residue_num]] += weight
541 total_count += weight
542
543 except KeyError:
544 raise ValueError("Residue %s not found in alphabet %s"
545 % (record.seq[residue_num],
546 self.alignment._alphabet))
547
548 if total_count == 0 :
549
550 for letter in freq_info.keys():
551 assert freq_info[letter] == 0
552
553 else :
554
555 for letter in freq_info.keys():
556 freq_info[letter] = freq_info[letter] / total_count
557
558 return freq_info
559
560 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base,
561 random_expected):
562 """Calculate the information content for a column.
563
564 Arguments:
565 o obs_freq - The frequencies observed for each letter in the column.
566 o e_freq_table - An optional argument specifying the expected
567 frequencies for each letter. This is a SubsMat.FreqTable instance.
568 o log_base - The base of the logathrim to use in calculating the
569 info content.
570 """
571 try :
572 gap_char = self.alignment._alphabet.gap_char
573 except AttributeError :
574
575
576 gap_char = "-"
577
578 if e_freq_table:
579 if not isinstance(e_freq_table, FreqTable.FreqTable) :
580 raise ValueError("e_freq_table should be a FreqTable object")
581
582 for key in obs_freq.keys():
583 if (key != gap_char and key not in e_freq_table):
584 raise ValueError("Expected frequency letters %s" +
585 " do not match observed %s"
586 % (e_freq_table.keys(), obs_freq.keys() -
587 [gap_char]))
588
589 total_info = 0.0
590
591 for letter in obs_freq.keys():
592 inner_log = 0.0
593
594
595
596 if letter != gap_char:
597 if e_freq_table:
598 inner_log = obs_freq[letter] / e_freq_table[letter]
599 else:
600 inner_log = obs_freq[letter] / random_expected
601
602
603 if inner_log > 0:
604 letter_info = (obs_freq[letter] *
605 math.log(inner_log) / math.log(log_base))
606 total_info = total_info + letter_info
607 return total_info
608
611
613 """Represent a position specific score matrix.
614
615 This class is meant to make it easy to access the info within a PSSM
616 and also make it easy to print out the information in a nice table.
617
618 Let's say you had an alignment like this:
619 GTATC
620 AT--C
621 CTGTC
622
623 The position specific score matrix (when printed) looks like:
624
625 G A T C
626 G 1 1 0 1
627 T 0 0 3 0
628 A 1 1 0 0
629 T 0 0 2 0
630 C 0 0 0 3
631
632 You can access a single element of the PSSM using the following:
633
634 your_pssm[sequence_number][residue_count_name]
635
636 For instance, to get the 'T' residue for the second element in the
637 above alignment you would need to do:
638
639 your_pssm[1]['T']
640 """
642 """Initialize with pssm data to represent.
643
644 The pssm passed should be a list with the following structure:
645
646 list[0] - The letter of the residue being represented (for instance,
647 from the example above, the first few list[0]s would be GTAT...
648 list[1] - A dictionary with the letter substitutions and counts.
649 """
650 self.pssm = pssm
651
653 return self.pssm[pos][1]
654
656 out = " "
657 all_residues = self.pssm[0][1].keys()
658 all_residues.sort()
659
660
661 for res in all_residues:
662 out = out + " %s" % res
663 out = out + "\n"
664
665
666 for item in self.pssm:
667 out = out + "%s " % item[0]
668 for res in all_residues:
669 out = out + " %.1f" % item[1][res]
670
671 out = out + "\n"
672 return out
673
675 """Return the residue letter at the specified position.
676 """
677 return self.pssm[pos][0]
678
679
680 -def print_info_content(summary_info,fout=None,rep_record=0):
681 """ Three column output: position, aa in representative sequence,
682 ic_vector value"""
683 fout = fout or sys.stdout
684 if not summary_info.ic_vector:
685 summary_info.information_content()
686 rep_sequence = summary_info.alignment._records[rep_record].seq
687 positions = summary_info.ic_vector.keys()
688 positions.sort()
689 for pos in positions:
690 fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
691 summary_info.ic_vector[pos]))
692
693 if __name__ == "__main__" :
694 print "Quick test"
695 from Bio import AlignIO
696 from Bio.Align.Generic import Alignment
697
698 filename = "../../Tests/GFF/multi.fna"
699 format = "fasta"
700 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25},
701 FreqTable.FREQ,
702 IUPAC.unambiguous_dna)
703
704 alignment = AlignIO.read(open(filename), format)
705 for record in alignment :
706 print record.seq.tostring()
707 print "="*alignment.get_alignment_length()
708
709 summary = SummaryInfo(alignment)
710 consensus = summary.dumb_consensus(ambiguous="N")
711 print consensus
712 consensus = summary.gap_consensus(ambiguous="N")
713 print consensus
714 print
715 print summary.pos_specific_score_matrix(chars_to_ignore=['-'],
716 axis_seq=consensus)
717 print
718
719
720 print summary.information_content(e_freq_table=expected,
721 chars_to_ignore=['-'])
722 print
723 print "Trying a protein sequence with gaps and stops"
724
725 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*")
726 a = Alignment(alpha)
727 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-")
728 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*")
729 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*")
730 print a
731 print "="*a.get_alignment_length()
732
733 s = SummaryInfo(a)
734 c = s.dumb_consensus(ambiguous="X")
735 print c
736 c = s.gap_consensus(ambiguous="X")
737 print c
738 print
739 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c)
740
741 print s.information_content(chars_to_ignore=['-', '*'])
742
743
744 print "Done"
745