1
2
3
4
5
6 """This package implements pairwise sequence alignment using a dynamic
7 programming algorithm.
8
9 This provides functions to get global and local alignments between two
10 sequences. A global alignment finds the best concordance between all
11 characters in two sequences. A local alignment finds just the
12 subsequences that align the best.
13
14 When doing alignments, you can specify the match score and gap
15 penalties. The match score indicates the compatibility between an
16 alignment of two characters in the sequences. Highly compatible
17 characters should be given positive scores, and incompatible ones
18 should be given negative scores or 0. The gap penalties should be
19 negative.
20
21 The names of the alignment functions in this module follow the
22 convention
23 <alignment type>XX
24 where <alignment type> is either "global" or "local" and XX is a 2
25 character code indicating the parameters it takes. The first
26 character indicates the parameters for matches (and mismatches), and
27 the second indicates the parameters for gap penalties.
28
29 The match parameters are
30 CODE DESCRIPTION
31 x No parameters. Identical characters have score of 1, otherwise 0.
32 m A match score is the score of identical chars, otherwise mismatch score.
33 d A dictionary returns the score of any pair of characters.
34 c A callback function returns scores.
35
36 The gap penalty parameters are
37 CODE DESCRIPTION
38 x No gap penalties.
39 s Same open and extend gap penalties for both sequences.
40 d The sequences have different open and extend gap penalties.
41 c A callback function returns the gap penalties.
42
43 All the different alignment functions are contained in an object
44 "align". For example:
45
46 >>> from Bio import pairwise2
47 >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG")
48
49 will return a list of the alignments between the two strings. The
50 parameters of the alignment function depends on the function called.
51 Some examples:
52
53 >>> pairwise2.align.globalxx("ACCGT", "ACG")
54 # Find the best global alignment between the two sequences.
55 # Identical characters are given 1 point. No points are deducted
56 # for mismatches or gaps.
57
58 >>> pairwise2.align.localxx("ACCGT", "ACG")
59 # Same thing as before, but with a local alignment.
60
61 >>> pairwise2.align.globalmx("ACCGT", "ACG", 2, -1)
62 # Do a global alignment. Identical characters are given 2 points,
63 # 1 point is deducted for each non-identical character.
64
65 >>> pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1)
66 # Same as above, except now 0.5 points are deducted when opening a
67 # gap, and 0.1 points are deducted when extending it.
68
69
70 To see a description of the parameters for a function, please look at
71 the docstring for the function.
72
73 >>> print newalign.align.localds.__doc__
74 localds(sequenceA, sequenceB, match_dict, open, extend) -> alignments
75
76 """
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98 from types import *
99
100 from Bio import listfns
101
102 MAX_ALIGNMENTS = 1000
103
105 """This class provides functions that do alignments."""
106
108 """This class is callable impersonates an alignment function.
109 The constructor takes the name of the function. This class
110 will decode the name of the function to figure out how to
111 interpret the parameters.
112
113 """
114
115 match2args = {
116 'x' : ([], ''),
117 'm' : (['match', 'mismatch'],
118 """match is the score to given to identical characters. mismatch is
119 the score given to non-identical ones."""),
120 'd' : (['match_dict'],
121 """match_dict is a dictionary where the keys are tuples of pairs of
122 characters and the values are the scores, e.g. ("A", "C") : 2.5."""),
123 'c' : (['match_fn'],
124 """match_fn is a callback function that takes two characters and
125 returns the score between them."""),
126 }
127
128 penalty2args = {
129 'x' : ([], ''),
130 's' : (['open', 'extend'],
131 """open and extend are the gap penalties when a gap is opened and
132 extended. They should be negative."""),
133 'd' : (['openA', 'extendA', 'openB', 'extendB'],
134 """openA and extendA are the gap penalties for sequenceA, and openB
135 and extendB for sequeneB. The penalties should be negative."""),
136 'c' : (['gap_A_fn', 'gap_B_fn'],
137 """gap_A_fn and gap_B_fn are callback functions that takes 1) the
138 index where the gap is opened, and 2) the length of the gap. They
139 should return a gap penalty."""),
140 }
141
143
144
145 if name.startswith("global"):
146 if len(name) != 8:
147 raise AttributeError("function should be globalXX")
148 elif name.startswith("local"):
149 if len(name) != 7:
150 raise AttributeError("function should be localXX")
151 else:
152 raise AttributeError(name)
153 align_type, match_type, penalty_type = \
154 name[:-2], name[-2], name[-1]
155 try:
156 match_args, match_doc = self.match2args[match_type]
157 except KeyError, x:
158 raise AttributeError("unknown match type %r" % match_type)
159 try:
160 penalty_args, penalty_doc = self.penalty2args[penalty_type]
161 except KeyError, x:
162 raise AttributeError("unknown penalty type %r" % penalty_type)
163
164
165 param_names = ['sequenceA', 'sequenceB']
166 param_names.extend(match_args)
167 param_names.extend(penalty_args)
168 self.function_name = name
169 self.align_type = align_type
170 self.param_names = param_names
171
172 self.__name__ = self.function_name
173
174 doc = "%s(%s) -> alignments\n" % (
175 self.__name__, ', '.join(self.param_names))
176 if match_doc:
177 doc += "\n%s\n" % match_doc
178 if penalty_doc:
179 doc += "\n%s\n" % penalty_doc
180 doc += (
181 """\nalignments is a list of tuples (seqA, seqB, score, begin, end).
182 seqA and seqB are strings showing the alignment between the
183 sequences. score is the score of the alignment. begin and end
184 are indexes into seqA and seqB that indicate the where the
185 alignment occurs.
186 """)
187 self.__doc__ = doc
188
189 - def decode(self, *args, **keywds):
190
191
192
193 keywds = keywds.copy()
194 if len(args) != len(self.param_names):
195 raise TypeError("%s takes exactly %d argument (%d given)" \
196 % (self.function_name, len(self.param_names), len(args)))
197 i = 0
198 while i < len(self.param_names):
199 if self.param_names[i] in [
200 'sequenceA', 'sequenceB',
201 'gap_A_fn', 'gap_B_fn', 'match_fn']:
202 keywds[self.param_names[i]] = args[i]
203 i += 1
204 elif self.param_names[i] == 'match':
205 assert self.param_names[i+1] == 'mismatch'
206 match, mismatch = args[i], args[i+1]
207 keywds['match_fn'] = identity_match(match, mismatch)
208 i += 2
209 elif self.param_names[i] == 'match_dict':
210 keywds['match_fn'] = dictionary_match(args[i])
211 i += 1
212 elif self.param_names[i] == 'open':
213 assert self.param_names[i+1] == 'extend'
214 open, extend = args[i], args[i+1]
215 pe = keywds.get('penalize_extend_when_opening', 0)
216 keywds['gap_A_fn'] = affine_penalty(open, extend, pe)
217 keywds['gap_B_fn'] = affine_penalty(open, extend, pe)
218 i += 2
219 elif self.param_names[i] == 'openA':
220 assert self.param_names[i+3] == 'extendB'
221 openA, extendA, openB, extendB = args[i:i+4]
222 pe = keywds.get('penalize_extend_when_opening', 0)
223 keywds['gap_A_fn'] = affine_penalty(openA, extendA, pe)
224 keywds['gap_B_fn'] = affine_penalty(openB, extendB, pe)
225 i += 4
226 else:
227 raise ValueError("unknown parameter %r" \
228 % self.param_names[i])
229
230
231
232 pe = keywds.get('penalize_extend_when_opening', 0)
233 default_params = [
234 ('match_fn', identity_match(1, 0)),
235 ('gap_A_fn', affine_penalty(0, 0, pe)),
236 ('gap_B_fn', affine_penalty(0, 0, pe)),
237 ('penalize_extend_when_opening', 0),
238 ('penalize_end_gaps', self.align_type == 'global'),
239 ('align_globally', self.align_type == 'global'),
240 ('gap_char', '-'),
241 ('force_generic', 0),
242 ('score_only', 0),
243 ('one_alignment_only', 0)
244 ]
245 for name, default in default_params:
246 keywds[name] = keywds.get(name, default)
247 return keywds
248
250 keywds = self.decode(*args, **keywds)
251 return _align(**keywds)
252
254 return self.alignment_function(attr)
255 align = align()
256
257
258 -def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
259 penalize_extend_when_opening, penalize_end_gaps,
260 align_globally, gap_char, force_generic, score_only,
261 one_alignment_only):
262 if not sequenceA or not sequenceB:
263 return []
264
265 if (not force_generic) and \
266 type(gap_A_fn) is InstanceType and \
267 gap_A_fn.__class__ is affine_penalty and \
268 type(gap_B_fn) is InstanceType and \
269 gap_B_fn.__class__ is affine_penalty:
270 open_A, extend_A = gap_A_fn.open, gap_A_fn.extend
271 open_B, extend_B = gap_B_fn.open, gap_B_fn.extend
272 x = _make_score_matrix_fast(
273 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
274 penalize_extend_when_opening, penalize_end_gaps, align_globally,
275 score_only)
276 else:
277 x = _make_score_matrix_generic(
278 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
279 penalize_extend_when_opening, penalize_end_gaps, align_globally,
280 score_only)
281 score_matrix, trace_matrix = x
282
283
284
285
286
287
288 starts = _find_start(
289 score_matrix, sequenceA, sequenceB,
290 gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally)
291
292 best_score = max([x[0] for x in starts])
293
294
295 if score_only:
296 return best_score
297
298 tolerance = 0
299
300
301 i = 0
302 while i < len(starts):
303 score, pos = starts[i]
304 if rint(abs(score-best_score)) > rint(tolerance):
305 del starts[i]
306 else:
307 i += 1
308
309
310 x = _recover_alignments(
311 sequenceA, sequenceB, starts, score_matrix, trace_matrix,
312 align_globally, penalize_end_gaps, gap_char, one_alignment_only)
313 return x
314
315 -def _make_score_matrix_generic(
316 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn,
317 penalize_extend_when_opening, penalize_end_gaps, align_globally,
318 score_only):
319
320
321
322
323
324
325 lenA, lenB = len(sequenceA), len(sequenceB)
326 score_matrix, trace_matrix = [], []
327 for i in range(lenA):
328 score_matrix.append([None] * lenB)
329 trace_matrix.append([[None]] * lenB)
330
331
332
333
334 for i in range(lenA):
335
336
337
338 score = match_fn(sequenceA[i], sequenceB[0])
339 if penalize_end_gaps:
340 score += gap_B_fn(0, i)
341 score_matrix[i][0] = score
342 for i in range(1, lenB):
343 score = match_fn(sequenceA[0], sequenceB[i])
344 if penalize_end_gaps:
345 score += gap_A_fn(0, i)
346 score_matrix[0][i] = score
347
348
349
350
351
352
353
354
355 for row in range(1, lenA):
356 for col in range(1, lenB):
357
358
359 best_score = score_matrix[row-1][col-1]
360 best_score_rint = rint(best_score)
361 best_indexes = [(row-1, col-1)]
362
363
364
365
366
367
368 for i in range(0, col-1):
369 score = score_matrix[row-1][i] + gap_A_fn(i, col-1-i)
370 score_rint = rint(score)
371 if score_rint == best_score_rint:
372 best_score, best_score_rint = score, score_rint
373 best_indexes.append((row-1, i))
374 elif score_rint > best_score_rint:
375 best_score, best_score_rint = score, score_rint
376 best_indexes = [(row-1, i)]
377
378
379 for i in range(0, row-1):
380 score = score_matrix[i][col-1] + gap_B_fn(i, row-1-i)
381 score_rint = rint(score)
382 if score_rint == best_score_rint:
383 best_score, best_score_rint = score, score_rint
384 best_indexes.append((i, col-1))
385 elif score_rint > best_score_rint:
386 best_score, best_score_rint = score, score_rint
387 best_indexes = [(i, col-1)]
388
389 score_matrix[row][col] = best_score + \
390 match_fn(sequenceA[row], sequenceB[col])
391 if not align_globally and score_matrix[row][col] < 0:
392 score_matrix[row][col] = 0
393 trace_matrix[row][col] = best_indexes
394 return score_matrix, trace_matrix
395
396 -def _make_score_matrix_fast(
397 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B,
398 penalize_extend_when_opening, penalize_end_gaps,
399 align_globally, score_only):
400 first_A_gap = calc_affine_penalty(1, open_A, extend_A,
401 penalize_extend_when_opening)
402 first_B_gap = calc_affine_penalty(1, open_B, extend_B,
403 penalize_extend_when_opening)
404
405
406
407
408 lenA, lenB = len(sequenceA), len(sequenceB)
409 score_matrix, trace_matrix = [], []
410 for i in range(lenA):
411 score_matrix.append([None] * lenB)
412 trace_matrix.append([[None]] * lenB)
413
414
415
416
417 for i in range(lenA):
418
419
420
421 score = match_fn(sequenceA[i], sequenceB[0])
422 if penalize_end_gaps:
423 score += calc_affine_penalty(
424 i, open_B, extend_B, penalize_extend_when_opening)
425 score_matrix[i][0] = score
426 for i in range(1, lenB):
427 score = match_fn(sequenceA[0], sequenceB[i])
428 if penalize_end_gaps:
429 score += calc_affine_penalty(
430 i, open_A, extend_A, penalize_extend_when_opening)
431 score_matrix[0][i] = score
432
433
434
435
436
437
438
439
440
441
442
443
444
445 row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1)
446
447 col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1)
448
449 for i in range(lenA-1):
450
451
452 row_cache_score[i] = score_matrix[i][0] + first_A_gap
453 row_cache_index[i] = [(i, 0)]
454 for i in range(lenB-1):
455 col_cache_score[i] = score_matrix[0][i] + first_B_gap
456 col_cache_index[i] = [(0, i)]
457
458
459 for row in range(1, lenA):
460 for col in range(1, lenB):
461
462
463 nogap_score = score_matrix[row-1][col-1]
464
465
466
467 if col > 1:
468 row_score = row_cache_score[row-1]
469 else:
470 row_score = nogap_score - 1
471
472
473 if row > 1:
474 col_score = col_cache_score[col-1]
475 else:
476 col_score = nogap_score - 1
477
478 best_score = max(nogap_score, row_score, col_score)
479 best_score_rint = rint(best_score)
480 best_index = []
481 if best_score_rint == rint(nogap_score):
482 best_index.append((row-1, col-1))
483 if best_score_rint == rint(row_score):
484 best_index.extend(row_cache_index[row-1])
485 if best_score_rint == rint(col_score):
486 best_index.extend(col_cache_index[col-1])
487
488
489 score = best_score + match_fn(sequenceA[row], sequenceB[col])
490 if not align_globally and score < 0:
491 score_matrix[row][col] = 0
492 else:
493 score_matrix[row][col] = score
494 trace_matrix[row][col] = best_index
495
496
497
498
499
500
501 open_score = score_matrix[row-1][col-1] + first_B_gap
502 extend_score = col_cache_score[col-1] + extend_B
503 open_score_rint, extend_score_rint = \
504 rint(open_score), rint(extend_score)
505 if open_score_rint > extend_score_rint:
506 col_cache_score[col-1] = open_score
507 col_cache_index[col-1] = [(row-1, col-1)]
508 elif extend_score_rint > open_score_rint:
509 col_cache_score[col-1] = extend_score
510 else:
511 col_cache_score[col-1] = open_score
512 if (row-1, col-1) not in col_cache_index[col-1]:
513 col_cache_index[col-1] = col_cache_index[col-1] + \
514 [(row-1, col-1)]
515
516
517 open_score = score_matrix[row-1][col-1] + first_A_gap
518 extend_score = row_cache_score[row-1] + extend_A
519 open_score_rint, extend_score_rint = \
520 rint(open_score), rint(extend_score)
521 if open_score_rint > extend_score_rint:
522 row_cache_score[row-1] = open_score
523 row_cache_index[row-1] = [(row-1, col-1)]
524 elif extend_score_rint > open_score_rint:
525 row_cache_score[row-1] = extend_score
526 else:
527 row_cache_score[row-1] = open_score
528 if (row-1, col-1) not in row_cache_index[row-1]:
529 row_cache_index[row-1] = row_cache_index[row-1] + \
530 [(row-1, col-1)]
531
532 return score_matrix, trace_matrix
533
534 -def _recover_alignments(sequenceA, sequenceB, starts,
535 score_matrix, trace_matrix, align_globally,
536 penalize_end_gaps, gap_char, one_alignment_only):
537
538
539
540 lenA, lenB = len(sequenceA), len(sequenceB)
541 tracebacks = []
542 in_process = []
543
544
545
546
547
548
549
550
551
552
553
554 for score, (row, col) in starts:
555 if align_globally:
556 begin, end = None, None
557 else:
558 begin, end = None, -max(lenA-row, lenB-col)+1
559 if not end:
560 end = None
561
562
563
564 in_process.append(
565 (sequenceA[0:0], sequenceB[0:0], score, begin, end,
566 (lenA, lenB), (row, col)))
567 if one_alignment_only:
568 break
569 while in_process and len(tracebacks) < MAX_ALIGNMENTS:
570 seqA, seqB, score, begin, end, prev_pos, next_pos = in_process.pop()
571 prevA, prevB = prev_pos
572 if next_pos is None:
573 prevlen = len(seqA)
574
575 seqA = sequenceA[:prevA] + seqA
576 seqB = sequenceB[:prevB] + seqB
577
578 seqA, seqB = _lpad_until_equal(seqA, seqB, gap_char)
579
580
581 if begin is None:
582 if align_globally:
583 begin = 0
584 else:
585 begin = len(seqA) - prevlen
586 tracebacks.append((seqA, seqB, score, begin, end))
587 else:
588 nextA, nextB = next_pos
589 nseqA, nseqB = prevA-nextA, prevB-nextB
590 maxseq = max(nseqA, nseqB)
591 ngapA, ngapB = maxseq-nseqA, maxseq-nseqB
592 seqA = sequenceA[nextA:nextA+nseqA] + gap_char*ngapA + seqA
593 seqB = sequenceB[nextB:nextB+nseqB] + gap_char*ngapB + seqB
594 prev_pos = next_pos
595
596 if not align_globally and score_matrix[nextA][nextB] <= 0:
597 begin = max(prevA, prevB)
598 in_process.append(
599 (seqA, seqB, score, begin, end, prev_pos, None))
600 else:
601 for next_pos in trace_matrix[nextA][nextB]:
602 in_process.append(
603 (seqA, seqB, score, begin, end, prev_pos, next_pos))
604 if one_alignment_only:
605 break
606
607 return _clean_alignments(tracebacks)
608
609 -def _find_start(score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn,
610 penalize_end_gaps, align_globally):
611
612
613 if align_globally:
614 if penalize_end_gaps:
615 starts = _find_global_start(
616 sequenceA, sequenceB, score_matrix, gap_A_fn, gap_B_fn, 1)
617 else:
618 starts = _find_global_start(
619 sequenceA, sequenceB, score_matrix, None, None, 0)
620 else:
621 starts = _find_local_start(score_matrix)
622 return starts
623
624 -def _find_global_start(sequenceA, sequenceB,
625 score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps):
644
646
647 positions = []
648 nrows, ncols = len(score_matrix), len(score_matrix[0])
649 for row in range(nrows):
650 for col in range(ncols):
651 score = score_matrix[row][col]
652 positions.append((score, (row, col)))
653 return positions
654
656
657
658
659 alignments = listfns.items(alignments)
660 i = 0
661 while i < len(alignments):
662 seqA, seqB, score, begin, end = alignments[i]
663
664 if end is None:
665 end = len(seqA)
666 elif end < 0:
667 end = end + len(seqA)
668
669 if begin >= end:
670 del alignments[i]
671 continue
672 alignments[i] = seqA, seqB, score, begin, end
673 i += 1
674 return alignments
675
677
678 ls1, ls2 = len(s1), len(s2)
679 if ls1 < ls2:
680 s1 = _pad(s1, char, ls2-ls1)
681 elif ls2 < ls1:
682 s2 = _pad(s2, char, ls1-ls2)
683 return s1, s2
684
686
687
688 ls1, ls2 = len(s1), len(s2)
689 if ls1 < ls2:
690 s1 = _lpad(s1, char, ls2-ls1)
691 elif ls2 < ls1:
692 s2 = _lpad(s2, char, ls1-ls2)
693 return s1, s2
694
695 -def _pad(s, char, n):
696
697 return s + char*n
698
700
701 return char*n + s
702
703 _PRECISION = 1000
705 return int(x * precision + 0.5)
706
708 """identity_match([match][, mismatch]) -> match_fn
709
710 Create a match function for use in an alignment. match and
711 mismatch are the scores to give when two residues are equal or
712 unequal. By default, match is 1 and mismatch is 0.
713
714 """
715 - def __init__(self, match=1, mismatch=0):
716 self.match = match
717 self.mismatch = mismatch
719 if charA == charB:
720 return self.match
721 return self.mismatch
722
724 """dictionary_match(score_dict[, symmetric]) -> match_fn
725
726 Create a match function for use in an alignment. score_dict is a
727 dictionary where the keys are tuples (residue 1, residue 2) and
728 the values are the match scores between those residues. symmetric
729 is a flag that indicates whether the scores are symmetric. If
730 true, then if (res 1, res 2) doesn't exist, I will use the score
731 at (res 2, res 1).
732
733 """
734 - def __init__(self, score_dict, symmetric=1):
735 self.score_dict = score_dict
736 self.symmetric = symmetric
738 if self.symmetric and (charA, charB) not in self.score_dict:
739
740
741 charB, charA = charA, charB
742 return self.score_dict[(charA, charB)]
743
745 """affine_penalty(open, extend[, penalize_extend_when_opening]) -> gap_fn
746
747 Create a gap function for use in an alignment.
748
749 """
750 - def __init__(self, open, extend, penalize_extend_when_opening=0):
751 if open > 0 or extend > 0:
752 raise ValueError("Gap penalties should be non-positive.")
753 self.open, self.extend = open, extend
754 self.penalize_extend_when_opening = penalize_extend_when_opening
758
760 if length <= 0:
761 return 0
762 penalty = open + extend * length
763 if not penalize_extend_when_opening:
764 penalty -= extend
765 return penalty
766
768 """print_matrix(matrix)
769
770 Print out a matrix. For debugging purposes.
771
772 """
773
774 matrixT = [[] for x in range(len(matrix[0]))]
775 for i in range(len(matrix)):
776 for j in range(len(matrix[i])):
777 matrixT[j].append(len(str(matrix[i][j])))
778 ndigits = map(max, matrixT)
779 for i in range(len(matrix)):
780 for j in range(len(matrix[i])):
781 n = ndigits[j]
782 print "%*s " % (n, matrix[i][j]),
783 print
784
797
798
799
800
801 try:
802 import cpairwise2
803 except ImportError:
804 pass
805 else:
806 import sys
807 this_module = sys.modules[__name__]
808 for name in cpairwise2.__dict__.keys():
809 if not name.startswith("__"):
810 this_module.__dict__[name] = cpairwise2.__dict__[name]
811