1
2
3
4
5
6
7
8 """
9 This module provides code to work with the standalone version of
10 BLAST, either blastall or blastpgp, provided by the NCBI.
11 http://www.ncbi.nlm.nih.gov/BLAST/
12
13 Classes:
14 LowQualityBlastError Except that indicates low quality query sequences.
15 BlastParser Parses output from blast.
16 BlastErrorParser Parses output and tries to diagnose possible errors.
17 PSIBlastParser Parses output from psi-blast.
18 Iterator Iterates over a file of blast results.
19
20 _Scanner Scans output from standalone BLAST.
21 _BlastConsumer Consumes output from blast.
22 _PSIBlastConsumer Consumes output from psi-blast.
23 _HeaderConsumer Consumes header information.
24 _DescriptionConsumer Consumes description information.
25 _AlignmentConsumer Consumes alignment information.
26 _HSPConsumer Consumes hsp information.
27 _DatabaseReportConsumer Consumes database report information.
28 _ParametersConsumer Consumes parameters information.
29
30 Functions:
31 blastall Execute blastall.
32 blastpgp Execute blastpgp.
33 rpsblast Execute rpsblast.
34
35 """
36
37 import os
38 import re
39
40 from Bio import File
41 from Bio.ParserSupport import *
42 from Bio.Blast import Record
43 from Bio.Application import _escape_filename
44
46 """Error caused by running a low quality sequence through BLAST.
47
48 When low quality sequences (like GenBank entries containing only
49 stretches of a single nucleotide) are BLASTed, they will result in
50 BLAST generating an error and not being able to perform the BLAST.
51 search. This error should be raised for the BLAST reports produced
52 in this case.
53 """
54 pass
55
57 """Error caused by running a short query sequence through BLAST.
58
59 If the query sequence is too short, BLAST outputs warnings and errors:
60 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed.
61 [blastall] ERROR: [000.000] AT1G08320: Blast:
62 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize
63 done
64
65 This exception is raised when that condition is detected.
66
67 """
68 pass
69
70
72 """Scan BLAST output from blastall or blastpgp.
73
74 Tested with blastall and blastpgp v2.0.10, v2.0.11
75
76 Methods:
77 feed Feed data into the scanner.
78
79 """
80 - def feed(self, handle, consumer):
100
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145 consumer.start_header()
146
147 read_and_call(uhandle, consumer.version, contains='BLAST')
148 read_and_call_while(uhandle, consumer.noevent, blank=1)
149
150
151 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
152
153
154 while attempt_read_and_call(uhandle,
155 consumer.reference, start='Reference') :
156
157
158 while 1:
159 line = uhandle.readline()
160 if is_blank_line(line) :
161 consumer.noevent(line)
162 break
163 elif line.startswith("RID"):
164 break
165 else :
166
167 consumer.reference(line)
168
169
170 read_and_call_while(uhandle, consumer.noevent, blank=1)
171 attempt_read_and_call(uhandle, consumer.reference, start="RID:")
172 read_and_call_while(uhandle, consumer.noevent, blank=1)
173
174
175
176 if attempt_read_and_call(
177 uhandle, consumer.reference, start="Reference"):
178 read_and_call_until(uhandle, consumer.reference, blank=1)
179 read_and_call_while(uhandle, consumer.noevent, blank=1)
180
181
182 if attempt_read_and_call(
183 uhandle, consumer.reference, start="Reference"):
184 read_and_call_until(uhandle, consumer.reference, blank=1)
185 read_and_call_while(uhandle, consumer.noevent, blank=1)
186
187 line = uhandle.peekline()
188 assert line.strip() != ""
189 assert not line.startswith("RID:")
190 if line.startswith("Query=") :
191
192
193
194 read_and_call(uhandle, consumer.query_info, start='Query=')
195 read_and_call_until(uhandle, consumer.query_info, blank=1)
196 read_and_call_while(uhandle, consumer.noevent, blank=1)
197
198
199 read_and_call_until(uhandle, consumer.database_info, end='total letters')
200 read_and_call(uhandle, consumer.database_info, contains='sequences')
201 read_and_call_while(uhandle, consumer.noevent, blank=1)
202 elif line.startswith("Database:") :
203
204 read_and_call_until(uhandle, consumer.database_info, end='total letters')
205 read_and_call(uhandle, consumer.database_info, contains='sequences')
206 read_and_call_while(uhandle, consumer.noevent, blank=1)
207
208
209 read_and_call(uhandle, consumer.query_info, start='Query=')
210 read_and_call_until(uhandle, consumer.query_info, blank=1)
211 read_and_call_while(uhandle, consumer.noevent, blank=1)
212 else :
213 raise ValueError("Invalid header?")
214
215 consumer.end_header()
216
235
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259 consumer.start_descriptions()
260
261
262
263 attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
264
265
266
267 if not uhandle.peekline():
268 raise ValueError("Unexpected end of blast report. " + \
269 "Looks suspiciously like a PSI-BLAST crash.")
270
271
272
273
274
275
276
277
278
279 line = uhandle.peekline()
280 if line.find("ERROR:") != -1 or line.startswith("done"):
281 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
282 read_and_call(uhandle, consumer.noevent, start="done")
283
284
285
286
287
288
289
290
291
292
293
294
295
296 read_and_call_while(uhandle, consumer.noevent, blank=1)
297
298 if attempt_read_and_call(uhandle, consumer.round, start='Results'):
299 read_and_call_while(uhandle, consumer.noevent, blank=1)
300
301
302
303
304
305
306
307
308 if not attempt_read_and_call(
309 uhandle, consumer.description_header,
310 has_re=re.compile(r'Score +E')):
311
312 attempt_read_and_call(uhandle, consumer.no_hits,
313 contains='No hits found')
314 read_and_call_while(uhandle, consumer.noevent, blank=1)
315
316 consumer.end_descriptions()
317
318 return
319
320
321 read_and_call(uhandle, consumer.description_header,
322 start='Sequences producing')
323
324
325 attempt_read_and_call(uhandle, consumer.model_sequences,
326 start='Sequences used in model')
327 read_and_call_while(uhandle, consumer.noevent, blank=1)
328
329
330
331
332 if safe_peekline(uhandle).startswith(" Database:") :
333 consumer.end_descriptions()
334
335 return
336
337
338
339 if not uhandle.peekline().startswith('Sequences not found'):
340 read_and_call_until(uhandle, consumer.description, blank=1)
341 read_and_call_while(uhandle, consumer.noevent, blank=1)
342
343
344
345
346
347 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
348 start='Sequences not found'):
349
350 read_and_call_while(uhandle, consumer.noevent, blank=1)
351 l = safe_peekline(uhandle)
352
353
354
355
356 if not l.startswith('CONVERGED') and l[0] != '>' \
357 and not l.startswith('QUERY'):
358 read_and_call_until(uhandle, consumer.description, blank=1)
359 read_and_call_while(uhandle, consumer.noevent, blank=1)
360
361 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
362 read_and_call_while(uhandle, consumer.noevent, blank=1)
363
364 consumer.end_descriptions()
365
380
387
400
429
435
437
438
439
440
441
442
443 read_and_call(uhandle, consumer.score, start=' Score')
444 read_and_call(uhandle, consumer.identities, start=' Identities')
445
446 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
447
448 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
449 read_and_call(uhandle, consumer.noevent, blank=1)
450
452
453
454
455
456
457
458
459
460
461 while 1:
462
463 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
464 read_and_call(uhandle, consumer.query, start='Query')
465 read_and_call(uhandle, consumer.align, start=' ')
466 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
467 read_and_call_while(uhandle, consumer.noevent, blank=1)
468 line = safe_peekline(uhandle)
469
470 if not (line.startswith('Query') or line.startswith(' ')):
471 break
472
495
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524 consumer.start_database_report()
525
526
527
528
529 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"):
530 read_and_call(uhandle, consumer.noevent, contains="letters")
531 read_and_call(uhandle, consumer.noevent, contains="sequences")
532 read_and_call(uhandle, consumer.noevent, start=" ")
533
534
535
536 while attempt_read_and_call(uhandle, consumer.database,
537 start=' Database'):
538
539
540
541 if not uhandle.peekline().strip() \
542 or uhandle.peekline().startswith("BLAST"):
543 consumer.end_database_report()
544 return
545
546
547 read_and_call_until(uhandle, consumer.database, start=' Posted')
548 read_and_call(uhandle, consumer.posted_date, start=' Posted')
549 read_and_call(uhandle, consumer.num_letters_in_database,
550 start=' Number of letters')
551 read_and_call(uhandle, consumer.num_sequences_in_database,
552 start=' Number of sequences')
553
554 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
555
556 line = safe_readline(uhandle)
557 uhandle.saveline(line)
558 if line.find('Lambda') != -1:
559 break
560
561 read_and_call(uhandle, consumer.noevent, start='Lambda')
562 read_and_call(uhandle, consumer.ka_params)
563
564
565 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
566
567
568 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
569
570 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
571 read_and_call(uhandle, consumer.ka_params_gap)
572
573
574
575
576 try:
577 read_and_call_while(uhandle, consumer.noevent, blank=1)
578 except ValueError, x:
579 if str(x) != "Unexpected end of stream.":
580 raise
581 consumer.end_database_report()
582
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641 if not uhandle.peekline().strip():
642 return
643
644
645
646 consumer.start_parameters()
647
648
649 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
650
651 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
652
653 attempt_read_and_call(uhandle, consumer.num_sequences,
654 start='Number of Sequences')
655 read_and_call(uhandle, consumer.num_hits,
656 start='Number of Hits')
657 attempt_read_and_call(uhandle, consumer.num_sequences,
658 start='Number of Sequences')
659 read_and_call(uhandle, consumer.num_extends,
660 start='Number of extensions')
661 read_and_call(uhandle, consumer.num_good_extends,
662 start='Number of successful')
663
664 read_and_call(uhandle, consumer.num_seqs_better_e,
665 start='Number of sequences')
666
667
668 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
669 start="Number of HSP's better"):
670
671 if attempt_read_and_call(uhandle, consumer.noevent,
672 start="Number of HSP's gapped:"):
673 read_and_call(uhandle, consumer.noevent,
674 start="Number of HSP's successfully")
675
676 attempt_read_and_call(uhandle, consumer.noevent,
677 start="Number of extra gapped extensions")
678 else:
679 read_and_call(uhandle, consumer.hsps_prelim_gapped,
680 start="Number of HSP's successfully")
681 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
682 start="Number of HSP's that")
683 read_and_call(uhandle, consumer.hsps_gapped,
684 start="Number of HSP's gapped")
685
686 elif attempt_read_and_call(uhandle, consumer.noevent,
687 start="Number of HSP's gapped"):
688 read_and_call(uhandle, consumer.noevent,
689 start="Number of HSP's successfully")
690
691
692 attempt_read_and_call(uhandle, consumer.query_length,
693 has_re=re.compile(r"[Ll]ength of query"))
694 read_and_call(uhandle, consumer.database_length,
695 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
696
697
698 attempt_read_and_call(uhandle, consumer.noevent,
699 start="Length adjustment")
700 attempt_read_and_call(uhandle, consumer.effective_hsp_length,
701 start='effective HSP')
702
703 attempt_read_and_call(
704 uhandle, consumer.effective_query_length,
705 has_re=re.compile(r'[Ee]ffective length of query'))
706
707
708 attempt_read_and_call(
709 uhandle, consumer.effective_database_length,
710 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
711
712
713 attempt_read_and_call(
714 uhandle, consumer.effective_search_space,
715 has_re=re.compile(r'[Ee]ffective search space:'))
716
717 attempt_read_and_call(
718 uhandle, consumer.effective_search_space_used,
719 has_re=re.compile(r'[Ee]ffective search space used'))
720
721
722 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
723
724
725 attempt_read_and_call(uhandle, consumer.threshold, start='T')
726
727 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold')
728
729
730 attempt_read_and_call(uhandle, consumer.window_size, start='A')
731
732 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits')
733
734 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
735
736 attempt_read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
737
738
739 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
740 start='X3')
741
742
743 attempt_read_and_call(uhandle, consumer.gap_trigger, start='S1')
744
745
746
747 if not is_blank_line(uhandle.peekline(), allow_spaces=1):
748 read_and_call(uhandle, consumer.blast_cutoff, start='S2')
749
750 consumer.end_parameters()
751
753 """Parses BLAST data into a Record.Blast object.
754
755 """
760
761 - def parse(self, handle):
762 """parse(self, handle)"""
763 self._scanner.feed(handle, self._consumer)
764 return self._consumer.data
765
767 """Parses BLAST data into a Record.PSIBlast object.
768
769 """
774
775 - def parse(self, handle):
776 """parse(self, handle)"""
777 self._scanner.feed(handle, self._consumer)
778 return self._consumer.data
779
783
785 c = line.split()
786 self._header.application = c[0]
787 self._header.version = c[1]
788 self._header.date = c[2][1:-1]
789
795
797 if line.startswith('Query= '):
798 self._header.query = line[7:]
799 elif not line.startswith(' '):
800 self._header.query = "%s%s" % (self._header.query, line)
801 else:
802 letters, = _re_search(
803 r"([0-9,]+) letters", line,
804 "I could not find the number of letters in line\n%s" % line)
805 self._header.query_letters = _safe_int(letters)
806
819
824
827 self._descriptions = []
828 self._model_sequences = []
829 self._nonmodel_sequences = []
830 self._converged = 0
831 self._type = None
832 self._roundnum = None
833
834 self.__has_n = 0
835
837 if line.startswith('Sequences producing'):
838 cols = line.split()
839 if cols[-1] == 'N':
840 self.__has_n = 1
841
843 dh = self._parse(line)
844 if self._type == 'model':
845 self._model_sequences.append(dh)
846 elif self._type == 'nonmodel':
847 self._nonmodel_sequences.append(dh)
848 else:
849 self._descriptions.append(dh)
850
853
855 self._type = 'nonmodel'
856
859
862
864 if not line.startswith('Results from round'):
865 raise ValueError("I didn't understand the round line\n%s" % line)
866 self._roundnum = _safe_int(line[18:].strip())
867
870
871 - def _parse(self, description_line):
872 line = description_line
873 dh = Record.Description()
874
875
876
877
878
879
880
881
882 cols = line.split()
883 if len(cols) < 3:
884 raise ValueError( \
885 "Line does not appear to contain description:\n%s" % line)
886 if self.__has_n:
887 i = line.rfind(cols[-1])
888 i = line.rfind(cols[-2], 0, i)
889 i = line.rfind(cols[-3], 0, i)
890 else:
891 i = line.rfind(cols[-1])
892 i = line.rfind(cols[-2], 0, i)
893 if self.__has_n:
894 dh.title, dh.score, dh.e, dh.num_alignments = \
895 line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
896 else:
897 dh.title, dh.score, dh.e, dh.num_alignments = \
898 line[:i].rstrip(), cols[-2], cols[-1], 1
899 dh.num_alignments = _safe_int(dh.num_alignments)
900 dh.score = _safe_int(dh.score)
901 dh.e = _safe_float(dh.e)
902 return dh
903
905
906
907
908
912
914 if self._alignment.title:
915 self._alignment.title += " "
916 self._alignment.title += line.strip()
917
919
920 parts = line.replace(" ","").split("=")
921 assert len(parts)==2, "Unrecognised format length line"
922 self._alignment.length = parts[1]
923 self._alignment.length = _safe_int(self._alignment.length)
924
926
927 if line.startswith('QUERY') or line.startswith('blast_tmp'):
928
929
930
931
932
933 try:
934 name, start, seq, end = line.split()
935 except ValueError:
936 raise ValueError("I do not understand the line\n%s" % line)
937 self._start_index = line.index(start, len(name))
938 self._seq_index = line.index(seq,
939 self._start_index+len(start))
940
941 self._name_length = self._start_index - 1
942 self._start_length = self._seq_index - self._start_index - 1
943 self._seq_length = line.rfind(end) - self._seq_index - 1
944
945
946
947
948
949
950
951
952
953 name = line[:self._name_length]
954 name = name.rstrip()
955 start = line[self._start_index:self._start_index+self._start_length]
956 start = start.rstrip()
957 if start:
958 start = _safe_int(start)
959 end = line[self._seq_index+self._seq_length:].rstrip()
960 if end:
961 end = _safe_int(end)
962 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
963
964 if len(seq) < self._seq_length:
965 seq = seq + ' '*(self._seq_length-len(seq))
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984 align = self._multiple_alignment.alignment
985 align.append((name, start, seq, end))
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1035
1036 if self._alignment:
1037 self._alignment.title = self._alignment.title.rstrip()
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059 try:
1060 del self._seq_index
1061 del self._seq_length
1062 del self._start_index
1063 del self._start_length
1064 del self._name_length
1065 except AttributeError:
1066 pass
1067
1071
1073 self._hsp.bits, self._hsp.score = _re_search(
1074 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
1075 "I could not find the score in line\n%s" % line)
1076 self._hsp.score = _safe_float(self._hsp.score)
1077 self._hsp.bits = _safe_float(self._hsp.bits)
1078
1079 x, y = _re_search(
1080 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
1081 "I could not find the expect in line\n%s" % line)
1082 if x:
1083 self._hsp.num_alignments = _safe_int(x)
1084 else:
1085 self._hsp.num_alignments = 1
1086 self._hsp.expect = _safe_float(y)
1087
1089 x, y = _re_search(
1090 r"Identities = (\d+)\/(\d+)", line,
1091 "I could not find the identities in line\n%s" % line)
1092 self._hsp.identities = _safe_int(x), _safe_int(y)
1093 self._hsp.align_length = _safe_int(y)
1094
1095 if line.find('Positives') != -1:
1096 x, y = _re_search(
1097 r"Positives = (\d+)\/(\d+)", line,
1098 "I could not find the positives in line\n%s" % line)
1099 self._hsp.positives = _safe_int(x), _safe_int(y)
1100 assert self._hsp.align_length == _safe_int(y)
1101
1102 if line.find('Gaps') != -1:
1103 x, y = _re_search(
1104 r"Gaps = (\d+)\/(\d+)", line,
1105 "I could not find the gaps in line\n%s" % line)
1106 self._hsp.gaps = _safe_int(x), _safe_int(y)
1107 assert self._hsp.align_length == _safe_int(y)
1108
1109
1111 self._hsp.strand = _re_search(
1112 r"Strand = (\w+) / (\w+)", line,
1113 "I could not find the strand in line\n%s" % line)
1114
1116
1117
1118
1119 if line.find('/') != -1:
1120 self._hsp.frame = _re_search(
1121 r"Frame = ([-+][123]) / ([-+][123])", line,
1122 "I could not find the frame in line\n%s" % line)
1123 else:
1124 self._hsp.frame = _re_search(
1125 r"Frame = ([-+][123])", line,
1126 "I could not find the frame in line\n%s" % line)
1127
1128
1129
1130
1131
1132
1133 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1135 m = self._query_re.search(line)
1136 if m is None:
1137 raise ValueError("I could not find the query in line\n%s" % line)
1138
1139
1140
1141 colon, start, seq, end = m.groups()
1142 self._hsp.query = self._hsp.query + seq
1143 if self._hsp.query_start is None:
1144 self._hsp.query_start = _safe_int(start)
1145
1146
1147
1148 self._hsp.query_end = _safe_int(end)
1149
1150
1151 self._query_start_index = m.start(3)
1152 self._query_len = len(seq)
1153
1155 seq = line[self._query_start_index:].rstrip()
1156 if len(seq) < self._query_len:
1157
1158 seq = seq + ' ' * (self._query_len-len(seq))
1159 elif len(seq) < self._query_len:
1160 raise ValueError("Match is longer than the query in line\n%s" \
1161 % line)
1162 self._hsp.match = self._hsp.match + seq
1163
1164
1165
1166 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1168 m = self._sbjct_re.search(line)
1169 if m is None:
1170 raise ValueError("I could not find the sbjct in line\n%s" % line)
1171 colon, start, seq, end = m.groups()
1172
1173
1174
1175
1176 if not seq.strip():
1177 seq = ' ' * self._query_len
1178 self._hsp.sbjct = self._hsp.sbjct + seq
1179 if self._hsp.sbjct_start is None:
1180 self._hsp.sbjct_start = _safe_int(start)
1181
1182 self._hsp.sbjct_end = _safe_int(end)
1183 if len(seq) != self._query_len:
1184 raise ValueError( \
1185 "QUERY and SBJCT sequence lengths don't match in line\n%s" \
1186 % line)
1187
1188 del self._query_start_index
1189 del self._query_len
1190
1193
1195
1198
1200 m = re.search(r"Database: (.+)$", line)
1201 if m:
1202 self._dr.database_name.append(m.group(1))
1203 elif self._dr.database_name:
1204
1205 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1],
1206 line.strip())
1207
1208 - def posted_date(self, line):
1209 self._dr.posted_date.append(_re_search(
1210 r"Posted date:\s*(.+)$", line,
1211 "I could not find the posted date in line\n%s" % line))
1212
1217
1222
1226
1229
1233
1236
1240
1243
1248
1250 if line.find('1st pass') != -1:
1251 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
1252 self._params.num_hits = _safe_int(x)
1253 else:
1254 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
1255 self._params.num_hits = _safe_int(x)
1256
1258 if line.find('1st pass') != -1:
1259 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
1260 self._params.num_sequences = _safe_int(x)
1261 else:
1262 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
1263 self._params.num_sequences = _safe_int(x)
1264
1266 if line.find('1st pass') != -1:
1267 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
1268 self._params.num_extends = _safe_int(x)
1269 else:
1270 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"})
1271 self._params.num_extends = _safe_int(x)
1272
1274 if line.find('1st pass') != -1:
1275 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
1276 self._params.num_good_extends = _safe_int(x)
1277 else:
1278 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"})
1279 self._params.num_good_extends = _safe_int(x)
1280
1286
1291
1297
1303
1308
1313
1318
1324
1330
1336
1342
1348
1352
1354 if line[:2] == "T:" :
1355
1356 self._params.threshold, = _get_cols(
1357 line, (1,), ncols=2, expected={0:"T:"})
1358 elif line[:28] == "Neighboring words threshold:" :
1359 self._params.threshold, = _get_cols(
1360 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"})
1361 else :
1362 raise ValueError("Unrecognised threshold line:\n%s" % line)
1363 self._params.threshold = _safe_int(self._params.threshold)
1364
1366 if line[:2] == "A:" :
1367 self._params.window_size, = _get_cols(
1368 line, (1,), ncols=2, expected={0:"A:"})
1369 elif line[:25] == "Window for multiple hits:" :
1370 self._params.window_size, = _get_cols(
1371 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"})
1372 else :
1373 raise ValueError("Unrecognised window size line:\n%s" % line)
1374 self._params.window_size = _safe_int(self._params.window_size)
1375
1381
1387
1393
1399
1405
1408
1409
1410 -class _BlastConsumer(AbstractConsumer,
1411 _HeaderConsumer,
1412 _DescriptionConsumer,
1413 _AlignmentConsumer,
1414 _HSPConsumer,
1415 _DatabaseReportConsumer,
1416 _ParametersConsumer
1417 ):
1418
1419
1420
1421
1422
1423
1424
1425
1426
1429
1431
1432 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1433
1437
1441
1443 self.data.descriptions = self._descriptions
1444
1446 _AlignmentConsumer.end_alignment(self)
1447 if self._alignment.hsps:
1448 self.data.alignments.append(self._alignment)
1449 if self._multiple_alignment.alignment:
1450 self.data.multiple_alignment = self._multiple_alignment
1451
1453 _HSPConsumer.end_hsp(self)
1454 try:
1455 self._alignment.hsps.append(self._hsp)
1456 except AttributeError:
1457 raise ValueError("Found an HSP before an alignment")
1458
1462
1466
1467 -class _PSIBlastConsumer(AbstractConsumer,
1468 _HeaderConsumer,
1469 _DescriptionConsumer,
1470 _AlignmentConsumer,
1471 _HSPConsumer,
1472 _DatabaseReportConsumer,
1473 _ParametersConsumer
1474 ):
1477
1481
1485
1490
1492 _DescriptionConsumer.end_descriptions(self)
1493 self._round.number = self._roundnum
1494 if self._descriptions:
1495 self._round.new_seqs.extend(self._descriptions)
1496 self._round.reused_seqs.extend(self._model_sequences)
1497 self._round.new_seqs.extend(self._nonmodel_sequences)
1498 if self._converged:
1499 self.data.converged = 1
1500
1502 _AlignmentConsumer.end_alignment(self)
1503 if self._alignment.hsps:
1504 self._round.alignments.append(self._alignment)
1505 if self._multiple_alignment:
1506 self._round.multiple_alignment = self._multiple_alignment
1507
1509 _HSPConsumer.end_hsp(self)
1510 try:
1511 self._alignment.hsps.append(self._hsp)
1512 except AttributeError:
1513 raise ValueError("Found an HSP before an alignment")
1514
1518
1522
1524 """Iterates over a file of multiple BLAST results.
1525
1526 Methods:
1527 next Return the next record from the stream, or None.
1528
1529 """
1530 - def __init__(self, handle, parser=None):
1531 """__init__(self, handle, parser=None)
1532
1533 Create a new iterator. handle is a file-like object. parser
1534 is an optional Parser object to change the results into another form.
1535 If set to None, then the raw contents of the file will be returned.
1536
1537 """
1538 try:
1539 handle.readline
1540 except AttributeError:
1541 raise ValueError(
1542 "I expected a file handle or file-like object, got %s"
1543 % type(handle))
1544 self._uhandle = File.UndoHandle(handle)
1545 self._parser = parser
1546
1548 """next(self) -> object
1549
1550 Return the next Blast record from the file. If no more records,
1551 return None.
1552
1553 """
1554 lines = []
1555 while 1:
1556 line = self._uhandle.readline()
1557 if not line:
1558 break
1559
1560 if lines and (line.startswith('BLAST')
1561 or line.startswith('BLAST', 1)
1562 or line.startswith('<?xml ')):
1563 self._uhandle.saveline(line)
1564 break
1565 lines.append(line)
1566
1567 if not lines:
1568 return None
1569
1570 data = ''.join(lines)
1571 if self._parser is not None:
1572 return self._parser.parse(File.StringHandle(data))
1573 return data
1574
1576 return iter(self.next, None)
1577
1578 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1579 """Execute and retrieve data from standalone BLASTPALL as handles.
1580
1581 Execute and retrieve data from blastall. blastcmd is the command
1582 used to launch the 'blastall' executable. program is the blast program
1583 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database
1584 to search against. infile is the path to the file containing
1585 the sequence to search with.
1586
1587 The return values are two handles, for standard output and standard error.
1588
1589 You may pass more parameters to **keywds to change the behavior of
1590 the search. Otherwise, optional values will be chosen by blastall.
1591 The Blast output is by default in XML format. Use the align_view keyword
1592 for output in a different format.
1593
1594 Scoring
1595 matrix Matrix to use.
1596 gap_open Gap open penalty.
1597 gap_extend Gap extension penalty.
1598 nuc_match Nucleotide match reward. (BLASTN)
1599 nuc_mismatch Nucleotide mismatch penalty. (BLASTN)
1600 query_genetic_code Genetic code for Query.
1601 db_genetic_code Genetic code for database. (TBLAST[NX])
1602
1603 Algorithm
1604 gapped Whether to do a gapped alignment. T/F (not for TBLASTX)
1605 expectation Expectation value cutoff.
1606 wordsize Word size.
1607 strands Query strands to search against database.([T]BLAST[NX])
1608 keep_hits Number of best hits from a region to keep.
1609 xdrop Dropoff value (bits) for gapped alignments.
1610 hit_extend Threshold for extending hits.
1611 region_length Length of region used to judge hits.
1612 db_length Effective database length.
1613 search_length Effective length of search space.
1614
1615 Processing
1616 filter Filter query sequence for low complexity (with SEG)? T/F
1617 believe_query Believe the query defline. T/F
1618 restrict_gi Restrict search to these GI's.
1619 nprocessors Number of processors to use.
1620 oldengine Force use of old engine T/F
1621
1622 Formatting
1623 html Produce HTML output? T/F
1624 descriptions Number of one-line descriptions.
1625 alignments Number of alignments.
1626 align_view Alignment view. Integer 0-11,
1627 passed as a string or integer.
1628 show_gi Show GI's in deflines? T/F
1629 seqalign_file seqalign file to output.
1630 outfile Output file for report. Filename to write to, if
1631 ommitted standard output is used (which you can access
1632 from the returned handles).
1633 """
1634
1635 _security_check_parameters(keywds)
1636
1637 att2param = {
1638 'matrix' : '-M',
1639 'gap_open' : '-G',
1640 'gap_extend' : '-E',
1641 'nuc_match' : '-r',
1642 'nuc_mismatch' : '-q',
1643 'query_genetic_code' : '-Q',
1644 'db_genetic_code' : '-D',
1645
1646 'gapped' : '-g',
1647 'expectation' : '-e',
1648 'wordsize' : '-W',
1649 'strands' : '-S',
1650 'keep_hits' : '-K',
1651 'xdrop' : '-X',
1652 'hit_extend' : '-f',
1653 'region_length' : '-L',
1654 'db_length' : '-z',
1655 'search_length' : '-Y',
1656
1657 'program' : '-p',
1658 'database' : '-d',
1659 'infile' : '-i',
1660 'filter' : '-F',
1661 'believe_query' : '-J',
1662 'restrict_gi' : '-l',
1663 'nprocessors' : '-a',
1664 'oldengine' : '-V',
1665
1666 'html' : '-T',
1667 'descriptions' : '-v',
1668 'alignments' : '-b',
1669 'align_view' : '-m',
1670 'show_gi' : '-I',
1671 'seqalign_file' : '-O',
1672 'outfile' : '-o',
1673 }
1674 from Applications import BlastallCommandline
1675 cline = BlastallCommandline(blastcmd)
1676 cline.set_parameter(att2param['program'], program)
1677 cline.set_parameter(att2param['database'], database)
1678 cline.set_parameter(att2param['infile'], infile)
1679 cline.set_parameter(att2param['align_view'], str(align_view))
1680 for key, value in keywds.iteritems() :
1681 cline.set_parameter(att2param[key], str(value))
1682 return _invoke_blast(cline)
1683
1684
1685 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1686 """Execute and retrieve data from standalone BLASTPGP as handles.
1687
1688 Execute and retrieve data from blastpgp. blastcmd is the command
1689 used to launch the 'blastpgp' executable. database is the path to the
1690 database to search against. infile is the path to the file containing
1691 the sequence to search with.
1692
1693 The return values are two handles, for standard output and standard error.
1694
1695 You may pass more parameters to **keywds to change the behavior of
1696 the search. Otherwise, optional values will be chosen by blastpgp.
1697 The Blast output is by default in XML format. Use the align_view keyword
1698 for output in a different format.
1699
1700 Scoring
1701 matrix Matrix to use.
1702 gap_open Gap open penalty.
1703 gap_extend Gap extension penalty.
1704 window_size Multiple hits window size.
1705 npasses Number of passes.
1706 passes Hits/passes. Integer 0-2.
1707
1708 Algorithm
1709 gapped Whether to do a gapped alignment. T/F
1710 expectation Expectation value cutoff.
1711 wordsize Word size.
1712 keep_hits Number of beset hits from a region to keep.
1713 xdrop Dropoff value (bits) for gapped alignments.
1714 hit_extend Threshold for extending hits.
1715 region_length Length of region used to judge hits.
1716 db_length Effective database length.
1717 search_length Effective length of search space.
1718 nbits_gapping Number of bits to trigger gapping.
1719 pseudocounts Pseudocounts constants for multiple passes.
1720 xdrop_final X dropoff for final gapped alignment.
1721 xdrop_extension Dropoff for blast extensions.
1722 model_threshold E-value threshold to include in multipass model.
1723 required_start Start of required region in query.
1724 required_end End of required region in query.
1725
1726 Processing
1727 XXX should document default values
1728 program The blast program to use. (PHI-BLAST)
1729 filter Filter query sequence for low complexity (with SEG)? T/F
1730 believe_query Believe the query defline? T/F
1731 nprocessors Number of processors to use.
1732
1733 Formatting
1734 html Produce HTML output? T/F
1735 descriptions Number of one-line descriptions.
1736 alignments Number of alignments.
1737 align_view Alignment view. Integer 0-11,
1738 passed as a string or integer.
1739 show_gi Show GI's in deflines? T/F
1740 seqalign_file seqalign file to output.
1741 align_outfile Output file for alignment.
1742 checkpoint_outfile Output file for PSI-BLAST checkpointing.
1743 restart_infile Input file for PSI-BLAST restart.
1744 hit_infile Hit file for PHI-BLAST.
1745 matrix_outfile Output file for PSI-BLAST matrix in ASCII.
1746 align_outfile Output file for alignment. Filename to write to, if
1747 ommitted standard output is used (which you can access
1748 from the returned handles).
1749
1750 align_infile Input alignment file for PSI-BLAST restart.
1751
1752 """
1753
1754 _security_check_parameters(keywds)
1755
1756 att2param = {
1757 'matrix' : '-M',
1758 'gap_open' : '-G',
1759 'gap_extend' : '-E',
1760 'window_size' : '-A',
1761 'npasses' : '-j',
1762 'passes' : '-P',
1763
1764 'gapped' : '-g',
1765 'expectation' : '-e',
1766 'wordsize' : '-W',
1767 'keep_hits' : '-K',
1768 'xdrop' : '-X',
1769 'hit_extend' : '-f',
1770 'region_length' : '-L',
1771 'db_length' : '-Z',
1772 'search_length' : '-Y',
1773 'nbits_gapping' : '-N',
1774 'pseudocounts' : '-c',
1775 'xdrop_final' : '-Z',
1776 'xdrop_extension' : '-y',
1777 'model_threshold' : '-h',
1778 'required_start' : '-S',
1779 'required_end' : '-H',
1780
1781 'program' : '-p',
1782 'database' : '-d',
1783 'infile' : '-i',
1784 'filter' : '-F',
1785 'believe_query' : '-J',
1786 'nprocessors' : '-a',
1787
1788 'html' : '-T',
1789 'descriptions' : '-v',
1790 'alignments' : '-b',
1791 'align_view' : '-m',
1792 'show_gi' : '-I',
1793 'seqalign_file' : '-O',
1794 'align_outfile' : '-o',
1795 'checkpoint_outfile' : '-C',
1796 'restart_infile' : '-R',
1797 'hit_infile' : '-k',
1798 'matrix_outfile' : '-Q',
1799 'align_infile' : '-B',
1800 }
1801 from Applications import BlastpgpCommandline
1802 cline = BlastpgpCommandline(blastcmd)
1803 cline.set_parameter(att2param['database'], database)
1804 cline.set_parameter(att2param['infile'], infile)
1805 cline.set_parameter(att2param['align_view'], str(align_view))
1806 for key, value in keywds.iteritems() :
1807 cline.set_parameter(att2param[key], str(value))
1808 return _invoke_blast(cline)
1809
1810
1811 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1812 """Execute and retrieve data from standalone RPS-BLAST as handles.
1813
1814 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the
1815 command used to launch the 'rpsblast' executable. database is the path
1816 to the database to search against. infile is the path to the file
1817 containing the sequence to search with.
1818
1819 The return values are two handles, for standard output and standard error.
1820
1821 You may pass more parameters to **keywds to change the behavior of
1822 the search. Otherwise, optional values will be chosen by rpsblast.
1823
1824 Please note that this function will give XML output by default, by
1825 setting align_view to seven (i.e. command line option -m 7).
1826 You should use the NCBIXML.parse() function to read the resulting output.
1827 This is because NCBIStandalone.BlastParser() does not understand the
1828 plain text output format from rpsblast.
1829
1830 WARNING - The following text and associated parameter handling has not
1831 received extensive testing. Please report any errors we might have made...
1832
1833 Algorithm/Scoring
1834 gapped Whether to do a gapped alignment. T/F
1835 multihit 0 for multiple hit (default), 1 for single hit
1836 expectation Expectation value cutoff.
1837 range_restriction Range restriction on query sequence (Format: start,stop) blastp only
1838 0 in 'start' refers to the beginning of the sequence
1839 0 in 'stop' refers to the end of the sequence
1840 Default = 0,0
1841 xdrop Dropoff value (bits) for gapped alignments.
1842 xdrop_final X dropoff for final gapped alignment (in bits).
1843 xdrop_extension Dropoff for blast extensions (in bits).
1844 search_length Effective length of search space.
1845 nbits_gapping Number of bits to trigger gapping.
1846 protein Query sequence is protein. T/F
1847 db_length Effective database length.
1848
1849 Processing
1850 filter Filter query sequence for low complexity? T/F
1851 case_filter Use lower case filtering of FASTA sequence T/F, default F
1852 believe_query Believe the query defline. T/F
1853 nprocessors Number of processors to use.
1854 logfile Name of log file to use, default rpsblast.log
1855
1856 Formatting
1857 html Produce HTML output? T/F
1858 descriptions Number of one-line descriptions.
1859 alignments Number of alignments.
1860 align_view Alignment view. Integer 0-11,
1861 passed as a string or integer.
1862 show_gi Show GI's in deflines? T/F
1863 seqalign_file seqalign file to output.
1864 align_outfile Output file for alignment. Filename to write to, if
1865 ommitted standard output is used (which you can access
1866 from the returned handles).
1867 """
1868
1869 _security_check_parameters(keywds)
1870
1871 att2param = {
1872 'multihit' : '-P',
1873 'gapped' : '-g',
1874 'expectation' : '-e',
1875 'range_restriction' : '-L',
1876 'xdrop' : '-X',
1877 'xdrop_final' : '-Z',
1878 'xdrop_extension' : '-y',
1879 'search_length' : '-Y',
1880 'nbits_gapping' : '-N',
1881 'protein' : '-p',
1882 'db_length' : '-z',
1883
1884 'database' : '-d',
1885 'infile' : '-i',
1886 'filter' : '-F',
1887 'case_filter' : '-U',
1888 'believe_query' : '-J',
1889 'nprocessors' : '-a',
1890 'logfile' : '-l',
1891
1892 'html' : '-T',
1893 'descriptions' : '-v',
1894 'alignments' : '-b',
1895 'align_view' : '-m',
1896 'show_gi' : '-I',
1897 'seqalign_file' : '-O',
1898 'align_outfile' : '-o',
1899 }
1900
1901 from Applications import RpsBlastCommandline
1902 cline = RpsBlastCommandline(blastcmd)
1903 cline.set_parameter(att2param['database'], database)
1904 cline.set_parameter(att2param['infile'], infile)
1905 cline.set_parameter(att2param['align_view'], str(align_view))
1906 for key, value in keywds.iteritems() :
1907 cline.set_parameter(att2param[key], str(value))
1908 return _invoke_blast(cline)
1909
1910
1912 m = re.search(regex, line)
1913 if not m:
1914 raise ValueError(error_msg)
1915 return m.groups()
1916
1917 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1918 cols = line.split()
1919
1920
1921 if ncols is not None and len(cols) != ncols:
1922 raise ValueError("I expected %d columns (got %d) in line\n%s" \
1923 % (ncols, len(cols), line))
1924
1925
1926 for k in expected.keys():
1927 if cols[k] != expected[k]:
1928 raise ValueError("I expected '%s' in column %d in line\n%s" \
1929 % (expected[k], k, line))
1930
1931
1932 results = []
1933 for c in cols_to_get:
1934 results.append(cols[c])
1935 return tuple(results)
1936
1938 try:
1939 return int(str)
1940 except ValueError:
1941
1942
1943 str = str.replace(',', '')
1944 try:
1945
1946 return int(str)
1947 except ValueError:
1948 pass
1949
1950
1951 return long(float(str))
1952
1954
1955
1956
1957
1958
1959 if str and str[0] in ['E', 'e']:
1960 str = '1' + str
1961 try:
1962 return float(str)
1963 except ValueError:
1964
1965 str = str.replace(',', '')
1966
1967 return float(str)
1968
1969
1971 """Start BLAST and returns handles for stdout and stderr (PRIVATE).
1972
1973 Expects a command line wrapper object from Bio.Blast.Applications
1974 """
1975 import subprocess, sys
1976 blast_cmd = cline.program_name
1977 if not os.path.exists(blast_cmd):
1978 raise ValueError("BLAST executable does not exist at %s" % blast_cmd)
1979
1980
1981
1982
1983 blast_process = subprocess.Popen(str(cline),
1984 stdin=subprocess.PIPE,
1985 stdout=subprocess.PIPE,
1986 stderr=subprocess.PIPE,
1987 shell=(sys.platform!="win32"))
1988 blast_process.stdin.close()
1989 return blast_process.stdout, blast_process.stderr
1990
1991
1993 """Look for any attempt to insert a command into a parameter.
1994
1995 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd')
1996
1997 Looks for ";" or "&&" in the strings (Unix and Windows syntax
1998 for appending a command line), or ">", "<" or "|" (redirection)
1999 and if any are found raises an exception.
2000 """
2001 for key, value in param_dict.iteritems() :
2002 str_value = str(value)
2003 for bad_str in [";", "&&", ">", "<", "|"] :
2004 if bad_str in str_value :
2005 raise ValueError("Rejecting suspicious argument for %s" % key)
2006
2017
2019 """Attempt to catch and diagnose BLAST errors while parsing.
2020
2021 This utilizes the BlastParser module but adds an additional layer
2022 of complexity on top of it by attempting to diagnose ValueErrors
2023 that may actually indicate problems during BLAST parsing.
2024
2025 Current BLAST problems this detects are:
2026 o LowQualityBlastError - When BLASTing really low quality sequences
2027 (ie. some GenBank entries which are just short streches of a single
2028 nucleotide), BLAST will report an error with the sequence and be
2029 unable to search with this. This will lead to a badly formatted
2030 BLAST report that the parsers choke on. The parser will convert the
2031 ValueError to a LowQualityBlastError and attempt to provide useful
2032 information.
2033
2034 """
2035 - def __init__(self, bad_report_handle = None):
2036 """Initialize a parser that tries to catch BlastErrors.
2037
2038 Arguments:
2039 o bad_report_handle - An optional argument specifying a handle
2040 where bad reports should be sent. This would allow you to save
2041 all of the bad reports to a file, for instance. If no handle
2042 is specified, the bad reports will not be saved.
2043 """
2044 self._bad_report_handle = bad_report_handle
2045
2046
2047 self._scanner = _Scanner()
2048 self._consumer = _BlastErrorConsumer()
2049
2050 - def parse(self, handle):
2051 """Parse a handle, attempting to diagnose errors.
2052 """
2053 results = handle.read()
2054
2055 try:
2056 self._scanner.feed(File.StringHandle(results), self._consumer)
2057 except ValueError, msg:
2058
2059 if self._bad_report_handle:
2060
2061 self._bad_report_handle.write(results)
2062
2063
2064 self._diagnose_error(
2065 File.StringHandle(results), self._consumer.data)
2066
2067
2068
2069 raise
2070 return self._consumer.data
2071
2073 """Attempt to diagnose an error in the passed handle.
2074
2075 Arguments:
2076 o handle - The handle potentially containing the error
2077 o data_record - The data record partially created by the consumer.
2078 """
2079 line = handle.readline()
2080
2081 while line:
2082
2083
2084
2085 if line.startswith('Searchingdone'):
2086 raise LowQualityBlastError("Blast failure occured on query: ",
2087 data_record.query)
2088 line = handle.readline()
2089