1 """Martel based parser to read GenBank formatted files.
2
3 This is a huge regular expression for GenBank, built using
4 the 'regular expressions on steroids' capabilities of Martel.
5
6 Documentation for GenBank format that I found:
7
8 o GenBank/EMBL feature tables are described at:
9 http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
10
11 o There are also descriptions of different GenBank lines at:
12 http://www.ibc.wustl.edu/standards/gbrel.txt
13 """
14
15 import warnings
# Warn at import time that this module is deprecated.
warnings.warn(
    "Bio.expressions was deprecated, as it does not work with recent "
    "versions of mxTextTools. If you want to continue to use this module, "
    "please get in contact with the Biopython developers at "
    "biopython-dev@biopython.org to avoid permanent removal of this module "
    "from Biopython",
    DeprecationWarning,
)
17
18
19
20 import Martel
21 from Martel import RecordReader
22
23
24 from Bio import Std
25
26
27
28
29
# Column widths used throughout the grammar.
# INDENT is the width of the keyword column; define_block() uses it both to
# validate identifiers and to match continuation lines.
INDENT = 12
# Number of spaces in front of a feature key.
FEATURE_KEY_INDENT = 5
# Number of columns in front of a feature qualifier / location continuation.
FEATURE_QUALIFIER_INDENT = 21

# Reusable whitespace expressions.
blank_space = Martel.Spaces()
small_indent_space = Martel.Str(2 * " ")
big_indent_space = Martel.Str(FEATURE_KEY_INDENT * " ")
# Qualifier indentation is either all spaces, or a single tab followed by
# the remaining (FEATURE_QUALIFIER_INDENT - 8) spaces.
qualifier_space = (
    Martel.Str(" " * FEATURE_QUALIFIER_INDENT)
    | Martel.Str("\t" + " " * (FEATURE_QUALIFIER_INDENT - 8))
)
39
40
def define_block(identifier, block_tag, block_data, std_block_tag = None,
                 std_tag = None):
    """Build a Martel expression for a keyword-introduced block of text.

    Many GenBank entries look like:

    IDENTIFIER  Blah blah blah

    where the text may continue on following lines that start with INDENT
    spaces. This helper builds the expression for such a block so that all
    of them are defined the same way.

    Arguments:
    o identifier - The text that starts the block (like DEFINITION). It
    must be shorter than INDENT characters.
    o block_tag - Name of the Martel group wrapping the whole block.
    o block_data - Tag name given to each line of text inside the block
    (ie. the part you are interested in).
    o std_block_tag - Optional Bio.Std tag callable applied to the whole
    block group.
    o std_tag - Optional Bio.Std tag callable applied to each line of
    block text.

    Returns the Martel expression for the block.
    """
    room_for_padding = INDENT - len(identifier)
    assert room_for_padding > 0, room_for_padding

    # Without a std_tag the text expression is used unwrapped.
    if std_tag is None:
        def _passthrough(expression):
            return expression
        std_tag = _passthrough

    # First line: identifier, any number of spaces, then text to end of line.
    first_line = (Martel.Str(identifier) +
                  Martel.Rep(Martel.Str(" ")) +
                  std_tag(Martel.UntilEol(block_data)) +
                  Martel.AnyEol())
    # Continuation line: INDENT spaces, then text to end of line.
    continuation_line = (Martel.Str(" " * INDENT) +
                         std_tag(Martel.UntilEol(block_data)) +
                         Martel.AnyEol())
    # After the first line, accept any mix of bare end-of-lines and
    # continuation lines.
    following_lines = Martel.Rep(Martel.Alt(Martel.AnyEol(),
                                            continuation_line))
    block = Martel.Group(block_tag, first_line + following_lines)

    if std_block_tag is None:
        return block
    return std_block_tag(block)
89
90
91
92
# -- LOCUS line
# Locus name: word characters and hyphens.
locus = Martel.Group("locus",
                     Martel.Re(r"[\w\-]+"))
# Sequence length.
size = Martel.Group("size",
                    Martel.Rep1(Martel.Integer()))

# Strandedness prefixes and molecule types accepted in the residue field.
valid_residue_prefixes = ["ss-", "ds-", "ms-"]
valid_residue_types = ["DNA", "RNA", "mRNA", "tRNA", "rRNA", "uRNA",
                       "scRNA", "snRNA", "snoRNA", "PROTEIN"]

# list() keeps these as real lists even where map() returns an iterator.
residue_prefixes = list(map(Martel.Str, valid_residue_prefixes))
residue_types = list(map(Martel.Str, valid_residue_types))

# Optional prefix, optional molecule type, optional circular/linear marker.
residue_type = Martel.Group("residue_type",
                            Martel.Opt(Martel.Alt(*residue_prefixes)) +
                            Martel.Opt(Martel.Alt(*residue_types)) +
                            Martel.Opt(Martel.Opt(blank_space) +
                                       Martel.Alt(Martel.Str("circular"),
                                                  Martel.Str("linear"))))

# Date: word characters and hyphens. Raw string, so "\w" reaches the regex
# parser unchanged and is not an invalid Python escape.
date = Martel.Group("date",
                    Martel.Re(r"[-\w]+"))

# Three-letter division codes accepted on the LOCUS line.
valid_divisions = ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", "RNA",
                   "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", "GSS",
                   "HTG", "HTC", "CON", "ENV"]
divisions = list(map(Martel.Str, valid_divisions))
data_file_division = Martel.Group("data_file_division",
                                  Martel.Alt(*divisions))

# LOCUS <name> <size> bp|aa [residue type] <division> <date>
locus_line = Martel.Group("locus_line",
                          Martel.Str("LOCUS") +
                          blank_space +
                          locus +
                          blank_space +
                          size +
                          blank_space +
                          Martel.Re("bp|aa") +
                          blank_space +
                          Martel.Opt(residue_type +
                                     blank_space) +
                          data_file_division +
                          blank_space +
                          date +
                          Martel.AnyEol())
139
140
141
142
# -- DEFINITION block, registered with the Bio.Std description tags.
definition_block = define_block(
    "DEFINITION",
    "definition_block",
    "definition",
    std_block_tag=Std.description_block,
    std_tag=Std.description,
)
146
147
148
149
150
# -- ACCESSION block
# A single accession identifier.
accession = Martel.Group("accession",
                         Martel.Re(r"[\w]+"))

# A "start..end" span following REGION:. The dots are escaped so that only a
# literal ".." separates the two numbers (an unescaped "." matches any
# character).
region = Martel.Group("region",
                      Martel.Re(r"[\d]+\.\.[\d]+"))

# ACCESSION followed by one or more lines, each holding one or more
# accessions. Every accession may be followed by an optional
# "REGION: start..end" and an optional single trailing space.
accession_block = Martel.Group("accession_block",
                               Martel.Str("ACCESSION") +
                               Martel.Rep1(blank_space +
                                           Martel.Rep1(accession +
                                                       Martel.Opt(
                                                           Martel.Opt(Martel.Str(" ")) +
                                                           Martel.Str("REGION:") +
                                                           Martel.Opt(Martel.Str(" ")) +
                                                           region) +
                                                       Martel.Opt(Martel.Str(" "))) +
                                           Martel.AnyEol()))
168
169
170
171
172
# -- NID line: "NID" followed by an identifier.
# Raw strings keep "\w" / "\d" as regex escapes, not invalid Python escapes.
nid = Martel.Group("nid",
                   Martel.Re(r"[\w\d]+"))
nid_line = Martel.Group("nid_line",
                        Martel.Str("NID") +
                        blank_space +
                        nid +
                        Martel.AnyEol())

# -- PID line: "PID" followed by an identifier.
pid = Martel.Group("pid",
                   Martel.Re(r"[\w\d]+"))
pid_line = Martel.Group("pid_line",
                        Martel.Str("PID") +
                        blank_space +
                        pid +
                        Martel.AnyEol())
189
190
191
# -- VERSION line
# Versioned accession (word characters, digits and dots), tagged as the
# primary genbank database id.
version = Martel.Group("version",
                       Std.dbid(Martel.Re(r"[\w\d\.]+"),
                                {"type" : "primary", "dbname" : "genbank"}))

# Numeric GI identifier, tagged as a secondary genbank database id.
gi = Martel.Group("gi",
                  Std.dbid(Martel.Re(r"[\d]+"),
                           {"type" : "secondary", "dbname" : "genbank"}))

# VERSION <version> [GI:<gi>]
version_line = Martel.Group("version_line",
                            Martel.Str("VERSION") +
                            blank_space +
                            version +
                            Martel.Opt(blank_space +
                                       Martel.Str("GI:") +
                                       gi) +
                            Martel.AnyEol())
208
209
# -- DBSOURCE block
db_source_block = define_block(
    "DBSOURCE", "db_source_block", "db_source")

# -- KEYWORDS block
keywords_block = define_block(
    "KEYWORDS", "keywords_block", "keywords")

# -- SEGMENT line: "<segment_num> of <segment_total>"
segment = Martel.Group(
    "segment",
    Martel.Integer("segment_num")
    + Martel.Str(" of ")
    + Martel.Integer("segment_total"))
segment_line = Martel.Group(
    "segment_line",
    Martel.Str("SEGMENT ") + segment + Martel.AnyEol())

# -- SOURCE block
source_block = define_block(
    "SOURCE", "source_block", "source")
228
229
230
231
232
# -- ORGANISM block
# The organism name is the remainder of the ORGANISM line.
organism = Martel.Group(
    "organism",
    Martel.ToEol())

# One or more whitespace-led lines following the organism name.
taxonomy = Martel.Group(
    "taxonomy",
    Martel.Rep1(blank_space + Martel.ToEol()))

organism_block = Martel.Group(
    "organism_block",
    Martel.Str(" ORGANISM") + blank_space + organism + taxonomy)
245
246
247
248
249
250
251
# -- REFERENCE section
# Reference number. Raw strings keep the regex escapes from being read as
# (invalid) Python string escapes.
reference_num = Martel.Group("reference_num",
                             Martel.Re(r"[\d]+"))

# Parenthesised base-range text after the reference number.
# NOTE(review): "\R" is passed through to Martel's regex parser; presumably
# it lets the text span line breaks -- confirm against Martel.
reference_bases = Martel.Group("reference_bases",
                               Martel.Str("(") +
                               Martel.Re(r"[;\w\d \R]+") +
                               Martel.Str(")"))
# REFERENCE <num> [(<bases>)]
reference_line = Martel.Group("reference_line",
                              Martel.Str("REFERENCE") +
                              blank_space +
                              reference_num +
                              Martel.Opt(blank_space +
                                         reference_bases) +
                              Martel.AnyEol())

# Indented sub-keyword blocks within a reference.
authors_block = define_block(" AUTHORS", "authors_block", "authors")
consrtm_block = define_block(" CONSRTM", "consrtm_block", "consrtm")
title_block = define_block(" TITLE", "title_block", "title")
journal_block = define_block(" JOURNAL", "journal_block", "journal")

# MEDLINE <integer id>
medline_line = Martel.Group("medline_line",
                            Martel.Str(" MEDLINE ") +
                            Martel.Integer("medline_id") +
                            Martel.AnyEol())

# PUBMED <integer id>
pubmed_line = Martel.Group("pubmed_line",
                           Martel.Str(" PUBMED ") +
                           Martel.Integer("pubmed_id") +
                           Martel.AnyEol())

remark_block = define_block(" REMARK", "remark_block", "remark")

# A full reference: only the REFERENCE line and the JOURNAL block are
# mandatory; the other parts are optional and must appear in this order.
reference = Martel.Group("reference",
                         reference_line +
                         Martel.Opt(authors_block) +
                         Martel.Opt(consrtm_block) +
                         Martel.Opt(title_block) +
                         journal_block +
                         Martel.Opt(medline_line) +
                         Martel.Opt(pubmed_line) +
                         Martel.Opt(remark_block))
301
302
# -- COMMENT block
comment_block = define_block("COMMENT", "comment_block", "comment")

# -- PRIMARY section
# Header line listing the column titles.
primary_line = Martel.Group("primary_line",
                            Martel.Str("PRIMARY") +
                            blank_space +
                            Martel.Str("TPA_SPAN") +
                            blank_space +
                            Martel.Str("PRIMARY_IDENTIFIER") +
                            blank_space +
                            Martel.Str("PRIMARY_SPAN") +
                            blank_space +
                            Martel.Str("COMP") +
                            Martel.ToEol())

# One data row: "<n>-<n>  <identifier>  <n>-<n>  [c]". Both span patterns
# are raw strings so they are spelled identically.
primary_ref_line = Martel.Group("primary_ref_line",
                                blank_space +
                                Martel.Re(r"\d+\-\d+") +
                                blank_space +
                                Martel.Re(r"[\S]+") +
                                blank_space +
                                Martel.Re(r"\d+\-\d+") +
                                Martel.Opt(blank_space + Martel.Str("c")) +
                                Martel.ToEol())

# Header line followed by at least one data row.
primary = Martel.Group("primary",
                       primary_line +
                       Martel.Rep1(primary_ref_line))
329
330
331
332
333
334
# -- FEATURES table
# Header line of the feature table.
features_line = Martel.Group("features_line",
                             Martel.Str("FEATURES") +
                             blank_space +
                             Martel.Str("Location/Qualifiers") +
                             Martel.AnyEol())

# Feature key: word characters, apostrophes and hyphens.
feature_key = Martel.Group("feature_key",
                           Martel.Re(r"[\w'-]+"))

# An earlier ToEol/"(?!/)"-based definition of `location` was superseded by
# the UntilEol/AssertNot form below.

# Feature location: the rest of the key line, plus any continuation lines
# that start with the qualifier indent but not with "/" (which would start
# a qualifier instead).
location = Martel.Group("location",
                        Std.feature_location(Martel.UntilEol()) +
                        Martel.AnyEol() +
                        Martel.Rep(qualifier_space +
                                   Martel.AssertNot(Martel.Str("/")) +
                                   Std.feature_location(Martel.UntilEol()) +
                                   Martel.AnyEol())
                        )

feature_key_line = Martel.Group("feature_key_line",
                                big_indent_space +
                                Std.feature_name(feature_key) +
                                location)

# Quoted qualifier values: any characters except a double quote (a doubled
# "" is accepted inside the value).
quote = Martel.Str('"')
quoted_chars = Std.feature_qualifier_description(Martel.Re(r'([^"\R]|"")*'))

# A quoted value may continue over several qualifier-indented lines.
quoted_string = (quote + quoted_chars +
                 Martel.Rep(Martel.AnyEol() + qualifier_space + quoted_chars) +
                 quote + Martel.AnyEol())

# An unquoted value is the rest of the line, and must not start with a quote.
unquoted_string = (Martel.AssertNot(quote) +
                   Std.feature_qualifier_description(Martel.UntilEol()) +
                   Martel.AnyEol())

# A qualifier is "/name" alone on its line, or "/name=value".
qualifier = Std.feature_qualifier(
    qualifier_space +
    Martel.Str("/") +
    Std.feature_qualifier_name(Martel.Word("feature_qualifier_name")) +
    (Martel.AnyEol() |
     (Martel.Str("=") +
      Martel.Group("feature_qualifier_description",
                   (unquoted_string |
                    quoted_string))))
)

# A feature is its key line followed by any number of qualifiers.
feature = Std.feature(feature_key_line +
                      Martel.Rep(qualifier))

feature_block = Std.feature_block(Martel.Rep1(feature),
                                  {"location-style" : "genbank"})
396
397
# -- BASE COUNT line
# Raw strings keep the regex escapes from being read as (invalid) Python
# string escapes.
base_count = Martel.Group("base_count",
                          Martel.Re(r"[\w\d ]+"))
base_count_line = Martel.Group("base_count_line",
                               Martel.Str("BASE COUNT") +
                               blank_space +
                               base_count +
                               Martel.AnyEol())

# -- ORIGIN line: the keyword, optionally followed by text tagged
# "origin_name".
origin_line = Martel.Group("origin_line",
                           Martel.Str("ORIGIN") +
                           (Martel.ToEol("origin_name") |
                            Martel.AnyEol()))

# -- Sequence lines
# Leading base number on a sequence line.
base_number = Martel.Group("base_number",
                           Martel.Re(r"[\d]+"))
# One run of sequence characters.
sequence = Std.sequence(Martel.Group("sequence",
                                     Martel.Re(r"[\w]+")))
# One or more space-separated sequence runs, with an optional trailing space.
sequence_plus_spaces = Martel.Group("sequence_plus_spaces",
                                    Martel.Rep1(Martel.Str(" ") +
                                                Martel.Opt(sequence)) +
                                    Martel.Opt(Martel.Str(" ")))
sequence_line = Martel.Group("sequence_line",
                             blank_space +
                             Martel.Opt(base_number) +
                             sequence_plus_spaces +
                             Martel.AnyEol())

# The ORIGIN line followed by at least one sequence line.
sequence_entry = Std.sequence_block(Martel.Group("sequence_entry",
                                                 origin_line +
                                                 Martel.Rep1(sequence_line)))
430
431
432
433
# -- CONTIG block
# The location is the rest of the CONTIG line plus any continuation lines
# that start with INDENT spaces and are not followed by "/".
contig_location = Martel.Group(
    "contig_location",
    Martel.ToEol("feature_location") +
    Martel.Rep(
        Martel.Str(" " * INDENT) +
        Martel.Re("(?!/)") +
        Martel.ToEol("feature_location")))

contig_block = Martel.Group(
    "contig_block",
    Martel.Str("CONTIG") + blank_space + contig_location)
444
445
446
# -- End of record: "//" followed by one or more end-of-lines.
record_end = Martel.Group(
    "record_end",
    Martel.Str("//") + Martel.Rep1(Martel.AnyEol()))

# -- A complete GenBank record. The sections must appear in this order;
# those wrapped in Opt/Rep may be absent. The record ends with either
# sequence data (optionally preceded by BASE COUNT) or a CONTIG block.
record = Std.record(Martel.Group(
    "genbank_record",
    locus_line +
    definition_block +
    accession_block +
    Martel.Opt(nid_line) +
    Martel.Opt(pid_line) +
    Martel.Opt(version_line) +
    Martel.Opt(db_source_block) +
    keywords_block +
    Martel.Opt(segment_line) +
    source_block +
    organism_block +
    Martel.Rep(reference) +
    Martel.Opt(primary) +
    Martel.Opt(comment_block) +
    features_line +
    feature_block +
    Martel.Alt(Martel.Opt(base_count_line) + sequence_entry,
               contig_block) +
    record_end))
472
473
474
475
# Regular expression for the banner at the top of an NCBI release file.
# The pattern text spans ten lines and captures, by name: the file name, the
# release day/month/year, two data bank name lines, and the loci / bases /
# reported-sequence counts.
# NOTE(review): the group name "data_bank_name" is used on two consecutive
# lines -- presumably intentional; confirm Martel accepts repeated names.
# NOTE(review): several pattern lines begin with "*", which as written
# applies to the preceding newline; whitespace is significant inside this
# literal, so confirm the exact leading characters before editing it.
header = Martel.Re("""\
(?P<filename>[^ ]+) +Genetic Sequence Data Bank
*(?P<release_day>\d+) (?P<release_month>\w+) (?P<release_year>\d+)

*(?P<data_bank_name>[^\R]+)

*(?P<data_bank_name>[^\R]+)

*(?P<num_loci>\d+) loci, *(?P<num_bases>\d+) bases, from *(?P<num_reports>\d+) reported sequences


""")
488
# Format for a full NCBI release file: a header read as a fixed count of 10
# lines, then records each ending with "//", and no footer.
ncbi_format = Martel.HeaderFooter(
    "genbank", {"format" : "ncbi_genbank"},
    header, RecordReader.CountLines, (10,),
    record, RecordReader.EndsWith, ("//",),
    None, None, None,
)

# Format for a plain stream of records, each starting with "LOCUS ".
format = Martel.ParseRecords(
    "genbank", {"format" : "genbank"},
    record, RecordReader.StartsWith, ("LOCUS ",))
497