Package Bio :: Package GenBank :: Module Record
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Record

  1  """Hold GenBank data in a straightforward format. 
  2   
  3  classes: 
  4  o Record - All of the information in a GenBank record. 
  5  o Reference - Hold reference data for a record. 
  6  o Feature - Hold the information in a Feature Table. 
  7  o Qualifier - Qualifiers on a Feature. 
  8  17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg 
  9  """ 
 10  # local stuff 
 11  import Bio.GenBank 
 12   
def _wrapped_genbank(information, indent, wrap_space = 1, split_char = " "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

    o information - The string holding the information we want
    wrapped in GenBank method.

    o indent - The indentation on the lines we are writing. The usable
    width of each line is Record.GB_LINE_LENGTH minus this value.

    o wrap_space - When true, the information is broken only at
    occurrences of split_char. When false, the information is cut into
    fixed-width chunks regardless of its content.

    o split_char - A specific character to split the lines on. By default
    spaces are used.

    Returns the wrapped text. The first line carries no indent (the caller
    has already written the keyword column), every following line is
    prefixed with indent spaces, and every line ends with a newline. When
    there is nothing to write, a single newline is returned so the keyword
    the caller already emitted is still terminated.

    Raises ValueError if fixed-width chunking is requested and the indent
    leaves no room for any text.
    """
    info_length = Record.GB_LINE_LENGTH - indent

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        # a non-positive chunk width would never advance the cursor below
        if info_length < 1:
            raise ValueError("indent %r leaves no room for text on a "
                             "%i column line"
                             % (indent, Record.GB_LINE_LENGTH))
        cur_pos = 0
        info_parts = []
        while cur_pos < len(information):
            info_parts.append(information[cur_pos: cur_pos + info_length])
            cur_pos += info_length

    # first get the information string split up by line
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        if len(cur_part) + 1 + len(info_part) > info_length:
            if cur_part:
                # a non-space separator is part of the data, so keep it at
                # the end of the line that is being closed
                if split_char != " ":
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        else:
            if cur_part == "":
                cur_part = info_part
            else:
                cur_part += split_char + info_part

    # add the last bit of information to the output
    if cur_part:
        output_parts.append(cur_part)

    # empty (or separator-only) information produces no parts at all;
    # indexing output_parts[0] here used to raise IndexError
    if not output_parts:
        return "\n"

    # now format the information string for return
    continuation = "\n" + " " * indent
    return continuation.join(output_parts) + "\n"
70
71 -def _indent_genbank(information, indent):
72 """Write out information with the specified indent. 73 74 Unlike _wrapped_genbank, this function makes no attempt to wrap 75 lines -- it assumes that the information already has newlines in the 76 appropriate places, and will add the specified indent to the start of 77 each line. 78 """ 79 # split the info into lines based on line breaks 80 info_parts = information.split("\n") 81 82 # the first line will have no indent 83 output_info = info_parts[0] + "\n" 84 for info_part in info_parts[1:]: 85 output_info += " " * indent + info_part + "\n" 86 87 return output_info
88
class Record:
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o definition - The text written on the DEFINITION line.
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number
    o version - The accession number + version (ie. AB01234.2)
    o projects - A list of genome sequencing project numbers
    (will be replaced by the dblink cross-references in 2009).
    o dblinks - A list with the genome sequencing project number(s) and
    other links (will replace the project information in 2009).
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq file
    o primary - A list; it is initialised here but not written by __str__.
    o wgs - A string with the content of the WGS line.
    o wgs_scafld - A list with the content of each WGS_SCAFLD line.
    """
    # constants for outputting GenBank information
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
                      str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
                            str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
                            "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
                              str(GB_FEATURE_INDENT -
                                  GB_FEATURE_INTERNAL_INDENT) + "s"
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def __init__(self):
        """Create an empty record; every field starts out blank."""
        self.locus = ''
        self.size = ''
        self.residue_type = ''
        self.data_file_division = ''
        self.date = ''
        self.definition = ''
        self.accession = []
        self.nid = ''
        self.pid = ''
        self.version = ''
        self.projects = []
        self.dblinks = []
        self.db_source = ''
        self.gi = ''
        self.keywords = []
        self.segment = ''
        self.source = ''
        self.organism = ''
        self.taxonomy = []
        self.references = []
        self.comment = ''
        self.features = []
        self.base_counts = ''
        self.origin = ''
        self.sequence = ''
        self.contig = ''
        self.primary = []
        self.wgs = ''
        self.wgs_scafld = []

    def __str__(self):
        """Provide a GenBank formatted output option for a Record.

        The objective of this is to provide an easy way to read in a GenBank
        record, modify it somehow, and then output it in 'GenBank format.'
        We are striving to make this work so that a parsed Record that is
        output using this function will look exactly like the original
        record.

        Much of the output is based on format description info at:

        ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
        """
        output = self._locus_line()
        output += self._definition_line()
        output += self._accession_line()
        output += self._version_line()
        output += self._project_line()
        output += self._dblink_line()
        output += self._nid_line()
        output += self._pid_line()
        output += self._keywords_line()
        output += self._db_source_line()
        output += self._segment_line()
        output += self._source_line()
        output += self._organism_line()
        for reference in self.references:
            output += str(reference)
        output += self._comment_line()
        output += self._features_line()
        for feature in self.features:
            output += str(feature)
        output += self._base_count_line()
        output += self._origin_line()
        output += self._sequence_line()
        output += self._wgs_line()
        output += self._wgs_scafld_line()
        output += self._contig_line()
        output += "//"
        return output

    def _locus_line(self):
        """Provide the output string for the LOCUS line.
        """
        output = "LOCUS"
        output += " " * 7 # 6-12 spaces
        output += "%-9s" % self.locus
        output += " " # 22 space
        output += "%7s" % self.size
        if "PROTEIN" in self.residue_type:
            output += " aa"
        else:
            output += " bp "

        # treat circular types differently, since they'll have long residue
        # types
        if "circular" in self.residue_type:
            output += "%17s" % self.residue_type
        # second case: ss-DNA types of records
        elif "-" in self.residue_type:
            output += "%7s" % self.residue_type
            output += " " * 10 # spaces for circular
        else:
            output += " " * 3 # spaces for stuff like ss-
            output += "%-4s" % self.residue_type
            output += " " * 10 # spaces for circular

        output += " " * 2
        output += "%3s" % self.data_file_division
        output += " " * 7 # spaces for 56-63
        output += "%11s" % self.date
        output += "\n"
        return output

    def _definition_line(self):
        """Provide output for the DEFINITION line.
        """
        output = Record.BASE_FORMAT % "DEFINITION"
        output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT)
        return output

    def _accession_line(self):
        """Output for the ACCESSION line.

        Returns an empty string when there are no accession numbers.
        """
        if not self.accession:
            return ""
        # space separated; rstrip drops any trailing whitespace
        acc_info = " ".join(["%s" % acc for acc in self.accession]).rstrip()
        output = Record.BASE_FORMAT % "ACCESSION"
        output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)
        return output

    def _version_line(self):
        """Output for the VERSION line.

        Returns an empty string when no version is set.
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        output += " GI:"
        output += "%s\n" % self.gi
        return output

    def _project_line(self):
        """Output for the PROJECT line; empty when there are no projects.
        """
        output = ""
        if self.projects:
            output = Record.BASE_FORMAT % "PROJECT"
            output += "%s\n" % " ".join(self.projects)
        return output

    def _dblink_line(self):
        """Output for the DBLINK lines; empty when there are no dblinks.

        __str__ calls this method, so it has to exist for a record to be
        printable at all.
        """
        # NOTE(review): the body of this method was not visible when this
        # class was revised; one link per line, indented like other
        # continuation lines, is assumed -- confirm against the parser.
        output = ""
        if self.dblinks:
            output = Record.BASE_FORMAT % "DBLINK"
            output += _indent_genbank("\n".join(self.dblinks),
                                      Record.GB_BASE_INDENT)
        return output

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.
        """
        if not self.nid:
            return ""
        return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid

    def _pid_line(self):
        """Output for PID line. Presumedly, PID usage is also obsolete.
        """
        if not self.pid:
            return ""
        return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid

    def _keywords_line(self):
        """Output for the KEYWORDS line.

        The line is always written; a record without keywords gets a lone
        period after the keyword column.
        """
        # keywords are separated by '; ' and the list is closed by a period
        keyword_info = "; ".join(["%s" % word for word in self.keywords]) + "."
        output = Record.BASE_FORMAT % "KEYWORDS"
        output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)
        return output

    def _db_source_line(self):
        """Output for DBSOURCE line.
        """
        if not self.db_source:
            return ""
        return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source

    def _segment_line(self):
        """Output for the SEGMENT line.
        """
        output = ""
        if self.segment:
            output += Record.BASE_FORMAT % "SEGMENT"
            output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)
        return output

    def _source_line(self):
        """Output for SOURCE line on where the sample came from.
        """
        output = Record.BASE_FORMAT % "SOURCE"
        output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT)
        return output

    def _organism_line(self):
        """Output for ORGANISM line with taxonomy info.
        """
        output = Record.INTERNAL_FORMAT % "ORGANISM"
        # Now that species names can be too long, this line can wrap (Bug 2591)
        output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT)
        # the taxonomy starts on its own, indented, line
        output += " " * Record.GB_BASE_INDENT
        # levels are separated by '; ' and the list is closed by a period
        taxonomy_info = "; ".join(["%s" % tax for tax in self.taxonomy]) + "."
        output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)
        return output

    def _comment_line(self):
        """Output for the COMMENT lines.
        """
        output = ""
        if self.comment:
            output += Record.BASE_FORMAT % "COMMENT"
            output += _indent_genbank(self.comment,
                                      Record.GB_BASE_INDENT)
        return output

    def _features_line(self):
        """Output for the FEATURES line.
        """
        output = ""
        if self.features:
            output += Record.BASE_FEATURE_FORMAT % "FEATURES"
            output += "Location/Qualifiers\n"
        return output

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information.
        """
        if not self.base_counts:
            return ""
        output = Record.BASE_FORMAT % "BASE COUNT "
        # split up the base counts into their individual parts
        count_parts = [part for part in self.base_counts.split(" ") if part]
        # deal with the standard case, with a normal origin line
        # like: 474 a 356 c 428 g 364 t
        if len(count_parts) % 2 == 0:
            for pos in range(0, len(count_parts), 2):
                output += "%7s %s" % (count_parts[pos], count_parts[pos + 1])
        # deal with ugly ORIGIN lines like:
        # 1311257 a2224835 c2190093 g1309889 t
        # by just outputting the raw information
        else:
            output += self.base_counts
        return output + "\n"

    def _origin_line(self):
        """Output for the ORIGIN line
        """
        output = ""
        # only output the ORIGIN line if we have a sequence
        if self.sequence:
            output += Record.BASE_FORMAT % "ORIGIN"
            if self.origin:
                output += _wrapped_genbank(self.origin,
                                           Record.GB_BASE_INDENT)
            else:
                output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        Each line starts with the 1-based position of its first residue,
        followed by up to six lower-cased sections of ten residues.
        """
        lines = []
        seq_length = len(self.sequence)
        for line_start in range(0, seq_length, 60):
            line = Record.SEQUENCE_FORMAT % str(line_start + 1)
            # only visit sections that actually hold residues, so a
            # sequence ending exactly on a section boundary does not get
            # an empty section (a trailing blank) appended
            line_end = min(line_start + 60, seq_length)
            for start_pos in range(line_start, line_end, 10):
                section = self.sequence[start_pos:start_pos + 10]
                line += " %s" % section.lower()
            lines.append(line + "\n")
        return "".join(lines)

    def _wgs_line(self):
        """Output for the WGS line; empty when no WGS information is set.
        """
        output = ""
        if self.wgs:
            output += Record.BASE_FORMAT % "WGS"
            output += self.wgs
            # terminate the line so the next keyword (or the closing //)
            # does not end up on the same line
            if not output.endswith("\n"):
                output += "\n"
        return output

    def _wgs_scafld_line(self):
        """Output for the WGS_SCAFLD lines; empty when there are none.

        wgs_scafld starts out as a list, so every entry is written on a
        WGS_SCAFLD line of its own. A single plain string is accepted as
        well and treated as one entry.
        """
        scaffolds = self.wgs_scafld
        if not scaffolds:
            return ""
        if not isinstance(scaffolds, (list, tuple)):
            scaffolds = [scaffolds]
        output = ""
        for scaffold in scaffolds:
            output += Record.BASE_FORMAT % "WGS_SCAFLD"
            output += "%s" % scaffold
            if not output.endswith("\n"):
                output += "\n"
        return output

    def _contig_line(self):
        """Output for CONTIG location information from RefSeq.
        """
        output = ""
        if self.contig:
            output += Record.BASE_FORMAT % "CONTIG"
            output += _wrapped_genbank(self.contig,
                                       Record.GB_BASE_INDENT, split_char = ',')
        return output
494 495
class Reference:
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """
    def __init__(self):
        """Create an empty reference; every field starts as an empty string."""
        self.number = ''
        self.bases = ''
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.remark = ''

    def __str__(self):
        """Return the reference as GenBank formatted text.

        The REFERENCE line is always written; each of the other lines is
        only written when the matching attribute is set.
        """
        sections = [
            self._reference_line(),
            self._authors_line(),
            self._consrtm_line(),
            self._title_line(),
            self._journal_line(),
            self._medline_line(),
            self._pubmed_line(),
            self._remark_line(),
        ]
        return "".join(sections)

    def _reference_line(self):
        """Output for REFERENCE lines.
        """
        line = Record.BASE_FORMAT % "REFERENCE"
        if self.number and self.bases:
            # the number is padded so the base range starts in a fixed column
            line += "%-3s" % self.number
            line += "%s" % self.bases
        elif self.number:
            line += "%s" % self.number
        return line + "\n"

    def _authors_line(self):
        """Output for AUTHORS information.
        """
        if not self.authors:
            return ""
        return (Record.INTERNAL_FORMAT % "AUTHORS"
                + _wrapped_genbank(self.authors, Record.GB_BASE_INDENT))

    def _consrtm_line(self):
        """Output for CONSRTM information.
        """
        if not self.consrtm:
            return ""
        return (Record.INTERNAL_FORMAT % "CONSRTM"
                + _wrapped_genbank(self.consrtm, Record.GB_BASE_INDENT))

    def _title_line(self):
        """Output for TITLE information.
        """
        if not self.title:
            return ""
        return (Record.INTERNAL_FORMAT % "TITLE"
                + _wrapped_genbank(self.title, Record.GB_BASE_INDENT))

    def _journal_line(self):
        """Output for JOURNAL information.
        """
        if not self.journal:
            return ""
        return (Record.INTERNAL_FORMAT % "JOURNAL"
                + _wrapped_genbank(self.journal, Record.GB_BASE_INDENT))

    def _medline_line(self):
        """Output for MEDLINE information.
        """
        if not self.medline_id:
            return ""
        return Record.INTERNAL_FORMAT % "MEDLINE" + self.medline_id + "\n"

    def _pubmed_line(self):
        """Output for PUBMED information.
        """
        if not self.pubmed_id:
            return ""
        # PUBMED uses the deeper internal indent, unlike the other keywords
        return Record.OTHER_INTERNAL_FORMAT % "PUBMED" + self.pubmed_id + "\n"

    def _remark_line(self):
        """Output for REMARK information.
        """
        if not self.remark:
            return ""
        return (Record.INTERNAL_FORMAT % "REMARK"
                + _wrapped_genbank(self.remark, Record.GB_BASE_INDENT))
609
class Feature:
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """
    def __init__(self):
        """Create an empty feature without any qualifiers."""
        self.key = ''
        self.location = ''
        self.qualifiers = []

    def __str__(self):
        """Return the feature as GenBank formatted text.

        The key and the location (wrapped on commas) come first, followed
        by every qualifier on its own indented, wrapped line(s).
        """
        text = Record.INTERNAL_FEATURE_FORMAT % self.key
        text += _wrapped_genbank(self.location, Record.GB_FEATURE_INDENT,
                                 split_char = ',')
        qualifier_indent = " " * Record.GB_FEATURE_INDENT
        for qualifier in self.qualifiers:
            text += qualifier_indent

            # determine whether we can wrap on spaces: qualifiers whose key
            # contains one of the listed keys are cut at a fixed width
            matches = [no_space_key for no_space_key in
                       Bio.GenBank._BaseGenBankConsumer.remove_space_keys
                       if no_space_key in qualifier.key]
            if matches:
                space_wrap = 0
            else:
                space_wrap = 1

            text += _wrapped_genbank(qualifier.key + qualifier.value,
                                     Record.GB_FEATURE_INDENT, space_wrap)
        return text
640
class Qualifier:
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (ie. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """
    def __init__(self):
        """Create a qualifier whose key and value are both empty strings."""
        self.key = self.value = ''
651