Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2008 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from time import gmtime, strftime 
  20   
  21  # biopython 
  22  from Bio import Alphabet 
  23  from Bio.SeqUtils.CheckSum import crc64 
  24  from Bio import Entrez 
  25  from Bio.Seq import UnknownSeq 
  26   
27 -class DatabaseLoader:
28 """Object used to load SeqRecord objects into a BioSQL database."""
29 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
30 """Initialize with connection information for the database. 31 32 Creating a DatabaseLoader object is normally handled via the 33 BioSeqDatabase DBServer object, for example: 34 35 from BioSQL import BioSeqDatabase 36 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse", 37 passwd = "biosql", host = "localhost", db="test_biosql") 38 try : 39 db = server["test"] 40 except KeyError : 41 db = server.new_database("test", description="For testing GBrowse") 42 """ 43 self.adaptor = adaptor 44 self.dbid = dbid 45 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
46
47 - def load_seqrecord(self, record):
48 """Load a Biopython SeqRecord into the database. 49 """ 50 bioentry_id = self._load_bioentry_table(record) 51 self._load_bioentry_date(record, bioentry_id) 52 self._load_biosequence(record, bioentry_id) 53 self._load_comment(record, bioentry_id) 54 self._load_dbxrefs(record, bioentry_id) 55 references = record.annotations.get('references', ()) 56 for reference, rank in zip(references, range(len(references))): 57 self._load_reference(reference, rank, bioentry_id) 58 self._load_annotations(record, bioentry_id) 59 for seq_feature_num in range(len(record.features)): 60 seq_feature = record.features[seq_feature_num] 61 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
62
63 - def _get_ontology_id(self, name, definition=None):
64 """Returns the identifier for the named ontology (PRIVATE). 65 66 This looks through the onotology table for a the given entry name. 67 If it is not found, a row is added for this ontology (using the 68 definition if supplied). In either case, the id corresponding to 69 the provided name is returned, so that you can reference it in 70 another table. 71 """ 72 oids = self.adaptor.execute_and_fetch_col0( 73 "SELECT ontology_id FROM ontology WHERE name = %s", 74 (name,)) 75 if oids: 76 return oids[0] 77 self.adaptor.execute( 78 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 79 (name, definition)) 80 return self.adaptor.last_id("ontology")
81 82
83 - def _get_term_id(self, 84 name, 85 ontology_id=None, 86 definition=None, 87 identifier=None):
88 """Get the id that corresponds to a term (PRIVATE). 89 90 This looks through the term table for a the given term. If it 91 is not found, a new id corresponding to this term is created. 92 In either case, the id corresponding to that term is returned, so 93 that you can reference it in another table. 94 95 The ontology_id should be used to disambiguate the term. 96 """ 97 98 # try to get the term id 99 sql = r"SELECT term_id FROM term " \ 100 r"WHERE name = %s" 101 fields = [name] 102 if ontology_id: 103 sql += ' AND ontology_id = %s' 104 fields.append(ontology_id) 105 id_results = self.adaptor.execute_and_fetchall(sql, fields) 106 # something is wrong 107 if len(id_results) > 1: 108 raise ValueError("Multiple term ids for %s: %r" % 109 (name, id_results)) 110 elif len(id_results) == 1: 111 return id_results[0][0] 112 else: 113 sql = r"INSERT INTO term (name, definition," \ 114 r" identifier, ontology_id)" \ 115 r" VALUES (%s, %s, %s, %s)" 116 self.adaptor.execute(sql, (name, definition, 117 identifier, ontology_id)) 118 return self.adaptor.last_id("term")
119
120 - def _add_dbxref(self, dbname, accession, version):
121 """Insert a dbxref and return its id.""" 122 123 self.adaptor.execute( 124 "INSERT INTO dbxref(dbname, accession, version)" \ 125 " VALUES (%s, %s, %s)", (dbname, accession, version)) 126 return self.adaptor.last_id("dbxref")
127
128 - def _get_taxon_id(self, record):
129 """Get the taxon id for this record (PRIVATE). 130 131 record - a SeqRecord object 132 133 This searches the taxon/taxon_name tables using the 134 NCBI taxon ID, scientific name and common name to find 135 the matching taxon table entry's id. 136 137 If the species isn't in the taxon table, and we have at 138 least the NCBI taxon ID, scientific name or common name, 139 at least a minimal stub entry is created in the table. 140 141 Returns the taxon id (database key for the taxon table, 142 not an NCBI taxon ID), or None if the taxonomy information 143 is missing. 144 145 See also the BioSQL script load_ncbi_taxonomy.pl which 146 will populate and update the taxon/taxon_name tables 147 with the latest information from the NCBI. 148 """ 149 150 # To find the NCBI taxid, first check for a top level annotation 151 ncbi_taxon_id = None 152 if "ncbi_taxid" in record.annotations : 153 #Could be a list of IDs. 154 if isinstance(record.annotations["ncbi_taxid"],list) : 155 if len(record.annotations["ncbi_taxid"])==1 : 156 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 157 else : 158 ncbi_taxon_id = record.annotations["ncbi_taxid"] 159 if not ncbi_taxon_id: 160 # Secondly, look for a source feature 161 for f in record.features: 162 if f.type == 'source': 163 quals = getattr(f, 'qualifiers', {}) 164 if "db_xref" in quals: 165 for db_xref in f.qualifiers["db_xref"]: 166 if db_xref.startswith("taxon:"): 167 ncbi_taxon_id = int(db_xref[6:]) 168 break 169 if ncbi_taxon_id: break 170 171 try : 172 scientific_name = record.annotations["organism"][:255] 173 except KeyError : 174 scientific_name = None 175 try : 176 common_name = record.annotations["source"][:255] 177 except KeyError : 178 common_name = None 179 # Note: The maximum length for taxon names in the schema is 255. 180 # Cropping it now should help in getting a match when searching, 181 # and avoids an error if we try and add these to the database. 182 183 184 if ncbi_taxon_id: 185 #Good, we have the NCBI taxon to go on - this is unambiguous :) 186 #Note that the scientific name and common name will only be 187 #used if we have to record a stub entry. 188 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 189 scientific_name, 190 common_name) 191 192 if not common_name and not scientific_name : 193 # Nothing to go on... and there is no point adding 194 # a new entry to the database. We'll just leave this 195 # sequence's taxon as a NULL in the database. 196 return None 197 198 # Next, we'll try to find a match based on the species name 199 # (stored in GenBank files as the organism and/or the source). 200 if scientific_name: 201 taxa = self.adaptor.execute_and_fetch_col0( 202 "SELECT taxon_id FROM taxon_name" \ 203 " WHERE name_class = 'scientific name' AND name = %s", 204 (scientific_name,)) 205 if taxa: 206 #Good, mapped the scientific name to a taxon table entry 207 return taxa[0] 208 209 # Last chance... 210 if common_name: 211 taxa = self.adaptor.execute_and_fetch_col0( 212 "SELECT DISTINCT taxon_id FROM taxon_name" \ 213 " WHERE name = %s", 214 (common_name,)) 215 #Its natural that several distinct taxa will have the same common 216 #name - in which case we can't resolve the taxon uniquely. 217 if len(taxa) > 1: 218 raise ValueError("Taxa: %d species have name %r" % ( 219 len(taxa), 220 common_name)) 221 if taxa: 222 #Good, mapped the common name to a taxon table entry 223 return taxa[0] 224 225 # At this point, as far as we can tell, this species isn't 226 # in the taxon table already. So we'll have to add it. 227 # We don't have an NCBI taxonomy ID, so if we do record just 228 # a stub entry, there is no simple way to fix this later. 229 # 230 # TODO - Should we try searching the NCBI taxonomy using the 231 # species name? 232 # 233 # OK, let's try inserting the species. 234 # Chances are we don't have enough information ... 235 # Furthermore, it won't be in the hierarchy. 236 237 lineage = [] 238 for c in record.annotations.get("taxonomy", []): 239 lineage.append([None, None, c]) 240 if lineage: 241 lineage[-1][1] = "genus" 242 lineage.append([None, "species", record.annotations["organism"]]) 243 # XXX do we have them? 244 if "subspecies" in record.annotations: 245 lineage.append([None, "subspecies", 246 record.annotations["subspecies"]]) 247 if "variant" in record.annotations: 248 lineage.append([None, "varietas", 249 record.annotations["variant"]]) 250 lineage[-1][0] = ncbi_taxon_id 251 252 left_value = self.adaptor.execute_one( 253 "SELECT MAX(left_value) FROM taxon")[0] 254 if not left_value: 255 left_value = 0 256 left_value += 1 257 258 # XXX -- Brad: Fixing this for now in an ugly way because 259 # I am getting overlaps for right_values. I need to dig into this 260 # more to actually understand how it works. I'm not sure it is 261 # actually working right anyhow. 262 right_start_value = self.adaptor.execute_one( 263 "SELECT MAX(right_value) FROM taxon")[0] 264 if not right_start_value: 265 right_start_value = 0 266 right_value = right_start_value + 2 * len(lineage) - 1 267 268 parent_taxon_id = None 269 for taxon in lineage: 270 self.adaptor.execute( 271 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 272 " left_value, right_value)" \ 273 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 274 taxon[0], 275 taxon[1], 276 left_value, 277 right_value)) 278 taxon_id = self.adaptor.last_id("taxon") 279 self.adaptor.execute( 280 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 281 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 282 #Note the name field is limited to 255, some SwissProt files 283 #have a multi-species name which can be longer. So truncate this. 284 left_value += 1 285 right_value -= 1 286 parent_taxon_id = taxon_id 287 if common_name: 288 self.adaptor.execute( 289 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 290 "VALUES (%s, %s, 'common name')", ( 291 taxon_id, common_name)) 292 293 return taxon_id
294
295 - def _fix_name_class(self, entrez_name) :
296 """Map Entrez name terms to those used in taxdump (PRIVATE). 297 298 We need to make this conversion to match the taxon_name.name_class 299 values used by the BioSQL load_ncbi_taxonomy.pl script. 300 301 e.g. 302 "ScientificName" -> "scientific name", 303 "EquivalentName" -> "equivalent name", 304 "Synonym" -> "synonym", 305 """ 306 #Add any special cases here: 307 # 308 #known = {} 309 #try : 310 # return known[entrez_name] 311 #except KeyError: 312 # pass 313 314 #Try automatically by adding spaces before each capital 315 def add_space(letter) : 316 if letter.isupper() : 317 return " "+letter.lower() 318 else : 319 return letter
320 answer = "".join([add_space(letter) for letter in entrez_name]).strip() 321 assert answer == answer.lower() 322 return answer
323
324 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 325 scientific_name = None, 326 common_name = None):
327 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE). 328 329 ncbi_taxon_id - string containing an NCBI taxon id 330 scientific_name - string, used if a stub entry is recorded 331 common_name - string, used if a stub entry is recorded 332 333 This searches the taxon table using ONLY the NCBI taxon ID 334 to find the matching taxon table entry's ID (database key). 335 336 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 337 flag is true, Biopython will attempt to go online using Bio.Entrez 338 to fetch the official NCBI lineage, recursing up the tree until an 339 existing entry is found in the database or the full lineage has been 340 fetched. 341 342 Otherwise the NCBI taxon ID, scientific name and common name are 343 recorded as a minimal stub entry in the taxon and taxon_name tables. 344 Any partial information about the lineage from the SeqRecord is NOT 345 recorded. This should mean that (re)running the BioSQL script 346 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 347 348 Returns the taxon id (database key for the taxon table, not 349 an NCBI taxon ID). 350 """ 351 assert ncbi_taxon_id 352 353 taxon_id = self.adaptor.execute_and_fetch_col0( 354 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 355 (ncbi_taxon_id,)) 356 if taxon_id: 357 #Good, we have mapped the NCBI taxid to a taxon table entry 358 return taxon_id[0] 359 360 # At this point, as far as we can tell, this species isn't 361 # in the taxon table already. So we'll have to add it. 362 363 parent_taxon_id = None 364 rank = "species" 365 genetic_code = None 366 mito_genetic_code = None 367 species_names = [] 368 if scientific_name : 369 species_names.append(("scientific name", scientific_name)) 370 if common_name : 371 species_names.append(("common name", common_name)) 372 373 if self.fetch_NCBI_taxonomy : 374 #Go online to get the parent taxon ID! 375 handle = Entrez.efetch(db="taxonomy",id=ncbi_taxon_id,retmode="XML") 376 taxonomic_record = Entrez.read(handle) 377 if len(taxonomic_record) == 1: 378 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 379 "%s versus %s" % (taxonomic_record[0]["TaxId"], 380 ncbi_taxon_id) 381 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( \ 382 taxonomic_record[0]["LineageEx"]) 383 rank = taxonomic_record[0]["Rank"] 384 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"] 385 mito_genetic_code = taxonomic_record[0]["MitoGeneticCode"]["MGCId"] 386 species_names = [("scientific name", 387 taxonomic_record[0]["ScientificName"])] 388 try : 389 for name_class, names in taxonomic_record[0]["OtherNames"].iteritems(): 390 name_class = self._fix_name_class(name_class) 391 if not isinstance(names, list) : 392 #The Entrez parser seems to return single entry 393 #lists as just a string which is annoying. 394 names = [names] 395 for name in names : 396 #Want to ignore complex things like ClassCDE entries 397 if isinstance(name, basestring) : 398 species_names.append((name_class, name)) 399 except KeyError : 400 #OtherNames isn't always present, 401 #e.g. NCBI taxon 41205, Bromheadia finlaysoniana 402 pass 403 else : 404 pass 405 # If we are not allowed to go online, we will record the bare minimum; 406 # as long as the NCBI taxon id is present, then (re)running 407 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 408 # (and update the species names). 409 # 410 # I am NOT going to try and record the lineage, even if it 411 # is in the record annotation as a list of names, as we won't 412 # know the NCBI taxon IDs for these parent nodes. 413 414 self.adaptor.execute( 415 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 416 " genetic_code, mito_genetic_code, left_value, right_value)" \ 417 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 418 ncbi_taxon_id, 419 rank, 420 genetic_code, 421 mito_genetic_code, 422 None, 423 None)) 424 taxon_id = self.adaptor.last_id("taxon") 425 426 #Record the scientific name, common name, etc 427 for name_class, name in species_names : 428 self.adaptor.execute( 429 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 430 " VALUES (%s, %s, %s)", (taxon_id, 431 name[:255], 432 name_class)) 433 return taxon_id
434
435 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage) :
436 """This is recursive! (PRIVATE). 437 438 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 439 440 First dictionary in list is the taxonomy root, highest would be the species. 441 Each dictionary includes: 442 - TaxID (string, NCBI taxon id) 443 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 444 - ScientificName (string) 445 (and that is all at the time of writing) 446 447 This method will record all the lineage given, returning the the taxon id 448 (database key, not NCBI taxon id) of the final entry (the species). 449 """ 450 ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"] 451 452 #Is this in the database already? Check the taxon table... 453 taxon_id = self.adaptor.execute_and_fetch_col0( 454 "SELECT taxon_id FROM taxon" \ 455 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 456 if taxon_id: 457 # we could verify that the Scientific Name etc in the database 458 # is the same and update it or print a warning if not... 459 if isinstance(taxon_id, list) : 460 assert len(taxon_id)==1 461 return taxon_id[0] 462 else : 463 return taxon_id 464 465 #We have to record this. 466 if len(taxonomic_lineage) > 1 : 467 #Use recursion to find out the taxon id (database key) of the parent. 468 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1]) 469 assert isinstance(parent_taxon_id, int) or isinstance(parent_taxon_id, long), repr(parent_taxon_id) 470 else : 471 parent_taxon_id = None 472 473 # INSERT new taxon 474 rank = taxonomic_lineage[-1].get("Rank", None) 475 self.adaptor.execute( 476 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)"\ 477 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank)) 478 taxon_id = self.adaptor.last_id("taxon") 479 assert isinstance(taxon_id, int) or isinstance(taxon_id, long), repr(taxon_id) 480 # ... and its name in taxon_name 481 scientific_name = taxonomic_lineage[-1].get("ScientificName", None) 482 if scientific_name : 483 self.adaptor.execute( 484 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 485 " VALUES (%s, %s, 'scientific name')", (taxon_id, 486 scientific_name[:255])) 487 return taxon_id
488 489
490 - def _load_bioentry_table(self, record):
491 """Fill the bioentry table with sequence information (PRIVATE). 492 493 record - SeqRecord object to add to the database. 494 """ 495 # get the pertinent info and insert it 496 497 if record.id.count(".") == 1: # try to get a version from the id 498 #This assumes the string is something like "XXXXXXXX.123" 499 accession, version = record.id.split('.') 500 try : 501 version = int(version) 502 except ValueError : 503 accession = record.id 504 version = 0 505 else: # otherwise just use a version of 0 506 accession = record.id 507 version = 0 508 509 if "accessions" in record.annotations \ 510 and isinstance(record.annotations["accessions"],list) \ 511 and record.annotations["accessions"] : 512 #Take the first accession (one if there is more than one) 513 accession = record.annotations["accessions"][0] 514 515 #Find the taxon id (this is not just the NCBI Taxon ID) 516 #NOTE - If the species isn't defined in the taxon table, 517 #a new minimal entry is created. 518 taxon_id = self._get_taxon_id(record) 519 520 if "gi" in record.annotations : 521 identifier = record.annotations["gi"] 522 else : 523 identifier = record.id 524 525 #Allow description and division to default to NULL as in BioPerl. 526 description = getattr(record, 'description', None) 527 division = record.annotations.get("data_file_division", None) 528 529 sql = """ 530 INSERT INTO bioentry ( 531 biodatabase_id, 532 taxon_id, 533 name, 534 accession, 535 identifier, 536 division, 537 description, 538 version) 539 VALUES ( 540 %s, 541 %s, 542 %s, 543 %s, 544 %s, 545 %s, 546 %s, 547 %s)""" 548 #print self.dbid, taxon_id, record.name, accession, identifier, \ 549 # division, description, version 550 self.adaptor.execute(sql, (self.dbid, 551 taxon_id, 552 record.name, 553 accession, 554 identifier, 555 division, 556 description, 557 version)) 558 # now retrieve the id for the bioentry 559 bioentry_id = self.adaptor.last_id('bioentry') 560 561 return bioentry_id
562
563 - def _load_bioentry_date(self, record, bioentry_id):
564 """Add the effective date of the entry into the database. 565 566 record - a SeqRecord object with an annotated date 567 bioentry_id - corresponding database identifier 568 """ 569 # dates are GenBank style, like: 570 # 14-SEP-2000 571 date = record.annotations.get("date", 572 strftime("%d-%b-%Y", gmtime()).upper()) 573 if isinstance(date, list) : date = date[0] 574 annotation_tags_id = self._get_ontology_id("Annotation Tags") 575 date_id = self._get_term_id("date_changed", annotation_tags_id) 576 sql = r"INSERT INTO bioentry_qualifier_value" \ 577 r" (bioentry_id, term_id, value, rank)" \ 578 r" VALUES (%s, %s, %s, 1)" 579 self.adaptor.execute(sql, (bioentry_id, date_id, date))
580
581 - def _load_biosequence(self, record, bioentry_id):
582 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE). 583 584 record - a SeqRecord object with a seq property 585 bioentry_id - corresponding database identifier 586 """ 587 if record.seq is None : 588 #The biosequence table entry is optional, so if we haven't 589 #got a sequence, we don't need to write to the table. 590 return 591 592 # determine the string representation of the alphabet 593 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 594 alphabet = "dna" 595 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 596 alphabet = "rna" 597 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 598 alphabet = "protein" 599 else: 600 alphabet = "unknown" 601 602 if isinstance(record.seq, UnknownSeq) : 603 seq_str = None 604 else : 605 seq_str = str(record.seq) 606 607 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 608 r"length, seq, alphabet) " \ 609 r"VALUES (%s, 0, %s, %s, %s)" 610 self.adaptor.execute(sql, (bioentry_id, 611 len(record.seq), 612 seq_str, 613 alphabet))
614
615 - def _load_comment(self, record, bioentry_id):
616 """Record a SeqRecord's annotated comment in the database (PRIVATE). 617 618 record - a SeqRecord object with an annotated comment 619 bioentry_id - corresponding database identifier 620 """ 621 comments = record.annotations.get('comment') 622 if not comments: 623 return 624 if not isinstance(comments, list) : 625 #It should be a string then... 626 comments = [comments] 627 628 for index, comment in enumerate(comments) : 629 comment = comment.replace('\n', ' ') 630 #TODO - Store each line as a separate entry? This would preserve 631 #the newlines, but we should check BioPerl etc to be consistent. 632 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 633 " VALUES (%s, %s, %s)" 634 self.adaptor.execute(sql, (bioentry_id, comment, index+1))
635
636 - def _load_annotations(self, record, bioentry_id) :
637 """Record a SeqRecord's misc annotations in the database (PRIVATE). 638 639 The annotation strings are recorded in the bioentry_qualifier_value 640 table, except for special cases like the reference, comment and 641 taxonomy which are handled with their own tables. 642 643 record - a SeqRecord object with an annotations dictionary 644 bioentry_id - corresponding database identifier 645 """ 646 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 647 "(bioentry_id, term_id, value)" \ 648 " VALUES (%s, %s, %s)" 649 many_sql = "INSERT INTO bioentry_qualifier_value" \ 650 "(bioentry_id, term_id, value, rank)" \ 651 " VALUES (%s, %s, %s, %s)" 652 tag_ontology_id = self._get_ontology_id('Annotation Tags') 653 for key, value in record.annotations.iteritems() : 654 if key in ["references", "comment", "ncbi_taxid"] : 655 #Handled separately 656 continue 657 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 658 if isinstance(value, list) or isinstance(value, tuple): 659 rank = 0 660 for entry in value : 661 if isinstance(entry, str) or isinstance(entry, int): 662 #Easy case 663 rank += 1 664 self.adaptor.execute(many_sql, \ 665 (bioentry_id, term_id, str(entry), rank)) 666 else : 667 pass 668 #print "Ignoring annotation '%s' sub-entry of type '%s'" \ 669 # % (key, str(type(entry))) 670 elif isinstance(value, str) or isinstance(value, int): 671 #Have a simple single entry, leave rank as the DB default 672 self.adaptor.execute(mono_sql, \ 673 (bioentry_id, term_id, str(value))) 674 else : 675 pass
676 #print "Ignoring annotation '%s' entry of type '%s'" \ 677 # % (key, type(value)) 678 679
680 - def _load_reference(self, reference, rank, bioentry_id):
681 """Record a SeqRecord's annotated references in the database (PRIVATE). 682 683 record - a SeqRecord object with annotated references 684 bioentry_id - corresponding database identifier 685 """ 686 687 refs = None 688 if reference.medline_id: 689 refs = self.adaptor.execute_and_fetch_col0( 690 "SELECT reference_id" \ 691 " FROM reference JOIN dbxref USING (dbxref_id)" \ 692 " WHERE dbname = 'MEDLINE' AND accession = %s", 693 (reference.medline_id,)) 694 if not refs and reference.pubmed_id: 695 refs = self.adaptor.execute_and_fetch_col0( 696 "SELECT reference_id" \ 697 " FROM reference JOIN dbxref USING (dbxref_id)" \ 698 " WHERE dbname = 'PUBMED' AND accession = %s", 699 (reference.pubmed_id,)) 700 if not refs: 701 s = [] 702 for f in reference.authors, reference.title, reference.journal: 703 s.append(f or "<undef>") 704 crc = crc64("".join(s)) 705 refs = self.adaptor.execute_and_fetch_col0( 706 "SELECT reference_id FROM reference" \ 707 r" WHERE crc = %s", (crc,)) 708 if not refs: 709 if reference.medline_id: 710 dbxref_id = self._add_dbxref("MEDLINE", 711 reference.medline_id, 0) 712 elif reference.pubmed_id: 713 dbxref_id = self._add_dbxref("PUBMED", 714 reference.pubmed_id, 0) 715 else: 716 dbxref_id = None 717 authors = reference.authors or None 718 title = reference.title or None 719 #The location/journal field cannot be Null, so default 720 #to an empty string rather than None: 721 journal = reference.journal or "" 722 self.adaptor.execute( 723 "INSERT INTO reference (dbxref_id, location," \ 724 " title, authors, crc)" \ 725 " VALUES (%s, %s, %s, %s, %s)", 726 (dbxref_id, journal, title, 727 authors, crc)) 728 reference_id = self.adaptor.last_id("reference") 729 else: 730 reference_id = refs[0] 731 732 if reference.location: 733 start = 1 + int(str(reference.location[0].start)) 734 end = int(str(reference.location[0].end)) 735 else: 736 start = None 737 end = None 738 739 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 740 " start_pos, end_pos, rank)" \ 741 " VALUES (%s, %s, %s, %s, %s)" 742 self.adaptor.execute(sql, (bioentry_id, reference_id, 743 start, end, rank + 1))
744
745 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
746 """Load a biopython SeqFeature into the database (PRIVATE). 747 """ 748 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 749 bioentry_id) 750 self._load_seqfeature_locations(feature, seqfeature_id) 751 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
752
753 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id):
754 """Load the first tables of a seqfeature and returns the id (PRIVATE). 755 756 This loads the "key" of the seqfeature (ie. CDS, gene) and 757 the basic seqfeature table itself. 758 """ 759 ontology_id = self._get_ontology_id('SeqFeature Keys') 760 seqfeature_key_id = self._get_term_id(feature_type, 761 ontology_id = ontology_id) 762 # XXX source is always EMBL/GenBank/SwissProt here; it should depend on 763 # the record (how?) 764 source_cat_id = self._get_ontology_id('SeqFeature Sources') 765 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt', 766 ontology_id = source_cat_id) 767 768 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 769 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 770 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 771 source_term_id, feature_rank + 1)) 772 seqfeature_id = self.adaptor.last_id('seqfeature') 773 774 return seqfeature_id
775
776 - def _load_seqfeature_locations(self, feature, seqfeature_id):
777 """Load all of the locations for a SeqFeature into tables (PRIVATE). 778 779 This adds the locations related to the SeqFeature into the 780 seqfeature_location table. Fuzzies are not handled right now. 781 For a simple location, ie (1..2), we have a single table row 782 with seq_start = 1, seq_end = 2, location_rank = 1. 783 784 For split locations, ie (1..2, 3..4, 5..6) we would have three 785 row tables with: 786 start = 1, end = 2, rank = 1 787 start = 3, end = 4, rank = 2 788 start = 5, end = 6, rank = 3 789 """ 790 # TODO - Record an ontology for the locations (using location.term_id) 791 # which for now as in BioPerl we leave defaulting to NULL. 792 if feature.location_operator and feature.location_operator != "join" : 793 # e.g. order locations... we don't record "order" so it 794 # will become a "join" on reloading. What does BioPerl do? 795 import warnings 796 warnings.warn("%s location operators are not fully supported" \ 797 % feature.location_operator) 798 799 # two cases, a simple location or a split location 800 if not feature.sub_features: # simple location 801 self._insert_seqfeature_location(feature, 1, seqfeature_id) 802 else: # split location 803 for rank, cur_feature in enumerate(feature.sub_features): 804 self._insert_seqfeature_location(cur_feature, 805 rank + 1, 806 seqfeature_id)
807
808 - def _insert_seqfeature_location(self, feature, rank, seqfeature_id):
809 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE). 810 811 TODO - Add location_operators to location_qualifier_value. 812 """ 813 # convert biopython locations to the 1-based location system 814 # used in bioSQL 815 # XXX This could also handle fuzzies 816 start = feature.location.nofuzzy_start + 1 817 end = feature.location.nofuzzy_end 818 819 # Biopython uses None when we don't know strand information but 820 # BioSQL requires something (non null) and sets this as zero 821 # So we'll use the strand or 0 if Biopython spits out None 822 strand = feature.strand or 0 823 824 # TODO - Record an ontology term for the location (location.term_id) 825 # which for now like BioPerl we'll leave as NULL. 826 # This might allow us to record "between" positions properly, but I 827 # doesn't really see how it could work for before/after fuzzy positions 828 loc_term_id = None 829 830 if feature.ref: 831 # sub_feature remote locations when they are in the same db as the current 832 # record do not have a value for ref_db, which the SeqFeature object 833 # stores as None. BioSQL schema requires a varchar and is not NULL 834 dbxref_id = self._get_dbxref_id(feature.ref_db or "", feature.ref) 835 else : 836 dbxref_id = None 837 838 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 839 r"start_pos, end_pos, strand, rank) " \ 840 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 841 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 842 start, end, strand, rank)) 843 844 """ 845 # See Bug 2677 846 # TODO - Record the location_operator (e.g. "join" or "order") 847 # using the location_qualifier_value table (which we and BioPerl 848 # have historically left empty). 849 # Note this will need an ontology term for the location qualifer 850 # (location_qualifier_value.term_id) for which oddly the schema 851 # does not allow NULL. 852 if feature.location_operator: 853 #e.g. "join" (common), 854 #or "order" (see Tests/GenBank/protein_refseq2.gb) 855 location_id = self.adaptor.last_id('location') 856 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 857 sql = r"INSERT INTO location_qualifier_value" \ 858 r"(location_id, term_id, value)" \ 859 r"VALUES (%s, %s, %s)" 860 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 861 feature.location_operator)) 862 """
863
864 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
865 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE). 866 867 Qualifiers should be a dictionary of the form: 868 {key : [value1, value2]} 869 """ 870 tag_ontology_id = self._get_ontology_id('Annotation Tags') 871 for qualifier_key in qualifiers.keys(): 872 # Treat db_xref qualifiers differently to sequence annotation 873 # qualifiers by populating the seqfeature_dbxref and dbxref 874 # tables. Other qualifiers go into the seqfeature_qualifier_value 875 # and (if new) term tables. 876 if qualifier_key != 'db_xref': 877 qualifier_key_id = self._get_term_id(qualifier_key, 878 ontology_id=tag_ontology_id) 879 # now add all of the values to their table 880 entries = qualifiers[qualifier_key] 881 if not isinstance(entries, list) : 882 # Could be a plain string, or an int or a float. 883 # However, we exect a list of strings here. 884 entries = [entries] 885 for qual_value_rank in range(len(entries)): 886 qualifier_value = entries[qual_value_rank] 887 sql = r"INSERT INTO seqfeature_qualifier_value "\ 888 r" (seqfeature_id, term_id, rank, value) VALUES"\ 889 r" (%s, %s, %s, %s)" 890 self.adaptor.execute(sql, (seqfeature_id, 891 qualifier_key_id, 892 qual_value_rank + 1, 893 qualifier_value)) 894 else: 895 # The dbxref_id qualifier/value sets go into the dbxref table 896 # as dbname, accession, version tuples, with dbxref.dbxref_id 897 # being automatically assigned, and into the seqfeature_dbxref 898 # table as seqfeature_id, dbxref_id, and rank tuples 899 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 900 seqfeature_id)
901 902
903 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
904 """Add database crossreferences of a SeqFeature to the database (PRIVATE). 905 906 o dbxrefs List, dbxref data from the source file in the 907 format <database>:<accession> 908 909 o seqfeature_id Int, the identifier for the seqfeature in the 910 seqfeature table 911 912 Insert dbxref qualifier data for a seqfeature into the 913 seqfeature_dbxref and, if required, dbxref tables. 914 The dbxref_id qualifier/value sets go into the dbxref table 915 as dbname, accession, version tuples, with dbxref.dbxref_id 916 being automatically assigned, and into the seqfeature_dbxref 917 table as seqfeature_id, dbxref_id, and rank tuples 918 """ 919 # NOTE - In older versions of Biopython, we would map the GenBank 920 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 921 # for any unknown terms. This was a long term maintainance problem, 922 # and differed from BioPerl and BioJava's implementation. See bug 2405 923 for rank, value in enumerate(dbxrefs): 924 # Split the DB:accession format string at colons. We have to 925 # account for multiple-line and multiple-accession entries 926 try: 927 dbxref_data = value.replace(' ','').replace('\n','').split(':') 928 db = dbxref_data[0] 929 accessions = dbxref_data[1:] 930 except: 931 raise ValueError("Parsing of db_xref failed: '%s'" % value) 932 # Loop over all the grabbed accessions, and attempt to fill the 933 # table 934 for accession in accessions: 935 # Get the dbxref_id value for the dbxref data 936 dbxref_id = self._get_dbxref_id(db, accession) 937 # Insert the seqfeature_dbxref data 938 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
939
940 - def _get_dbxref_id(self, db, accession):
941 """ _get_dbxref_id(self, db, accession) -> Int 942 943 o db String, the name of the external database containing 944 the accession number 945 946 o accession String, the accession of the dbxref data 947 948 Finds and returns the dbxref_id for the passed data. The method 949 attempts to find an existing record first, and inserts the data 950 if there is no record. 951 """ 952 # Check for an existing record 953 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 954 r'AND accession = %s' 955 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 956 # If there was a record, return the dbxref_id, else create the 957 # record and return the created dbxref_id 958 if dbxref_id: 959 return dbxref_id[0] 960 return self._add_dbxref(db, accession, 0)
961
962 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
963 """ Check for a pre-existing seqfeature_dbxref entry with the passed 964 seqfeature_id and dbxref_id. If one does not exist, insert new 965 data 966 967 """ 968 # Check for an existing record 969 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 970 r"WHERE seqfeature_id = '%s' AND dbxref_id = '%s'" 971 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 972 dbxref_id)) 973 # If there was a record, return without executing anything, else create 974 # the record and return 975 if result: 976 return result 977 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
978
979 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
980 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 981 dbxref_id 982 """ 983 sql = r'INSERT INTO seqfeature_dbxref ' \ 984 '(seqfeature_id, dbxref_id, rank) VALUES' \ 985 r'(%s, %s, %s)' 986 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 987 return (seqfeature_id, dbxref_id)
988
989 - def _load_dbxrefs(self, record, bioentry_id) :
990 """Load any sequence level cross references into the database (PRIVATE). 991 992 See table bioentry_dbxref.""" 993 for rank, value in enumerate(record.dbxrefs): 994 # Split the DB:accession string at first colon. 995 # We have to cope with things like: 996 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 997 # "GO:GO:123" (db="GO", accession="GO:123") 998 # 999 # Annoyingly I have seen the NCBI use both the style 1000 # "GO:GO:123" and "GO:123" in different vintages. 1001 assert value.count("\n")==0 1002 try: 1003 db, accession = value.split(':',1) 1004 db = db.strip() 1005 accession = accession.strip() 1006 except: 1007 raise ValueError("Parsing of dbxrefs list failed: '%s'" % value) 1008 # Get the dbxref_id value for the dbxref data 1009 dbxref_id = self._get_dbxref_id(db, accession) 1010 # Insert the bioentry_dbxref data 1011 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank+1)
1012
1013 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1014 """ Check for a pre-existing bioentry_dbxref entry with the passed 1015 seqfeature_id and dbxref_id. If one does not exist, insert new 1016 data 1017 1018 """ 1019 # Check for an existing record 1020 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 1021 r"WHERE bioentry_id = '%s' AND dbxref_id = '%s'" 1022 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 1023 dbxref_id)) 1024 # If there was a record, return without executing anything, else create 1025 # the record and return 1026 if result: 1027 return result 1028 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1029
1030 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1031 """ Insert a bioentry_dbxref row and return the seqfeature_id and 1032 dbxref_id 1033 """ 1034 sql = r'INSERT INTO bioentry_dbxref ' \ 1035 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1036 '(%s, %s, %s)' 1037 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1038 return (bioentry_id, dbxref_id)
1039
1040 -class DatabaseRemover:
1041 """Complement the Loader functionality by fully removing a database. 1042 1043 This probably isn't really useful for normal purposes, since you 1044 can just do a: 1045 DROP DATABASE db_name 1046 and then recreate the database. But, it's really useful for testing 1047 purposes. 1048 1049 YB: now use the cascaded deletions 1050 """
1051 - def __init__(self, adaptor, dbid):
1052 """Initialize with a database id and adaptor connection. 1053 """ 1054 self.adaptor = adaptor 1055 self.dbid = dbid
1056
1057 - def remove(self):
1058 """Remove everything related to the given database id. 1059 """ 1060 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1061 self.adaptor.execute(sql, (self.dbid,)) 1062 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1063 self.adaptor.execute(sql, (self.dbid,))
1064