1
2
3
4
5
6
7
8
9
10 """Load biopython objects into a BioSQL database for persistent storage.
11
12 This code makes it possible to store biopython objects in a relational
13 database and then retrieve them back. You shouldn't use any of the
14 classes in this module directly. Rather, call the load() method on
15 a database object.
16 """
17
18 from time import gmtime, strftime
19
20
21 from Bio import Alphabet
22 from Bio.SeqUtils.CheckSum import crc64
23
24
26 """Load a database with biopython objects.
27 """
29 """Initialize with connection information for the database.
30
31 XXX Figure out what I need to load a database and document it.
32 """
33 self.adaptor = adaptor
34 self.dbid = dbid
35
37 """Load a Biopython SeqRecord into the database.
38 """
39 bioentry_id = self._load_bioentry_table(record)
40 self._load_bioentry_date(record, bioentry_id)
41 self._load_biosequence(record, bioentry_id)
42 self._load_comment(record, bioentry_id)
43 self._load_dbxrefs(record, bioentry_id)
44 references = record.annotations.get('references', ())
45 for reference, rank in zip(references, range(len(references))):
46 self._load_reference(reference, rank, bioentry_id)
47 self._load_annotations(record, bioentry_id)
48 for seq_feature_num in range(len(record.features)):
49 seq_feature = record.features[seq_feature_num]
50 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
51
53 """Returns the identifier for the named ontology.
54
55 This looks through the onotology table for a the given entry name.
56 If it is not found, a row is added for this ontology (using the
57 definition if supplied). In either case, the id corresponding to
58 the provided name is returned, so that you can reference it in
59 another table.
60 """
61 oids = self.adaptor.execute_and_fetch_col0(
62 "SELECT ontology_id FROM ontology WHERE name = %s",
63 (name,))
64 if oids:
65 return oids[0]
66 self.adaptor.execute(
67 "INSERT INTO ontology(name, definition) VALUES (%s, %s)",
68 (name, definition))
69 return self.adaptor.last_id("ontology")
70
71
72 - def _get_term_id(self,
73 name,
74 ontology_id=None,
75 definition=None,
76 identifier=None):
77 """Get the id that corresponds to a term.
78
79 This looks through the term table for a the given term. If it
80 is not found, a new id corresponding to this term is created.
81 In either case, the id corresponding to that term is returned, so
82 that you can reference it in another table.
83
84 The ontology_id should be used to disambiguate the term.
85 """
86
87
88 sql = r"SELECT term_id FROM term " \
89 r"WHERE name = %s"
90 fields = [name]
91 if ontology_id:
92 sql += ' AND ontology_id = %s'
93 fields.append(ontology_id)
94 id_results = self.adaptor.execute_and_fetchall(sql, fields)
95
96 if len(id_results) > 1:
97 raise ValueError("Multiple term ids for %s: %r" %
98 (name, id_results))
99 elif len(id_results) == 1:
100 return id_results[0][0]
101 else:
102 sql = r"INSERT INTO term (name, definition," \
103 r" identifier, ontology_id)" \
104 r" VALUES (%s, %s, %s, %s)"
105 self.adaptor.execute(sql, (name, definition,
106 identifier, ontology_id))
107 return self.adaptor.last_id("term")
108
110 """Insert a dbxref and return its id."""
111
112 self.adaptor.execute(
113 "INSERT INTO dbxref(dbname, accession, version)" \
114 " VALUES (%s, %s, %s)", (dbname, accession, version))
115 return self.adaptor.last_id("dbxref")
116
118 """Get the taxon id for this record.
119
120 record - a SeqRecord object
121
122 This searches the taxon/taxon_name tables using the
123 NCBI taxon ID, scientific name and common name to find
124 the matching taxon table entry's id.
125
126 If the species isn't in the taxon table, and we have at
127 least the NCBI taxon ID, scientific name or common name,
128 a minimal stub entry is created in the table.
129
130 If this information is not in the record's annotation,
131 then None is returned.
132
133 See also the BioSQL script load_ncbi_taxonomy.pl which
134 will populate and update the taxon/taxon_name tables
135 with the latest information from the NCBI.
136 """
137
138
139 ncbi_taxon_id = None
140 if "ncbi_taxid" in record.annotations :
141
142 if isinstance(record.annotations["ncbi_taxid"],list) :
143 if len(record.annotations["ncbi_taxid"])==1 :
144 ncbi_taxon_id = record.annotations["ncbi_taxid"][0]
145 else :
146 ncbi_taxon_id = record.annotations["ncbi_taxid"]
147 if not ncbi_taxon_id:
148
149 for f in record.features:
150 if f.type == 'source':
151 quals = getattr(f, 'qualifiers', {})
152 if "db_xref" in quals:
153 for db_xref in f.qualifiers["db_xref"]:
154 if db_xref.startswith("taxon:"):
155 ncbi_taxon_id = int(db_xref[6:])
156 break
157 if ncbi_taxon_id: break
158 if ncbi_taxon_id:
159 taxa = self.adaptor.execute_and_fetch_col0(
160 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s",
161 (ncbi_taxon_id,))
162 if taxa:
163
164 return taxa[0]
165
166
167
168
169
170
171
172
173 try :
174 scientific_name = record.annotations["organism"][:255]
175 except KeyError :
176 scientific_name = None
177 try :
178 common_name = record.annotations["source"][:255]
179 except KeyError :
180 common_name = None
181
182
183
184
185 if ncbi_taxon_id is None \
186 and not common_name and not scientific_name :
187
188
189
190 return None
191
192 if scientific_name:
193 taxa = self.adaptor.execute_and_fetch_col0(
194 "SELECT taxon_id FROM taxon_name" \
195 " WHERE name_class = 'scientific name' AND name = %s",
196 (scientific_name,))
197 if taxa:
198
199 return taxa[0]
200
201
202 if common_name:
203 taxa = self.adaptor.execute_and_fetch_col0(
204 "SELECT DISTINCT taxon_id FROM taxon_name" \
205 " WHERE name = %s",
206 (common_name,))
207
208
209 if len(taxa) > 1:
210 raise ValueError("Taxa: %d species have name %r" % (
211 len(taxa),
212 common_name))
213 if taxa:
214
215 return taxa[0]
216
217
218
219
220 """
221 # Possible simplification of the scary code below; pending
222 # discussion on the BioSQL mailing list...
223 #
224 # We will record the bare minimum; as long as the NCBI taxon
225 # id is present, then (re)running load_ncbi_taxonomy.pl should
226 # fill in the taxonomomy lineage.
227 #
228 # I am NOT going to try and record the lineage, even if it
229 # is in the record annotation as a list of names, as we won't
230 # know the NCBI taxon IDs for these parent nodes.
231 self.adaptor.execute(
232 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
233 " left_value, right_value)" \
234 " VALUES (%s, %s, %s, %s, %s)", (None,
235 ncbi_taxon_id,
236 "species",
237 None,
238 None))
239 taxon_id = self.adaptor.last_id("taxon")
240 if scientific_name:
241 self.adaptor.execute(
242 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
243 "VALUES (%s, %s, 'scientific name')", (
244 taxon_id, scientific_name))
245 if common_name:
246 self.adaptor.execute(
247 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
248 "VALUES (%s, %s, 'common name')", (
249 taxon_id, common_name))
250 return taxon_id
251 """
252
253
254
255
256
257 lineage = []
258 for c in record.annotations.get("taxonomy", []):
259 lineage.append([None, None, c])
260 if lineage:
261 lineage[-1][1] = "genus"
262 lineage.append([None, "species", record.annotations["organism"]])
263
264 if "subspecies" in record.annotations:
265 lineage.append([None, "subspecies",
266 record.annotations["subspecies"]])
267 if "variant" in record.annotations:
268 lineage.append([None, "varietas",
269 record.annotations["variant"]])
270 lineage[-1][0] = ncbi_taxon_id
271
272 left_value = self.adaptor.execute_one(
273 "SELECT MAX(left_value) FROM taxon")[0]
274 if not left_value:
275 left_value = 0
276 left_value += 1
277
278
279
280
281
282 right_start_value = self.adaptor.execute_one(
283 "SELECT MAX(right_value) FROM taxon")[0]
284 if not right_start_value:
285 right_start_value = 0
286 right_value = right_start_value + 2 * len(lineage) - 1
287
288 parent_taxon_id = None
289 for taxon in lineage:
290 self.adaptor.execute(
291 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\
292 " left_value, right_value)" \
293 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id,
294 taxon[0],
295 taxon[1],
296 left_value,
297 right_value))
298 taxon_id = self.adaptor.last_id("taxon")
299 self.adaptor.execute(
300 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
301 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255]))
302
303
304 left_value += 1
305 right_value -= 1
306 parent_taxon_id = taxon_id
307 if common_name:
308 self.adaptor.execute(
309 "INSERT INTO taxon_name(taxon_id, name, name_class)" \
310 "VALUES (%s, %s, 'common name')", (
311 taxon_id, common_name))
312
313 return taxon_id
314
315 - def _load_bioentry_table(self, record):
316 """Fill the bioentry table with sequence information.
317
318 record - SeqRecord object to add to the database.
319 """
320
321
322 if record.id.find('.') >= 0:
323 accession, version = record.id.split('.')
324 version = int(version)
325 else:
326 accession = record.id
327 version = 0
328
329
330
331
332 taxon_id = self._get_taxon_id(record)
333
334 identifier = record.annotations.get('gi')
335 description = getattr(record, 'description', None)
336 division = record.annotations.get("data_file_division", "UNK")
337
338 sql = """
339 INSERT INTO bioentry (
340 biodatabase_id,
341 taxon_id,
342 name,
343 accession,
344 identifier,
345 division,
346 description,
347 version)
348 VALUES (
349 %s,
350 %s,
351 %s,
352 %s,
353 %s,
354 %s,
355 %s,
356 %s)"""
357 self.adaptor.execute(sql, (self.dbid,
358 taxon_id,
359 record.name,
360 accession,
361 identifier,
362 division,
363 description,
364 version))
365
366 bioentry_id = self.adaptor.last_id('bioentry')
367
368 return bioentry_id
369
370 - def _load_bioentry_date(self, record, bioentry_id):
371 """Add the effective date of the entry into the database.
372
373 record - a SeqRecord object with an annotated date
374 bioentry_id - corresponding database identifier
375 """
376
377
378 date = record.annotations.get("date",
379 strftime("%d-%b-%Y", gmtime()).upper())
380 annotation_tags_id = self._get_ontology_id("Annotation Tags")
381 date_id = self._get_term_id("date_changed", annotation_tags_id)
382 sql = r"INSERT INTO bioentry_qualifier_value" \
383 r" (bioentry_id, term_id, value, rank)" \
384 r" VALUES (%s, %s, %s, 1)"
385 self.adaptor.execute(sql, (bioentry_id, date_id, date))
386
388 """Record a SeqRecord's sequence and alphabet in the database.
389
390 record - a SeqRecord object with a seq property
391 bioentry_id - corresponding database identifier
392 """
393
394 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
395 alphabet = "dna"
396 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
397 alphabet = "rna"
398 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
399 alphabet = "protein"
400 else:
401 alphabet = "unknown"
402
403 sql = r"INSERT INTO biosequence (bioentry_id, version, " \
404 r"length, seq, alphabet) " \
405 r"VALUES (%s, 0, %s, %s, %s)"
406 self.adaptor.execute(sql, (bioentry_id,
407 len(record.seq.data),
408 record.seq.data,
409 alphabet))
410
426
428 """Record a SeqRecord's misc annotations in the database.
429
430 The annotation strings are recorded in the bioentry_qualifier_value
431 table, except for special cases like the reference, comment and
432 taxonomy which are handled with their own tables.
433
434 record - a SeqRecord object with an annotations dictionary
435 bioentry_id - corresponding database identifier
436 """
437 mono_sql = "INSERT INTO bioentry_qualifier_value" \
438 "(bioentry_id, term_id, value)" \
439 " VALUES (%s, %s, %s)"
440 many_sql = "INSERT INTO bioentry_qualifier_value" \
441 "(bioentry_id, term_id, value, rank)" \
442 " VALUES (%s, %s, %s, %s)"
443 tag_ontology_id = self._get_ontology_id('Annotation Tags')
444 for key, value in record.annotations.iteritems() :
445 if key in ["references", "comment", "ncbi_taxid"] :
446
447 continue
448 term_id = self._get_term_id(key, ontology_id=tag_ontology_id)
449 if isinstance(value, list) :
450 rank = 0
451 for entry in value :
452 if isinstance(entry, str) or isinstance(entry, int):
453
454 rank += 1
455 self.adaptor.execute(many_sql, \
456 (bioentry_id, term_id, str(entry), rank))
457 else :
458 pass
459
460
461 elif isinstance(value, str) or isinstance(value, int):
462
463 self.adaptor.execute(mono_sql, \
464 (bioentry_id, term_id, str(value)))
465 else :
466 pass
467
468
469
470
472 """Record a SeqRecord's annotated references in the database.
473
474 record - a SeqRecord object with annotated references
475 bioentry_id - corresponding database identifier
476 """
477
478 refs = None
479 if reference.medline_id:
480 refs = self.adaptor.execute_and_fetch_col0(
481 "SELECT reference_id" \
482 " FROM reference JOIN dbxref USING (dbxref_id)" \
483 " WHERE dbname = 'MEDLINE' AND accession = %s",
484 (reference.medline_id,))
485 if not refs and reference.pubmed_id:
486 refs = self.adaptor.execute_and_fetch_col0(
487 "SELECT reference_id" \
488 " FROM reference JOIN dbxref USING (dbxref_id)" \
489 " WHERE dbname = 'PUBMED' AND accession = %s",
490 (reference.pubmed_id,))
491 if not refs:
492 s = []
493 for f in reference.authors, reference.title, reference.journal:
494 s.append(f or "<undef>")
495 crc = crc64("".join(s))
496 refs = self.adaptor.execute_and_fetch_col0(
497 "SELECT reference_id FROM reference" \
498 r" WHERE crc = %s", (crc,))
499 if not refs:
500 if reference.medline_id:
501 dbxref_id = self._add_dbxref("MEDLINE",
502 reference.medline_id, 0)
503 elif reference.pubmed_id:
504 dbxref_id = self._add_dbxref("PUBMED",
505 reference.pubmed_id, 0)
506 else:
507 dbxref_id = None
508 authors = reference.authors or None
509 title = reference.title or None
510
511
512 journal = reference.journal or ""
513 self.adaptor.execute(
514 "INSERT INTO reference (dbxref_id, location," \
515 " title, authors, crc)" \
516 " VALUES (%s, %s, %s, %s, %s)",
517 (dbxref_id, journal, title,
518 authors, crc))
519 reference_id = self.adaptor.last_id("reference")
520 else:
521 reference_id = refs[0]
522
523 if reference.location:
524 start = 1 + int(str(reference.location[0].start))
525 end = int(str(reference.location[0].end))
526 else:
527 start = None
528 end = None
529
530 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \
531 " start_pos, end_pos, rank)" \
532 " VALUES (%s, %s, %s, %s, %s)"
533 self.adaptor.execute(sql, (bioentry_id, reference_id,
534 start, end, rank + 1))
535
543
545 """Load the first tables of a seqfeature and returns the id.
546
547 This loads the "key" of the seqfeature (ie. CDS, gene) and
548 the basic seqfeature table itself.
549 """
550 ontology_id = self._get_ontology_id('SeqFeature Keys')
551 seqfeature_key_id = self._get_term_id(feature_type,
552 ontology_id = ontology_id)
553
554
555
556 source_cat_id = self._get_ontology_id('SeqFeature Sources')
557 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt',
558 ontology_id = source_cat_id)
559
560 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \
561 r"source_term_id, rank) VALUES (%s, %s, %s, %s)"
562 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id,
563 source_term_id, feature_rank + 1))
564 seqfeature_id = self.adaptor.last_id('seqfeature')
565
566 return seqfeature_id
567
569 """Load all of the locations for a SeqFeature into tables.
570
571 This adds the locations related to the SeqFeature into the
572 seqfeature_location table. Fuzzies are not handled right now.
573 For a simple location, ie (1..2), we have a single table row
574 with seq_start = 1, seq_end = 2, location_rank = 1.
575
576 For split locations, ie (1..2, 3..4, 5..6) we would have three
577 row tables with:
578 start = 1, end = 2, rank = 1
579 start = 3, end = 4, rank = 2
580 start = 5, end = 6, rank = 3
581 """
582
583 if not feature.sub_features:
584 self._insert_seqfeature_location(feature, 1, seqfeature_id)
585 else:
586 for rank, cur_feature in enumerate(feature.sub_features):
587 self._insert_seqfeature_location(cur_feature,
588 rank + 1,
589 seqfeature_id)
590
592 """Add a location of a SeqFeature to the seqfeature_location table.
593 """
594 sql = r"INSERT INTO location (seqfeature_id, " \
595 r"start_pos, end_pos, strand, rank) " \
596 r"VALUES (%s, %s, %s, %s, %s)"
597
598
599
600
601 start = feature.location.nofuzzy_start + 1
602 end = feature.location.nofuzzy_end
603
604
605
606 strand = feature.strand or 0
607
608 self.adaptor.execute(sql, (seqfeature_id, start, end, strand, rank))
609
611 """Insert the (key, value) pair qualifiers relating to a feature.
612
613 Qualifiers should be a dictionary of the form:
614 {key : [value1, value2]}
615 """
616 tag_ontology_id = self._get_ontology_id('Annotation Tags')
617 for qualifier_key in qualifiers.keys():
618
619
620
621
622 if qualifier_key != 'db_xref':
623 qualifier_key_id = self._get_term_id(qualifier_key,
624 ontology_id=tag_ontology_id)
625
626 entries = qualifiers[qualifier_key]
627 if not isinstance(entries, list) :
628
629
630 entries = [entries]
631 for qual_value_rank in range(len(entries)):
632 qualifier_value = entries[qual_value_rank]
633 sql = r"INSERT INTO seqfeature_qualifier_value "\
634 r" (seqfeature_id, term_id, rank, value) VALUES"\
635 r" (%s, %s, %s, %s)"
636 self.adaptor.execute(sql, (seqfeature_id,
637 qualifier_key_id,
638 qual_value_rank + 1,
639 qualifier_value))
640 else:
641
642
643
644
645 self._load_seqfeature_dbxref(qualifiers[qualifier_key],
646 seqfeature_id)
647
648
650 """Add the database crossreferences of a SeqFeature to the database.
651
652 o dbxrefs List, dbxref data from the source file in the
653 format <database>:<accession>
654
655 o seqfeature_id Int, the identifier for the seqfeature in the
656 seqfeature table
657
658 Insert dbxref qualifier data for a seqfeature into the
659 seqfeature_dbxref and, if required, dbxref tables.
660 The dbxref_id qualifier/value sets go into the dbxref table
661 as dbname, accession, version tuples, with dbxref.dbxref_id
662 being automatically assigned, and into the seqfeature_dbxref
663 table as seqfeature_id, dbxref_id, and rank tuples
664 """
665
666
667
668
669 for rank, value in enumerate(dbxrefs):
670
671
672 try:
673 dbxref_data = value.replace(' ','').replace('\n','').split(':')
674 db = dbxref_data[0]
675 accessions = dbxref_data[1:]
676 except:
677 raise ValueError("Parsing of db_xref failed: '%s'" % value)
678
679
680 for accession in accessions:
681
682 dbxref_id = self._get_dbxref_id(db, accession)
683
684 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
685
687 """ _get_dbxref_id(self, db, accession) -> Int
688
689 o db String, the name of the external database containing
690 the accession number
691
692 o accession String, the accession of the dbxref data
693
694 Finds and returns the dbxref_id for the passed data. The method
695 attempts to find an existing record first, and inserts the data
696 if there is no record.
697 """
698
699 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \
700 r'AND accession = %s'
701 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession))
702
703
704 if dbxref_id:
705 return dbxref_id[0]
706 return self._add_dbxref(db, accession, 0)
707
709 """ Check for a pre-existing seqfeature_dbxref entry with the passed
710 seqfeature_id and dbxref_id. If one does not exist, insert new
711 data
712
713 """
714
715 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \
716 r"WHERE seqfeature_id = '%s' AND dbxref_id = '%s'"
717 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id,
718 dbxref_id))
719
720
721 if result:
722 return result
723 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
724
726 """ Insert a seqfeature_dbxref row and return the seqfeature_id and
727 dbxref_id
728 """
729 sql = r'INSERT INTO seqfeature_dbxref ' \
730 '(seqfeature_id, dbxref_id, rank) VALUES' \
731 r'(%s, %s, %s)'
732 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank))
733 return (seqfeature_id, dbxref_id)
734
758
759 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
760 """ Check for a pre-existing bioentry_dbxref entry with the passed
761 seqfeature_id and dbxref_id. If one does not exist, insert new
762 data
763
764 """
765
766 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \
767 r"WHERE bioentry_id = '%s' AND dbxref_id = '%s'"
768 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id,
769 dbxref_id))
770
771
772 if result:
773 return result
774 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
775
776 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
777 """ Insert a bioentry_dbxref row and return the seqfeature_id and
778 dbxref_id
779 """
780 sql = r'INSERT INTO bioentry_dbxref ' \
781 '(bioentry_id,dbxref_id,rank) VALUES ' \
782 '(%s, %s, %s)'
783 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank))
784 return (bioentry_id, dbxref_id)
785
787 """Complement the Loader functionality by fully removing a database.
788
789 This probably isn't really useful for normal purposes, since you
790 can just do a:
791 DROP DATABASE db_name
792 and then recreate the database. But, it's really useful for testing
793 purposes.
794
795 YB: now use the cascaded deletions
796 """
798 """Initialize with a database id and adaptor connection.
799 """
800 self.adaptor = adaptor
801 self.dbid = dbid
802
804 """Remove everything related to the given database id.
805 """
806 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s"
807 self.adaptor.execute(sql, (self.dbid,))
808 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s"
809 self.adaptor.execute(sql, (self.dbid,))
810