1
2
3
4
5
6
7
8
9
10 """Connect with a BioSQL database and load Biopython like objects from it.
11
12 This provides interfaces for loading biological objects from a relational
13 database, and is compatible with the BioSQL standards.
14 """
15 import BioSeq
16 import Loader
17 import DBUtils
18
19 _POSTGRES_RULES_PRESENT = False
20
22 """Main interface for loading a existing BioSQL-style database.
23
24 This function is the easiest way to retrieve a connection to a
25 database, doing something like:
26
27 >>> from BioSeq import BioSeqDatabase
28 >>> server = BioSeqDatabase.open_database(user = "root", db="minidb")
29
30 the various options are:
31 driver -> The name of the database driver to use for connecting. The
32 driver should implement the python DB API. By default, the MySQLdb
33 driver is used.
34 user -> the username to connect to the database with.
35 password, passwd -> the password to connect with
36 host -> the hostname of the database
37 database or db -> the name of the database
38 """
39 module = __import__(driver)
40 connect = getattr(module, "connect")
41
42
43 kw = kwargs.copy()
44 if driver == "MySQLdb":
45 if "database" in kw:
46 kw["db"] = kw["database"]
47 del kw["database"]
48 if "password" in kw:
49 kw["passwd"] = kw["password"]
50 del kw["password"]
51 else:
52
53 if "db" in kw:
54 kw["database"] = kw["db"]
55 del kw["db"]
56 if "passwd" in kw:
57 kw["password"] = kw["passwd"]
58 del kw["passwd"]
59 if driver in ["psycopg", "psycopg2", "pgdb"] and not kw.get("database"):
60 kw["database"] = "template1"
61 try:
62 conn = connect(**kw)
63 except module.InterfaceError:
64
65
66 if "database" in kw:
67 kw["dbname"] = kw["database"]
68 del kw["database"]
69 elif "db" in kw:
70 kw["dbname"] = kw["db"]
71 del kw["db"]
72
73 dsn = ' '.join(['='.join(i) for i in kw.items()])
74 conn = connect(dsn)
75
76 server = DBServer(conn, module)
77
78 if driver == "psycopg" :
79 import warnings
80 warnings.warn("Using BioSQL with psycopg (version one) is deprecated. "
81 "It still works for now, but we recommend you update "
82 "to using psycopg2 as a future release of Biopython "
83 "will drop support for psycop (version one).",
84 DeprecationWarning)
85
86
87
88 if driver in ["psycopg", "psycopg2", "pgdb"]:
89 sql = "SELECT ev_class FROM pg_rewrite WHERE " + \
90 "rulename='rule_bioentry_i1' OR " + \
91 "rulename='rule_bioentry_i2';"
92 if server.adaptor.execute_and_fetchall(sql):
93 import warnings
94 warnings.warn("Your BioSQL PostgreSQL schema includes some "
95 "rules currently required for bioperl-db but "
96 "which may cause problems loading data using "
97 "Biopython (see BioSQL Bug 2839). If you do not "
98 "use BioPerl, please remove these rules. "
99 "Biopython should cope with the rules present, "
100 "but with a performance penalty when loading "
101 "new records.")
102 global _POSTGRES_RULES_PRESENT
103 _POSTGRES_RULES_PRESENT = True
104
105 return server
106
108 - def __init__(self, conn, module, module_name=None):
109 self.module = module
110 if module_name is None:
111 module_name = module.__name__
112 self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
113 self.module_name = module_name
114
116 return self.__class__.__name__ + "(%r)" % self.adaptor.conn
125
132
133 - def new_database(self, db_name, authority=None, description=None):
134 """Add a new database to the server and return it.
135 """
136
137 sql = r"INSERT INTO biodatabase (name, authority, description)" \
138 r" VALUES (%s, %s, %s)"
139 self.adaptor.execute(sql, (db_name,authority, description))
140 return BioSeqDatabase(self.adaptor, db_name)
141
143 """Load a database schema into the given database.
144
145 This is used to create tables, etc when a database is first created.
146 sql_file should specify the complete path to a file containing
147 SQL entries for building the tables.
148 """
149
150
151
152
153
154 sql_handle = open(sql_file, "rb")
155 sql = r""
156 for line in sql_handle.xreadlines():
157 if line.find("--") == 0:
158 pass
159 elif line.find("#") == 0:
160 pass
161 elif line.strip():
162 sql += line.strip()
163 sql += ' '
164
165
166
167
168
169 if self.module_name in ["psycopg", "psycopg2", "pgdb"]:
170 self.adaptor.cursor.execute(sql)
171
172
173 elif self.module_name in ["MySQLdb"]:
174 sql_parts = sql.split(";")
175 for sql_line in sql_parts[:-1]:
176 self.adaptor.cursor.execute(sql_line)
177 else:
178 raise ValueError("Module %s not supported by the loader." %
179 (self.module_name))
180
182 """Commits the current transaction to the database."""
183 return self.adaptor.commit()
184
186 """Rolls backs the current transaction."""
187 return self.adaptor.rollback()
188
190 """Close the connection. No further activity possible."""
191 return self.adaptor.close()
192
195 self.conn = conn
196 self.cursor = conn.cursor()
197 self.dbutils = dbutils
198
201
203 """Set the autocommit mode. True values enable; False value disable."""
204 return self.dbutils.autocommit(self.conn, y)
205
207 """Commits the current transaction."""
208 return self.conn.commit()
209
211 """Rolls backs the current transaction."""
212 return self.conn.rollback()
213
215 """Close the connection. No further activity possible."""
216 return self.conn.close()
217
219 self.cursor.execute(
220 r"select biodatabase_id from biodatabase where name = %s",
221 (dbname,))
222 rv = self.cursor.fetchall()
223 if not rv:
224 raise KeyError("Cannot find biodatabase with name %r" % dbname)
225
226
227 return rv[0][0]
228
230 sql = r"select bioentry_id from bioentry where name = %s"
231 fields = [name]
232 if dbid:
233 sql += " and biodatabase_id = %s"
234 fields.append(dbid)
235 self.cursor.execute(sql, fields)
236 rv = self.cursor.fetchall()
237 if not rv:
238 raise IndexError("Cannot find display id %r" % name)
239 if len(rv) > 1:
240 raise IndexError("More than one entry with display id %r" % name)
241 return rv[0][0]
242
244 sql = r"select bioentry_id from bioentry where accession = %s"
245 fields = [name]
246 if dbid:
247 sql += " and biodatabase_id = %s"
248 fields.append(dbid)
249 self.cursor.execute(sql, fields)
250 rv = self.cursor.fetchall()
251 if not rv:
252 raise IndexError("Cannot find accession %r" % name)
253 if len(rv) > 1:
254 raise IndexError("More than one entry with accession %r" % name)
255 return rv[0][0]
256
258 sql = r"select bioentry_id from bioentry where accession = %s"
259 fields = [name]
260 if dbid:
261 sql += " and biodatabase_id = %s"
262 fields.append(dbid)
263 return self.execute_and_fetch_col0(sql, fields)
264
266 acc_version = name.split(".")
267 if len(acc_version) > 2:
268 raise IndexError("Bad version %r" % name)
269 acc = acc_version[0]
270 if len(acc_version) == 2:
271 version = acc_version[1]
272 else:
273 version = "0"
274 sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
275 r" AND version = %s"
276 fields = [acc, version]
277 if dbid:
278 sql += " and biodatabase_id = %s"
279 fields.append(dbid)
280 self.cursor.execute(sql, fields)
281 rv = self.cursor.fetchall()
282 if not rv:
283 raise IndexError("Cannot find version %r" % name)
284 if len(rv) > 1:
285 raise IndexError("More than one entry with version %r" % name)
286 return rv[0][0]
287
289
290 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
291 fields = [identifier]
292 if dbid:
293 sql += " and biodatabase_id = %s"
294 fields.append(dbid)
295 self.cursor.execute(sql, fields)
296 rv = self.cursor.fetchall()
297 if not rv:
298 raise IndexError("Cannot find display id %r" % identifier)
299 return rv[0][0]
300
304
305 - def list_bioentry_ids(self, dbid):
306 return self.execute_and_fetch_col0(
307 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
308 (dbid,))
309
311 return self.execute_and_fetch_col0(
312 "SELECT name FROM bioentry WHERE biodatabase_id = %s",
313 (dbid,))
314
316 """Return ids given a SQL statement to select for them.
317
318 This assumes that the given SQL does a SELECT statement that
319 returns a list of items. This parses them out of the 2D list
320 they come as and just returns them in a list.
321 """
322 return self.cursor.execute_and_fetch_col0(sql, args)
323
325 self.cursor.execute(sql, args or ())
326 rv = self.cursor.fetchall()
327 assert len(rv) == 1, "Expected 1 response, got %d" % len(rv)
328 return rv[0]
329
330 - def execute(self, sql, args=None):
331 """Just execute an sql command.
332 """
333 self.cursor.execute(sql, args or ())
334
341
343 self.cursor.execute(sql, args or ())
344 return [field[0] for field in self.cursor.fetchall()]
345
349
350 _allowed_lookups = {
351
352 'primary_id': "fetch_seqid_by_identifier",
353 'gi': "fetch_seqid_by_identifier",
354 'display_id': "fetch_seqid_by_display_id",
355 'name': "fetch_seqid_by_display_id",
356 'accession': "fetch_seqid_by_accession",
357 'version': "fetch_seqid_by_version",
358 }
359
365
367 return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)
368
377
379 """Gets a Bio::Seq object by accession number
380
381 Example: seq = db.get_Seq_by_acc('X77802')
382
383 """
384 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
385 return BioSeq.DBSeqRecord(self.adaptor, seqid)
386
388 """Gets a Bio::Seq object by version number
389
390 Example: seq = db.get_Seq_by_ver('X77802.1')
391
392 """
393 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
394 return BioSeq.DBSeqRecord(self.adaptor, seqid)
395
397 """Gets a *list* of Bio::Seq objects by accession number
398
399 Example: seqs = db.get_Seq_by_acc('X77802')
400
401 """
402 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
403 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
404
406
407
408
409
410 raise NotImplementedError("waiting for Python 2.2's iter")
411
413 """Array of all the primary_ids of the sequences in the database.
414
415 These maybe ids (display style) or accession numbers or
416 something else completely different - they *are not*
417 meaningful outside of this database implementation.
418 """
419 return self.adaptor.list_bioentry_ids(self.dbid)
420
429
431 if len(kwargs) != 1:
432 raise TypeError("single key/value parameter expected")
433 k, v = kwargs.items()[0]
434 if k not in _allowed_lookups:
435 raise TypeError("lookup() expects one of %s, not %r" % \
436 (repr(_allowed_lookups.keys())[1:-1], repr(k)))
437 lookup_name = _allowed_lookups[k]
438 lookup_func = getattr(self.adaptor, lookup_name)
439 seqid = lookup_func(self.dbid, v)
440 return BioSeq.DBSeqRecord(self.adaptor, seqid)
441
443 """Gets a Bio::Seq object by the primary (internal) id.
444
445 The primary id in these cases has to come from
446 $db->get_all_primary_ids. There is no other way to get (or
447 guess) the primary_ids in a database.
448 """
449 return self[seqid]
450
451 - def load(self, record_iterator, fetch_NCBI_taxonomy=False):
452 """Load a set of SeqRecords into the BioSQL database.
453
454 record_iterator is either a list of SeqRecord objects, or an
455 Iterator object that returns SeqRecord objects (such as the
456 output from the Bio.SeqIO.parse() function), which will be
457 used to populate the database.
458
459 fetch_NCBI_taxonomy is boolean flag allowing or preventing
460 connection to the taxonomic database on the NCBI server
461 (via Bio.Entrez) to fetch a detailed taxonomy for each
462 SeqRecord.
463
464 Example:
465 from Bio import SeqIO
466 count = db.load(SeqIO.parse(open(filename), format))
467
468 Returns the number of records loaded.
469 """
470 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, \
471 fetch_NCBI_taxonomy)
472 num_records = 0
473 global _POSTGRES_RULES_PRESENT
474 for cur_record in record_iterator :
475 num_records += 1
476
477
478 if _POSTGRES_RULES_PRESENT:
479
480 if cur_record.id.count(".") == 1:
481 accession, version = cur_record.id.split('.')
482 try :
483 version = int(version)
484 except ValueError :
485 accession = cur_record.id
486 version = 0
487 else:
488 accession = cur_record.id
489 version = 0
490 gi = cur_record.annotations.get("gi", None)
491 sql = "SELECT bioentry_id FROM bioentry WHERE (identifier " + \
492 "= '%s' AND biodatabase_id = '%s') OR (accession = " + \
493 "'%s' AND version = '%s' AND biodatabase_id = '%s')"
494 self.adaptor.execute(sql % (gi, self.dbid, accession, version, self.dbid))
495 if self.adaptor.cursor.fetchone():
496 try:
497 raise self.adaptor.conn.IntegrityError("Duplicate record "
498 "detected: record has not been inserted")
499 except AttributeError:
500 import psycopg
501 raise psycopg.IntegrityError("Psycopg1: Duplicate record "
502 "detected: record has not been inserted")
503
504 db_loader.load_seqrecord(cur_record)
505 return num_records
506