Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2009 copyright by Peter Cock.  All rights reserved. 
  3  # Revisions 2009 copyright by Cymon J. Cox.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  # 
  8  # Note that BioSQL (including the database schema and scripts) is 
  9  # available and licensed separately.  Please consult www.biosql.org 
 10  """Connect with a BioSQL database and load Biopython like objects from it. 
 11   
 12  This provides interfaces for loading biological objects from a relational 
 13  database, and is compatible with the BioSQL standards. 
 14  """ 
 15  import BioSeq 
 16  import Loader 
 17  import DBUtils 
 18   
 19  _POSTGRES_RULES_PRESENT = False # Hack for BioSQL Bug 2839 
 20   
21 -def open_database(driver = "MySQLdb", **kwargs):
22 """Main interface for loading a existing BioSQL-style database. 23 24 This function is the easiest way to retrieve a connection to a 25 database, doing something like: 26 27 >>> from BioSeq import BioSeqDatabase 28 >>> server = BioSeqDatabase.open_database(user = "root", db="minidb") 29 30 the various options are: 31 driver -> The name of the database driver to use for connecting. The 32 driver should implement the python DB API. By default, the MySQLdb 33 driver is used. 34 user -> the username to connect to the database with. 35 password, passwd -> the password to connect with 36 host -> the hostname of the database 37 database or db -> the name of the database 38 """ 39 module = __import__(driver) 40 connect = getattr(module, "connect") 41 42 # Different drivers use different keywords... 43 kw = kwargs.copy() 44 if driver == "MySQLdb": 45 if "database" in kw: 46 kw["db"] = kw["database"] 47 del kw["database"] 48 if "password" in kw: 49 kw["passwd"] = kw["password"] 50 del kw["password"] 51 else: 52 # DB-API recommendations 53 if "db" in kw: 54 kw["database"] = kw["db"] 55 del kw["db"] 56 if "passwd" in kw: 57 kw["password"] = kw["passwd"] 58 del kw["passwd"] 59 if driver in ["psycopg", "psycopg2", "pgdb"] and not kw.get("database"): 60 kw["database"] = "template1" 61 try: 62 conn = connect(**kw) 63 except module.InterfaceError: 64 # Ok, so let's try building a DSN 65 # (older releases of psycopg need this) 66 if "database" in kw: 67 kw["dbname"] = kw["database"] 68 del kw["database"] 69 elif "db" in kw: 70 kw["dbname"] = kw["db"] 71 del kw["db"] 72 73 dsn = ' '.join(['='.join(i) for i in kw.items()]) 74 conn = connect(dsn) 75 76 server = DBServer(conn, module) 77 78 if driver == "psycopg" : 79 import warnings 80 warnings.warn("Using BioSQL with psycopg (version one) is deprecated. " 81 "It still works for now, but we recommend you update " 82 "to using psycopg2 as a future release of Biopython " 83 "will drop support for psycop (version one).", 84 DeprecationWarning) 85 86 # TODO - Remove the following once BioSQL Bug 2839 is fixed. 87 # Test for RULES in PostgreSQL schema, see also Bug 2833. 88 if driver in ["psycopg", "psycopg2", "pgdb"]: 89 sql = "SELECT ev_class FROM pg_rewrite WHERE " + \ 90 "rulename='rule_bioentry_i1' OR " + \ 91 "rulename='rule_bioentry_i2';" 92 if server.adaptor.execute_and_fetchall(sql): 93 import warnings 94 warnings.warn("Your BioSQL PostgreSQL schema includes some " 95 "rules currently required for bioperl-db but " 96 "which may cause problems loading data using " 97 "Biopython (see BioSQL Bug 2839). If you do not " 98 "use BioPerl, please remove these rules. " 99 "Biopython should cope with the rules present, " 100 "but with a performance penalty when loading " 101 "new records.") 102 global _POSTGRES_RULES_PRESENT 103 _POSTGRES_RULES_PRESENT = True 104 105 return server
106
107 -class DBServer:
108 - def __init__(self, conn, module, module_name=None):
109 self.module = module 110 if module_name is None: 111 module_name = module.__name__ 112 self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name)) 113 self.module_name = module_name
114
115 - def __repr__(self):
116 return self.__class__.__name__ + "(%r)" % self.adaptor.conn
117 - def __getitem__(self, name):
118 return BioSeqDatabase(self.adaptor, name)
119 - def keys(self):
120 return self.adaptor.list_biodatabase_names()
121 - def values(self):
122 return [self[key] for key in self.keys()]
123 - def items(self):
124 return [(key, self[key]) for key in self.keys()]
125
126 - def remove_database(self, db_name):
127 """Try to remove all references to items in a database. 128 """ 129 db_id = self.adaptor.fetch_dbid_by_dbname(db_name) 130 remover = Loader.DatabaseRemover(self.adaptor, db_id) 131 remover.remove()
132
133 - def new_database(self, db_name, authority=None, description=None):
134 """Add a new database to the server and return it. 135 """ 136 # make the database 137 sql = r"INSERT INTO biodatabase (name, authority, description)" \ 138 r" VALUES (%s, %s, %s)" 139 self.adaptor.execute(sql, (db_name,authority, description)) 140 return BioSeqDatabase(self.adaptor, db_name)
141
142 - def load_database_sql(self, sql_file):
143 """Load a database schema into the given database. 144 145 This is used to create tables, etc when a database is first created. 146 sql_file should specify the complete path to a file containing 147 SQL entries for building the tables. 148 """ 149 # Not sophisticated enough for PG schema. Is it needed by MySQL? 150 # Looks like we need this more complicated way for both. Leaving it 151 # the default and removing the simple-minded approach. 152 153 # read the file with all comment lines removed 154 sql_handle = open(sql_file, "rb") 155 sql = r"" 156 for line in sql_handle.xreadlines(): 157 if line.find("--") == 0: # don't include comment lines 158 pass 159 elif line.find("#") == 0: # ditto for MySQL comments 160 pass 161 elif line.strip(): # only include non-blank lines 162 sql += line.strip() 163 sql += ' ' 164 165 # two ways to load the SQL 166 # 1. PostgreSQL can load it all at once and actually needs to 167 # due to FUNCTION defines at the end of the SQL which mess up 168 # the splitting by semicolons 169 if self.module_name in ["psycopg", "psycopg2", "pgdb"]: 170 self.adaptor.cursor.execute(sql) 171 # 2. MySQL needs the database loading split up into single lines of 172 # SQL executed one at a time 173 elif self.module_name in ["MySQLdb"]: 174 sql_parts = sql.split(";") # one line per sql command 175 for sql_line in sql_parts[:-1]: # don't use the last item, it's blank 176 self.adaptor.cursor.execute(sql_line) 177 else: 178 raise ValueError("Module %s not supported by the loader." % 179 (self.module_name))
180
181 - def commit(self):
182 """Commits the current transaction to the database.""" 183 return self.adaptor.commit()
184
185 - def rollback(self):
186 """Rolls backs the current transaction.""" 187 return self.adaptor.rollback()
188
189 - def close(self):
190 """Close the connection. No further activity possible.""" 191 return self.adaptor.close()
192
193 -class Adaptor:
194 - def __init__(self, conn, dbutils):
195 self.conn = conn 196 self.cursor = conn.cursor() 197 self.dbutils = dbutils
198
199 - def last_id(self, table):
200 return self.dbutils.last_id(self.cursor, table)
201
202 - def autocommit(self, y=True):
203 """Set the autocommit mode. True values enable; False value disable.""" 204 return self.dbutils.autocommit(self.conn, y)
205
206 - def commit(self):
207 """Commits the current transaction.""" 208 return self.conn.commit()
209
210 - def rollback(self):
211 """Rolls backs the current transaction.""" 212 return self.conn.rollback()
213
214 - def close(self):
215 """Close the connection. No further activity possible.""" 216 return self.conn.close()
217
218 - def fetch_dbid_by_dbname(self, dbname):
219 self.cursor.execute( 220 r"select biodatabase_id from biodatabase where name = %s", 221 (dbname,)) 222 rv = self.cursor.fetchall() 223 if not rv: 224 raise KeyError("Cannot find biodatabase with name %r" % dbname) 225 # Cannot happen (UK) 226 ## assert len(rv) == 1, "More than one biodatabase with name %r" % dbname 227 return rv[0][0]
228
229 - def fetch_seqid_by_display_id(self, dbid, name):
230 sql = r"select bioentry_id from bioentry where name = %s" 231 fields = [name] 232 if dbid: 233 sql += " and biodatabase_id = %s" 234 fields.append(dbid) 235 self.cursor.execute(sql, fields) 236 rv = self.cursor.fetchall() 237 if not rv: 238 raise IndexError("Cannot find display id %r" % name) 239 if len(rv) > 1: 240 raise IndexError("More than one entry with display id %r" % name) 241 return rv[0][0]
242
243 - def fetch_seqid_by_accession(self, dbid, name):
244 sql = r"select bioentry_id from bioentry where accession = %s" 245 fields = [name] 246 if dbid: 247 sql += " and biodatabase_id = %s" 248 fields.append(dbid) 249 self.cursor.execute(sql, fields) 250 rv = self.cursor.fetchall() 251 if not rv: 252 raise IndexError("Cannot find accession %r" % name) 253 if len(rv) > 1: 254 raise IndexError("More than one entry with accession %r" % name) 255 return rv[0][0]
256
257 - def fetch_seqids_by_accession(self, dbid, name):
258 sql = r"select bioentry_id from bioentry where accession = %s" 259 fields = [name] 260 if dbid: 261 sql += " and biodatabase_id = %s" 262 fields.append(dbid) 263 return self.execute_and_fetch_col0(sql, fields)
264
265 - def fetch_seqid_by_version(self, dbid, name):
266 acc_version = name.split(".") 267 if len(acc_version) > 2: 268 raise IndexError("Bad version %r" % name) 269 acc = acc_version[0] 270 if len(acc_version) == 2: 271 version = acc_version[1] 272 else: 273 version = "0" 274 sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \ 275 r" AND version = %s" 276 fields = [acc, version] 277 if dbid: 278 sql += " and biodatabase_id = %s" 279 fields.append(dbid) 280 self.cursor.execute(sql, fields) 281 rv = self.cursor.fetchall() 282 if not rv: 283 raise IndexError("Cannot find version %r" % name) 284 if len(rv) > 1: 285 raise IndexError("More than one entry with version %r" % name) 286 return rv[0][0]
287
288 - def fetch_seqid_by_identifier(self, dbid, identifier):
289 # YB: was fetch_seqid_by_seqid 290 sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s" 291 fields = [identifier] 292 if dbid: 293 sql += " and biodatabase_id = %s" 294 fields.append(dbid) 295 self.cursor.execute(sql, fields) 296 rv = self.cursor.fetchall() 297 if not rv: 298 raise IndexError("Cannot find display id %r" % identifier) 299 return rv[0][0]
300
301 - def list_biodatabase_names(self):
302 return self.execute_and_fetch_col0( 303 "SELECT name FROM biodatabase")
304
305 - def list_bioentry_ids(self, dbid):
306 return self.execute_and_fetch_col0( 307 "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s", 308 (dbid,))
309
310 - def list_bioentry_display_ids(self, dbid):
311 return self.execute_and_fetch_col0( 312 "SELECT name FROM bioentry WHERE biodatabase_id = %s", 313 (dbid,))
314
315 - def list_any_ids(self, sql, args):
316 """Return ids given a SQL statement to select for them. 317 318 This assumes that the given SQL does a SELECT statement that 319 returns a list of items. This parses them out of the 2D list 320 they come as and just returns them in a list. 321 """ 322 return self.cursor.execute_and_fetch_col0(sql, args)
323
324 - def execute_one(self, sql, args=None):
325 self.cursor.execute(sql, args or ()) 326 rv = self.cursor.fetchall() 327 assert len(rv) == 1, "Expected 1 response, got %d" % len(rv) 328 return rv[0]
329
330 - def execute(self, sql, args=None):
331 """Just execute an sql command. 332 """ 333 self.cursor.execute(sql, args or ())
334
335 - def get_subseq_as_string(self, seqid, start, end):
336 length = end - start 337 return self.execute_one( 338 """select SUBSTRING(seq FROM %s FOR %s) 339 from biosequence where bioentry_id = %s""", 340 (start+1, length, seqid))[0]
341
342 - def execute_and_fetch_col0(self, sql, args=None):
343 self.cursor.execute(sql, args or ()) 344 return [field[0] for field in self.cursor.fetchall()]
345
346 - def execute_and_fetchall(self, sql, args=None):
347 self.cursor.execute(sql, args or ()) 348 return self.cursor.fetchall()
349 350 _allowed_lookups = { 351 # Lookup name / function name to get id, function to list all ids 352 'primary_id': "fetch_seqid_by_identifier", 353 'gi': "fetch_seqid_by_identifier", 354 'display_id': "fetch_seqid_by_display_id", 355 'name': "fetch_seqid_by_display_id", 356 'accession': "fetch_seqid_by_accession", 357 'version': "fetch_seqid_by_version", 358 } 359
360 -class BioSeqDatabase:
361 - def __init__(self, adaptor, name):
362 self.adaptor = adaptor 363 self.name = name 364 self.dbid = self.adaptor.fetch_dbid_by_dbname(name)
365
366 - def __repr__(self):
367 return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)
368
369 - def get_Seq_by_id(self, name):
370 """Gets a Bio::Seq object by its name 371 372 Example: seq = db.get_Seq_by_id('ROA1_HUMAN') 373 374 """ 375 seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name) 376 return BioSeq.DBSeqRecord(self.adaptor, seqid)
377
378 - def get_Seq_by_acc(self, name):
379 """Gets a Bio::Seq object by accession number 380 381 Example: seq = db.get_Seq_by_acc('X77802') 382 383 """ 384 seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name) 385 return BioSeq.DBSeqRecord(self.adaptor, seqid)
386
387 - def get_Seq_by_ver(self, name):
388 """Gets a Bio::Seq object by version number 389 390 Example: seq = db.get_Seq_by_ver('X77802.1') 391 392 """ 393 seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name) 394 return BioSeq.DBSeqRecord(self.adaptor, seqid)
395
396 - def get_Seqs_by_acc(self, name):
397 """Gets a *list* of Bio::Seq objects by accession number 398 399 Example: seqs = db.get_Seq_by_acc('X77802') 400 401 """ 402 seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name) 403 return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]
404
405 - def get_PrimarySeq_stream(self):
406 # my @array = $self->get_all_primary_ids; 407 # my $stream = Bio::DB::BioDatabasePSeqStream->new( 408 # -adaptor => $self->_adaptor->db->get_PrimarySeqAdaptor, 409 # -idlist => \@array); 410 raise NotImplementedError("waiting for Python 2.2's iter")
411
412 - def get_all_primary_ids(self):
413 """Array of all the primary_ids of the sequences in the database. 414 415 These maybe ids (display style) or accession numbers or 416 something else completely different - they *are not* 417 meaningful outside of this database implementation. 418 """ 419 return self.adaptor.list_bioentry_ids(self.dbid)
420
421 - def __getitem__(self, key):
422 return BioSeq.DBSeqRecord(self.adaptor, key)
423 - def keys(self):
424 return self.get_all_primary_ids()
425 - def values(self):
426 return [self[key] for key in self.keys()]
427 - def items(self):
428 return [(key, self[key]) for key in self.keys()]
429
430 - def lookup(self, **kwargs):
431 if len(kwargs) != 1: 432 raise TypeError("single key/value parameter expected") 433 k, v = kwargs.items()[0] 434 if k not in _allowed_lookups: 435 raise TypeError("lookup() expects one of %s, not %r" % \ 436 (repr(_allowed_lookups.keys())[1:-1], repr(k))) 437 lookup_name = _allowed_lookups[k] 438 lookup_func = getattr(self.adaptor, lookup_name) 439 seqid = lookup_func(self.dbid, v) 440 return BioSeq.DBSeqRecord(self.adaptor, seqid)
441
442 - def get_Seq_by_primary_id(self, seqid):
443 """Gets a Bio::Seq object by the primary (internal) id. 444 445 The primary id in these cases has to come from 446 $db->get_all_primary_ids. There is no other way to get (or 447 guess) the primary_ids in a database. 448 """ 449 return self[seqid]
450
451 - def load(self, record_iterator, fetch_NCBI_taxonomy=False):
452 """Load a set of SeqRecords into the BioSQL database. 453 454 record_iterator is either a list of SeqRecord objects, or an 455 Iterator object that returns SeqRecord objects (such as the 456 output from the Bio.SeqIO.parse() function), which will be 457 used to populate the database. 458 459 fetch_NCBI_taxonomy is boolean flag allowing or preventing 460 connection to the taxonomic database on the NCBI server 461 (via Bio.Entrez) to fetch a detailed taxonomy for each 462 SeqRecord. 463 464 Example: 465 from Bio import SeqIO 466 count = db.load(SeqIO.parse(open(filename), format)) 467 468 Returns the number of records loaded. 469 """ 470 db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid, \ 471 fetch_NCBI_taxonomy) 472 num_records = 0 473 global _POSTGRES_RULES_PRESENT 474 for cur_record in record_iterator : 475 num_records += 1 476 #Hack to work arround BioSQL Bug 2839 - If using PostgreSQL and 477 #the RULES are present check for a duplicate record before loading 478 if _POSTGRES_RULES_PRESENT: 479 #Recreate what the Loader's _load_bioentry_table will do: 480 if cur_record.id.count(".") == 1: 481 accession, version = cur_record.id.split('.') 482 try : 483 version = int(version) 484 except ValueError : 485 accession = cur_record.id 486 version = 0 487 else: 488 accession = cur_record.id 489 version = 0 490 gi = cur_record.annotations.get("gi", None) 491 sql = "SELECT bioentry_id FROM bioentry WHERE (identifier " + \ 492 "= '%s' AND biodatabase_id = '%s') OR (accession = " + \ 493 "'%s' AND version = '%s' AND biodatabase_id = '%s')" 494 self.adaptor.execute(sql % (gi, self.dbid, accession, version, self.dbid)) 495 if self.adaptor.cursor.fetchone(): 496 try: 497 raise self.adaptor.conn.IntegrityError("Duplicate record " 498 "detected: record has not been inserted") 499 except AttributeError: #psycopg version 1 500 import psycopg 501 raise psycopg.IntegrityError("Psycopg1: Duplicate record " 502 "detected: record has not been inserted") 503 #End of hack 504 db_loader.load_seqrecord(cur_record) 505 return num_records
506