Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames | no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2008 by Peter Cock. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  # 
  7  # Note that BioSQL (including the database schema and scripts) is 
  8  # available and licensed separately.  Please consult www.biosql.org 
  9  """Connect with a BioSQL database and load Biopython-like objects from it. 
 10   
 11  This provides interfaces for loading biological objects from a relational 
 12  database, and is compatible with the BioSQL standards. 
 13  """ 
 14  import BioSeq 
 15  import Loader 
 16  import DBUtils 
 17   
def open_database(driver="MySQLdb", **kwargs):
    """Main interface for loading an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like:

    >>> from BioSQL import BioSeqDatabase
    >>> server = BioSeqDatabase.open_database(user="root", db="minidb")

    The various options are:
    driver -> The name of the database driver to use for connecting. The
    driver should implement the python DB API. By default, the MySQLdb
    driver is used.
    user -> the username to connect to the database with.
    password, passwd -> the password to connect with
    host -> the hostname of the database
    database or db -> the name of the database

    Any other keyword is handed to the driver's connect() unchanged.

    Returns a DBServer wrapping the new connection and the driver module.
    """
    module = __import__(driver)
    connect = getattr(module, "connect")

    # Different drivers use different keywords, so translate on a copy.
    kw = kwargs.copy()
    if driver == "MySQLdb":
        # MySQLdb spells these two keywords "db" and "passwd".
        if "database" in kw:
            kw["db"] = kw.pop("database")
        if "password" in kw:
            kw["passwd"] = kw.pop("password")
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw.pop("db")
        if "passwd" in kw:
            kw["password"] = kw.pop("passwd")

    # With psycopg, fall back to the "template1" database when none is named.
    if driver == "psycopg" and not kw.get("database"):
        kw["database"] = "template1"

    try:
        conn = connect(**kw)
    except module.InterfaceError:
        # Ok, so let's try building a DSN
        # (older releases of psycopg need this)
        if "database" in kw:
            kw["dbname"] = kw.pop("database")
        elif "db" in kw:
            kw["dbname"] = kw.pop("db")

        # Format with %s rather than str.join so that non-string values
        # (an integer port, for instance) do not raise TypeError.
        dsn = " ".join(["%s=%s" % (key, value) for key, value in kw.items()])
        conn = connect(dsn)

    return DBServer(conn, module)
74
class DBServer:
    """Dictionary-like view of the biodatabases reachable over one connection.

    Indexing by biodatabase name gives a BioSeqDatabase; keys(), values()
    and items() enumerate every biodatabase known to the server.
    """

    def __init__(self, conn, module, module_name=None):
        """Wrap an open DB-API connection.

        conn -> an open connection created by the driver module.
        module -> the driver module itself.
        module_name -> name used to select the driver specific helpers
        and the schema loading strategy; defaults to module.__name__.
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
        self.module_name = module_name

    def __repr__(self):
        return self.__class__.__name__ + "(%r)" % self.adaptor.conn

    def __getitem__(self, name):
        return BioSeqDatabase(self.adaptor, name)

    def keys(self):
        """Return the names of all biodatabases on the server."""
        return self.adaptor.list_biodatabase_names()

    def values(self):
        """Return a BioSeqDatabase for every biodatabase on the server."""
        return [self[key] for key in self.keys()]

    def items(self):
        """Return (name, BioSeqDatabase) pairs for every biodatabase."""
        return [(key, self[key]) for key in self.keys()]

    def remove_database(self, db_name):
        """Try to remove all references to items in a database.

        Raises whatever the adaptor raises when db_name is unknown.
        """
        db_id = self.adaptor.fetch_dbid_by_dbname(db_name)
        remover = Loader.DatabaseRemover(self.adaptor, db_id)
        remover.remove()

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it.

        The INSERT is executed through the adaptor; no commit is issued
        here.
        """
        # make the database
        sql = r"INSERT INTO biodatabase (name, authority, description)" \
              r" VALUES (%s, %s, %s)"
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Raises ValueError if the driver is neither psycopg nor MySQLdb.
        """
        # Not sophisticated enough for PG schema. Is it needed by MySQL?
        # Looks like we need this more complicated way for both. Leaving it
        # the default and removing the simple-minded approach.

        # Read the file with comment and blank lines removed.  The file is
        # opened in text mode because its lines are compared with str
        # values, and it is always closed, even if reading fails.
        pieces = []
        sql_handle = open(sql_file, "r")
        try:
            for line in sql_handle:
                if line.startswith("--"):  # don't include comment lines
                    continue
                if line.startswith("#"):  # ditto for MySQL comments
                    continue
                stripped = line.strip()
                if stripped:  # only include non-blank lines
                    pieces.append(stripped + " ")
        finally:
            sql_handle.close()
        sql = "".join(pieces)

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["MySQLdb"]:
            for sql_line in sql.split(";"):  # one item per sql command
                # Skip blank fragments (normally only the text after the
                # final semicolon) instead of always dropping the last
                # item, which lost the final statement of a file that
                # did not end with a semicolon.
                if sql_line.strip():
                    self.adaptor.cursor.execute(sql_line)
        else:
            raise ValueError("Module %s not supported by the loader." %
                             (self.module_name))
148
class Adaptor:
    """Convenience wrapper around a DB-API connection for BioSQL queries.

    A single cursor is created from the connection at construction time
    and reused for every statement.  dbutils supplies the driver specific
    operations used by last_id() and autocommit().
    """

    def __init__(self, conn, dbutils):
        self.conn = conn
        self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        return self.conn.commit()

    def rollback(self):
        return self.conn.rollback()

    def close(self):
        return self.conn.close()

    def _restrict_to_db(self, sql, fields, dbid):
        """Return (sql, fields) limited to biodatabase dbid when it is true."""
        if dbid:
            return sql + " and biodatabase_id = %s", fields + [dbid]
        return sql, fields

    def _fetch_single_seqid(self, sql, fields, dbid, label, value,
                            check_unique=True):
        """Run a bioentry_id query and return the id from its first row.

        Raises IndexError if nothing matches, or, when check_unique is
        true, if more than one row matches.  label and value are only
        used to build the error messages.
        """
        sql, fields = self._restrict_to_db(sql, fields, dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find %s %r" % (label, value))
        if check_unique and len(rv) > 1:
            raise IndexError("More than one entry with %s %r"
                             % (label, value))
        return rv[0][0]

    def fetch_dbid_by_dbname(self, dbname):
        """Return the biodatabase_id for dbname, raising KeyError if absent."""
        self.cursor.execute(
            r"select biodatabase_id from biodatabase where name = %s",
            (dbname,))
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError("Cannot find biodatabase with name %r" % dbname)
        # The original authors note that duplicates "cannot happen (UK)",
        # presumably a unique key on the name, so no check is made here.
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the bioentry_id whose name column equals name."""
        return self._fetch_single_seqid(
            r"select bioentry_id from bioentry where name = %s",
            [name], dbid, "display id", name)

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the single bioentry_id with the given accession."""
        return self._fetch_single_seqid(
            r"select bioentry_id from bioentry where accession = %s",
            [name], dbid, "accession", name)

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list of every bioentry_id with the given accession."""
        sql, fields = self._restrict_to_db(
            r"select bioentry_id from bioentry where accession = %s",
            [name], dbid)
        return self.execute_and_fetch_col0(sql, fields)

    def fetch_seqid_by_version(self, dbid, name):
        """Return the bioentry_id for an "accession.version" string.

        A name without a "." is looked up with version "0"; more than
        one "." raises IndexError.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError("Bad version %r" % name)
        acc = acc_version[0]
        if len(acc_version) == 2:
            version = acc_version[1]
        else:
            version = "0"
        sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
              r" AND version = %s"
        return self._fetch_single_seqid(
            sql, [acc, version], dbid, "version", name)

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the first bioentry_id with the given identifier.

        Unlike the other single id lookups, several matches are not
        treated as an error; the first row is used.
        """
        # YB: was fetch_seqid_by_seqid
        return self._fetch_single_seqid(
            "SELECT bioentry_id FROM bioentry WHERE identifier = %s",
            [identifier], dbid, "identifier", identifier,
            check_unique=False)

    def list_biodatabase_names(self):
        return self.execute_and_fetch_col0(
            "SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_bioentry_display_ids(self, dbid):
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        # The helper lives on the adaptor, not on the DB-API cursor.
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Execute sql and return its only row.

        Raises AssertionError unless exactly one row comes back.
        """
        self.cursor.execute(sql, args or ())
        rv = self.cursor.fetchall()
        # Raised explicitly so the check survives running under -O.
        if len(rv) != 1:
            raise AssertionError("Expected 1 response, got %d" % len(rv))
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command.
        """
        self.cursor.execute(sql, args or ())

    def get_subseq_as_string(self, seqid, start, end):
        """Return the stored sequence of seqid from offset start up to end.

        start is shifted by one for the SQL SUBSTRING call and
        end - start characters are requested.
        """
        length = end - start
        return self.execute_one(
            """select SUBSTRING(seq FROM %s FOR %s)
                     from biosequence where bioentry_id = %s""",
            (start+1, length, seqid))[0]

    def execute_and_fetch_col0(self, sql, args=None):
        """Execute sql and return the first column of every row as a list."""
        self.cursor.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Execute sql and return all rows as given by the cursor."""
        self.cursor.execute(sql, args or ())
        return self.cursor.fetchall()
# Keyword accepted by BioSeqDatabase.lookup() -> name of the Adaptor method
# that resolves a value of that kind to a bioentry id.
_allowed_lookups = dict(
    primary_id="fetch_seqid_by_identifier",
    gi="fetch_seqid_by_identifier",
    display_id="fetch_seqid_by_display_id",
    name="fetch_seqid_by_display_id",
    accession="fetch_seqid_by_accession",
    version="fetch_seqid_by_version",
)
class BioSeqDatabase:
    """One named biodatabase, reached through a shared adaptor.

    The biodatabase id is resolved once, at construction time, and is
    then used to scope the lookups made through this object.
    """

    def __init__(self, adaptor, name):
        self.adaptor = adaptor
        self.name = name
        self.dbid = self.adaptor.fetch_dbid_by_dbname(name)

    def __repr__(self):
        return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)

    def get_Seq_by_id(self, name):
        """Gets a DBSeqRecord object by its name

        Example: seq = db.get_Seq_by_id('ROA1_HUMAN')

        """
        seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_acc(self, name):
        """Gets a DBSeqRecord object by accession number

        Example: seq = db.get_Seq_by_acc('X77802')

        """
        seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_ver(self, name):
        """Gets a DBSeqRecord object by version number

        Example: seq = db.get_Seq_by_ver('X77802.1')

        """
        seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seqs_by_acc(self, name):
        """Gets a *list* of DBSeqRecord objects by accession number

        Example: seqs = db.get_Seqs_by_acc('X77802')

        """
        seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
        return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]

    def get_PrimarySeq_stream(self):
        """Not implemented; always raises NotImplementedError."""
        # Perl code this was meant to mirror:
        # my @array = $self->get_all_primary_ids;
        # my $stream = Bio::DB::BioDatabasePSeqStream->new(
        #         -adaptor => $self->_adaptor->db->get_PrimarySeqAdaptor,
        #         -idlist => \@array);
        raise NotImplementedError("waiting for Python 2.2's iter")

    def get_all_primary_ids(self):
        """Array of all the primary_ids of the sequences in the database.

        These maybe ids (display style) or accession numbers or
        something else completely different - they *are not*
        meaningful outside of this database implementation.
        """
        return self.adaptor.list_bioentry_ids(self.dbid)

    def __getitem__(self, key):
        # key is a primary (internal) id, as returned by keys().
        return BioSeq.DBSeqRecord(self.adaptor, key)

    def keys(self):
        """Return all primary ids; same as get_all_primary_ids()."""
        return self.get_all_primary_ids()

    def values(self):
        """Return a DBSeqRecord for every primary id in the database."""
        return [self[key] for key in self.keys()]

    def items(self):
        """Return (primary id, DBSeqRecord) pairs for the whole database."""
        return [(key, self[key]) for key in self.keys()]

    def lookup(self, **kwargs):
        """Return the DBSeqRecord matching a single keyword=value pair.

        The keyword must be one of the names in the module level
        _allowed_lookups table, for example db.lookup(accession='X77802').

        Raises TypeError unless exactly one supported keyword is given.
        """
        if len(kwargs) != 1:
            raise TypeError("single key/value parameter expected")
        # list() keeps this working where items() is not a list.
        k, v = list(kwargs.items())[0]
        if k not in _allowed_lookups:
            # Joining the reprs gives the text of repr(list)[1:-1] without
            # relying on keys() returning a list.
            allowed = ", ".join([repr(key) for key in _allowed_lookups])
            raise TypeError("lookup() expects one of %s, not %r" %
                            (allowed, repr(k)))
        lookup_name = _allowed_lookups[k]
        lookup_func = getattr(self.adaptor, lookup_name)
        seqid = lookup_func(self.dbid, v)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_primary_id(self, seqid):
        """Gets a DBSeqRecord object by the primary (internal) id.

        The primary id in these cases has to come from
        get_all_primary_ids. There is no other way to get (or
        guess) the primary_ids in a database.
        """
        return self[seqid]

    def load(self, record_iterator):
        """Load a set of SeqRecords into the BioSQL database.

        record_iterator is either a list of SeqRecord objects, or an
        Iterator object that returns SeqRecord objects (such as the
        output from the Bio.SeqIO.parse() function), which will be
        used to populate the database.

        Example:
        from Bio import SeqIO
        count = db.load(SeqIO.parse(open(filename), format))

        Returns the number of records loaded.
        """
        db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid)
        num_records = 0
        for cur_record in record_iterator:
            num_records += 1
            db_loader.load_seqrecord(cur_record)
        return num_records
422