Package Bio :: Package config :: Module DBRegistry
[hide private]
[frames] | no frames]

Source Code for Module Bio.config.DBRegistry

  1  # Copyright 2002 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  # The SQL and Corba code was modified from an original implementation by 
  7  # Brad Chapman. 
  8   
  9  """Implements Registry to access databases.  These objects access 
 10  databases using a dictionary-like interface, where the key is the ID 
 11  of the thing to look up, and the value returned is the data associated 
 12  with the key. 
 13   
 14  Classes: 
 15  DBRegistry     Accesses databases with a dictionary-like interface. 
 16  DBObject       Base class for Registry objects for databases. 
 17  DBGroup        Groups DBObjects. 
 18   
 19  CGIDB          Accesses CGI databases. 
 20  EUtilsDB       Accesses NCBI using EUtils. 
 21  BioSQLDB       Accesses a BioSQL database. 
 22  BioCorbaDB     Accesses a BioCorba database. 
 23  IndexedFileDB  Accesses a Mindy Indexed file. 
 24  """ 
 25  from Bio.config.Registry import * 
 26   
class DBRegistry(Registry):
    """Registry that exposes databases through a dictionary-like interface.

    """

    def __init__(self, name, load_path=None):
        """DBRegistry(name[, load_path])

        Both arguments are handed unchanged to Registry.__init__.
        """
        Registry.__init__(self, name, load_path=load_path)

# Module-level registry instance used for access to databases.
db = DBRegistry("db", "Bio.dbdefs")

37 -def _clean_abbrev(abbrev):
38 return abbrev.replace("-", "_")
39
class DBObject(RegisterableObject):
    """This is a base class for dictionary-like interfaces to
    databases.

    Methods:
    get          Lookup a key in a database, with a default value.
    get_as       Lookup a key and convert to an object.
    __getitem__  Lookup a key in a database.

    THE FOLLOWING SHOULD BE IMPLEMENTED IN A DERIVED CLASS.
    _get         Return the data indicated by key.
    _convert_to  Convert the data to another object.
    IMPLEMENT THESE ONLY IF TIMEOUT OR CONCURRENT ACCESS IS NEEDED.
    _make_pickleable    Make the object returned by _get to a pickleable.
    _unmake_pickleable  Turn the pickleable object back into the original

    """

    def __init__(self, name, abbrev=None, doc=None, delay=None, timeout=None):
        """DBObject(name[, abbrev][, doc][, delay][, timeout])

        name, the cleaned-up abbrev (falling back to name when abbrev
        is empty) and doc are passed on to RegisterableObject.__init__.

        If delay is given, _get is wrapped with
        _support.make_rate_limited_function(self._get, delay).

        timeout is accepted so that callers supplying a timeout keyword
        do not fail with a TypeError; no timeout is applied by this class.
        """
        import _support
        abbrev = _clean_abbrev(abbrev or name)
        RegisterableObject.__init__(self, name, abbrev, doc)
        if delay is not None:
            # Shadow the method on this instance only with the
            # rate-limited wrapper.
            x = _support.make_rate_limited_function(self._get, delay)
            setattr(self, "_get", x)

    def set(self, key, data):
        """S.set(key, data); delegates to _set."""
        self._set(key, data)

    def get(self, key, default=None):
        """S.get(key[, default]) -> data"""
        try:
            results = self[key]
        except KeyError:
            results = default
        return results

    def get_as(self, key, to_io=None, default=None):
        """S.get_as(key[, to_io][, default]) -> object"""
        # Note that the default value is passed through _convert_to as well.
        data = self.get(key, default=default)
        return self._convert_to(data, to_io)

    def __getitem__(self, key):
        try:
            return self._get(key)
        except IOError as x:
            # A "timed out" IOError is reported as a missing key so that
            # get() can fall back to its default; anything else propagates.
            if str(x) == "timed out":
                raise KeyError(x)
            raise

    # THESE FUNCTIONS CAN BE OVERLOADED IN A DERIVED CLASS.

    def _get(self, key):
        """S._get(key) -> data"""
        # Look up a key in the DB and return the data.
        raise NotImplementedError("Please implement in a derived class.")

    def _convert_to(self, data, to_io):
        """S._convert_to(data, to_io) -> another data type"""
        # Convert the data returned by _get to the type specified by
        # to_io, which is a FormatIO object.
        # This base implementation does nothing and returns None.

    def _set(self, key, data):
        """S._set(key, data)"""
        # Not used.  May be used in the future to support caching.
        raise NotImplementedError("Caching not supported here.")

    def _make_pickleable(self, data):
        """S._make_pickleable(data) -> pickleable_obj"""
        # Make the handle a pickle-able python object.
        # Only need to implement if supporting timeout or concurrent
        # access.
        raise NotImplementedError("pickling not supported.")

    def _unmake_pickleable(self, pickleable_obj):
        """S._unmake_pickleable(pickleable_obj) -> data"""
        # Turn the pickle-able python object back into a handle.
        # Only need to implement if supporting timeout or concurrent
        # access.
        raise NotImplementedError("pickling not supported.")
116
class DBGroup(RegisterableGroup):
    """Groups DBObjects that return the same kind of data.

    """

    def __init__(self, name, abbrev=None, doc=None, cache=None):
        """DBGroup(name[, abbrev][, doc][, cache])

        name is the name of the object, and abbrev is an abbreviation
        for the name.  cache is accepted but not used by this constructor.
        """
        abbrev = _clean_abbrev(abbrev or name)
        RegisterableGroup.__init__(self, name, abbrev, doc)
        # Remembers which member answered the most recent successful
        # lookup; get_as uses its _convert_to.
        self._last_object_used = None

    def __getitem__(self, key):
        """Return the data from the first member object that yields key.

        Members in self.objs are tried in order.  Raises KeyError when
        none of them produces a result.
        """
        for obj in self.objs:
            try:
                handle = obj[key]
            except (SystemError, KeyboardInterrupt):
                # Must be a tuple: "except SystemError, KeyboardInterrupt"
                # would only catch SystemError and rebind the name.
                raise
            except Exception:
                # Deliberate best effort: any failure in one member just
                # means the next member is tried.
                continue
            else:
                self._last_object_used = obj
                return handle
        raise KeyError("I could not get any results.")

    def get(self, key, default=None):
        """S.get(key[, default]) -> data"""
        try:
            data = self[key]
        except KeyError:
            data = default
        return data

    def get_as(self, key, to_io=None, default=None):
        """S.get_as(key[, to_io][, default]) -> object"""
        data = self.get(key, default=default)
        converter = self._last_object_used
        if converter is None:
            # No member has ever answered a lookup, so there is nothing
            # to convert with; hand the data (the default) back untouched
            # instead of failing on None._convert_to.
            return data
        return converter._convert_to(data, to_io)
155
class TextLikeMixin:
    """Mixin class with useful functionality for retrieval of text files.

    This implements some useful helper functions and overrides of DBObject
    for those implementations which need to retrieve text, check for errors in
    the retrieved text, and then convert that text to other formats.
    """

    def _check_for_errors(self, handle, failure_cases):
        """Scan handle for known error patterns and return a usable handle.

        failure_cases is a sequence of (expression, errormsg) pairs; each
        expression must provide make_parser().  When it is empty or None
        the handle is returned untouched.  Otherwise the handle is wrapped
        in a ReseekFile, every expression is run over it from the starting
        position, and KeyError(errormsg) is raised for the first one that
        is recognized.  The wrapped handle is returned rewound to the
        starting position.
        """
        if not failure_cases:
            return handle

        # Imported only when there is something to check, so that the
        # parsing packages are not needed when no failure cases exist.
        from Martel import Parser
        from Bio import StdHandler
        from Bio.EUtils.ReseekFile import ReseekFile

        handle = ReseekFile(handle)
        pos = handle.tell()
        for expression, errormsg in failure_cases:
            handle.seek(pos)
            parser = expression.make_parser()
            handler = StdHandler.RecognizeHandler()
            parser.setContentHandler(handler)
            parser.setErrorHandler(handler)
            try:
                parser.parseFile(handle)
            except Parser.ParserException:
                # A parse failure just means this pattern did not match.
                pass
            if handler.recognized:
                raise KeyError(errormsg)
        handle.seek(pos)
        return handle

    def _convert_to(self, handle, to_io):
        """Read handle with to_io and return a single record.

        If to_io.read() gives back a FormatIO.FormatIOIterator it must
        yield exactly one record; AssertionError is raised for zero or
        for more than one.  Any other return value is passed through.
        """
        from Bio import FormatIO
        x = to_io.read(handle)
        if not isinstance(x, FormatIO.FormatIOIterator):
            return x
        records = []
        for rec in x:
            if records:
                raise AssertionError("Multiple records returned")
            records.append(rec)
        if not records:
            # Previously this fell through to an unbound local variable.
            raise AssertionError("No records returned")
        return records[0]
199
class CGIDB(DBObject, TextLikeMixin):
    """This class implements DBObject for accessing CGI databases.

    """

    def __init__(self, name, cgi, url=None, key=None, params=None,
                 abbrev=None, doc=None, delay=None, timeout=None,
                 getmethod=1, failure_cases=None):
        """CGIDB(name, cgi[, url][, key][, params][, abbrev][, doc]
        [, delay][, timeout][, getmethod][, failure_cases])

        name is the name of the object, abbrev is an abbreviation for
        the name, and doc is some documentation describing the object.

        cgi is the URL for the cgi script.  url points to the
        human-readable URL of the form.

        params is a list of (key, value) tuples indicating the
        parameters that should be passed to the CGI script.  key is
        the name of the parameter for the CGI script whose value is
        the ID of the object to retrieve.

        getmethod is a boolean describing whether a GET or POST should
        be used.  By default, GET is used.

        failure_cases is a list of (Martel Expression, error message)
        describing patterns of errors in the text returned by the
        script.

        timeout is accepted for interface compatibility and is
        currently unused.

        """
        import _support
        DBObject.__init__(self, name=name, abbrev=abbrev,
                          doc=doc, delay=delay)
        self.cgi = cgi
        self.key = key or ''
        self.params = params or []
        self.url = url
        self.getmethod = getmethod
        self.failure_cases = []
        for exp, message in failure_cases or []:
            exp = _support.make_cached_expression(exp)
            self.failure_cases.append((exp, message))

    def _normalize_params(self, key):
        """Return the fixed parameters plus the (self.key, key) pair as a list."""
        params = self.params
        # Accept a mapping or any sequence of pairs, not just a list.
        if hasattr(params, "items"):
            params = params.items()
        return list(params) + [(self.key, key)]

    def _get(self, key):
        """Fetch key from the CGI script and check the reply for errors."""
        handle = self._cgiopen(key)
        handle = self._check_for_errors(handle, self.failure_cases)
        return handle

    def _cgiopen(self, key):
        """Open the CGI script for key and return the resulting handle.

        The encoded parameters go into the query string when
        self.getmethod is true, and are passed as POST data otherwise.
        """
        import urllib
        params = self._normalize_params(key)
        options = _my_urlencode(params)
        if self.getmethod:
            fullcgi = self.cgi
            if options:
                fullcgi = "%s?%s" % (self.cgi, options)
            handle = urllib.urlopen(fullcgi)
        else:    # do a POST
            handle = urllib.urlopen(self.cgi, options)
        return handle

    def _make_pickleable(self, handle):
        """Return the full contents read from handle."""
        return handle.read()

    def _unmake_pickleable(self, obj):
        """Wrap the previously read contents in a StringIO handle."""
        import StringIO
        return StringIO.StringIO(obj)
269
class EUtilsDB(DBObject, TextLikeMixin):
    """Implement DBObject for accessing EUtils databases at NCBI.
    """

    def __init__(self, name, db, rettype, abbrev = None, doc = None,
                 failure_cases = None, delay = None, timeout = None):
        """Initialize an EUtilsDB connection for retrieval.

        name is the name of the object, abbrev is an abbreviation for
        the name, and doc is some documentation describing the object.

        db is the name of the database at NCBI you want to retrieve from
        (ie. protein, nucleotide, pubmed)

        rettype is the type of information to return
        (ie. gp, gb, fasta, medline)

        failure_cases is a list of (Martel Expression, error message)
        describing patterns of errors in the text returned by the
        script.

        timeout is accepted for interface compatibility and is
        currently unused.
        """
        import _support
        DBObject.__init__(self, name=name, abbrev=abbrev,
                          doc=doc, delay=delay)
        self.db = db
        self.rettype = rettype
        self.failure_cases = []
        for exp, message in failure_cases or []:
            exp = _support.make_cached_expression(exp)
            self.failure_cases.append((exp, message))

    def _get(self, key):
        """Implementation of retrieval -- uses the DBIds client from EUtils.

        The text handle returned by efetch is run through
        _check_for_errors before being returned.
        """
        from Bio.EUtils import DBIds
        from Bio.EUtils import DBIdsClient
        db_id = DBIds(self.db, [key])
        eutils_client = DBIdsClient.from_dbids(db_id)
        handle = eutils_client.efetch(retmode = "text", rettype =
                                      self.rettype)
        handle = self._check_for_errors(handle, self.failure_cases)
        return handle
311
class BioSQLDB(DBObject):
    """Represent a BioSQL-style database to retrieve SeqRecord objects.

    This returns a SeqRecord-like object from _get() instead of a
    handle (since BioSQL is not going to give you a handle).

    """

    def __init__(self, name, doc = "", db_host = 'localhost', db_port = '',
                 db_user = 'root', db_passwd = '', sql_db = '',
                 namespace_db = '', db_type = 'mysql'):
        """Initialize with information for connecting to the BioSQL db.

        The connection settings are only stored here; the database is
        opened when _get is called.
        """
        DBObject.__init__(self, name=name, doc=doc)
        self.db_host = db_host
        self.db_port = db_port
        self.db_user = db_user
        self.db_passwd = db_passwd
        self.sql_db = sql_db
        self.namespace_db = namespace_db
        self.db_type = db_type

    def _get_db_module(self, db_type):
        """Retrieve the appropriate module to use for connecting to a database

        This parses a description of the database and tries to determine
        which module is appropriate for that database type.  Only 'mysql'
        is supported; anything else raises ValueError.
        """
        if db_type in ['mysql']:
            return 'MySQLdb'
        elif db_type in ['pg', 'postgres', 'postgresql']:
            raise ValueError("Postgres not supported yet. Sorry.")
        else:
            raise ValueError("Unknown database type: %s" % db_type)

    def _get(self, key):
        """Open the database and return the item stored under key.

        key is tried first as an accession and then as a display_id.
        Raises KeyError when neither lookup finds anything.
        """
        # do the import here to prevent circular import problems
        from BioSQL import BioSeqDatabase

        # We don't worry about what the id is called right now, and just
        # try to find it in the database any way we can
        find_id = key

        db_driver = self._get_db_module(self.db_type)
        open_args = {"user" : self.db_user,
                     "passwd" : self.db_passwd,
                     "host" : self.db_host,
                     "db" : self.sql_db,
                     "driver" : db_driver}
        if self.db_port:
            open_args["port"] = self.db_port
        server = BioSeqDatabase.open_database(**open_args)
        db = server[self.namespace_db]
        # try our different id choices to test the query
        item = None
        for possible_id_type in ["accession", "display_id"]:
            try:
                item = db.lookup(**{possible_id_type : find_id})
            except IndexError:
                continue
            if item is not None:
                # Stop at the first hit so a later id type can neither
                # overwrite it nor trigger a needless extra query.
                break
        if item is None:
            raise KeyError("Could not get item with id: %s" % find_id)
        return item

    def _convert_to(self, data, to_io):
        """Return data unchanged; only SeqRecord.io is accepted as to_io."""
        from Bio import SeqRecord
        if to_io != SeqRecord.io:
            raise ValueError("format %s not supported" % to_io.name)
        return data

    def _make_pickleable(self, item):
        """Return item as is."""
        return item

    def _unmake_pickleable(self, item):
        """Return item as is."""
        return item
387
class BioCorbaDB(DBObject):
    """Represent a BioCorba BioSequenceCollection for SeqRecord objects.

    Returns SeqRecord-like objects.

    """

    def __init__(self, name, ior_ref, server_type=None, doc=""):
        """Initialize with IOR reference for a BioCorba Collection.

        ior_ref is a URL or file reference to an IOR string. The IOR
        should reference a BioSequenceCollection. This is the top level
        BioCorba object we should use for making objects available.

        server_type is a hack parameter which might be necessary if there
        are server/client issues (ie. as with Perl ORBit) that we need
        to muck around with. If not set, we just use a standard retriever.
        """
        DBObject.__init__(self, name=name, doc=doc)
        self.retriever = self._get_retriever(server_type)
        self.ior_ref = ior_ref
        # Filled in by the first call to _get.
        self.corba_dict = None

    def _get_retriever(self, server_type):
        """Return a BioCorba retriever object based on the specified server.

        server_type None selects the generic client; otherwise the
        lower-cased value is searched for "python", "java" and "perl",
        in that order.  Raises ValueError when none of them is found.
        """
        # BioCorba is imported here so that it only has to be installed
        # when this class is actually used.
        from BioCorba.Client.BiocorbaConnect import PerlCorbaClient, \
             PythonCorbaClient, JavaCorbaClient, GenericCorbaClient
        from BioCorba.Client.Seqcore.CorbaCollection import \
             BioSequenceCollection

        if server_type is None:
            client_type = GenericCorbaClient
        else:
            server_type = server_type.lower()
            known_clients = (("python", PythonCorbaClient),
                             ("java", JavaCorbaClient),
                             ("perl", PerlCorbaClient))
            for marker, candidate in known_clients:
                if marker in server_type:
                    client_type = candidate
                    break
            else:
                raise ValueError("Unexpected server type specified: %s" %
                                 server_type)

        return client_type(BioSequenceCollection)

    def _get_corba_client(self, ior_ref, retriever):
        """Get a connection to the CORBA server based on the ior_ref
        """
        # Local import so the module as a whole does not need BioCorba.
        from BioCorba.Bio import GenBank

        if "http" in ior_ref:
            # contains "http": treat it as a url
            client = retriever.from_url_ior(ior_ref)
        else:
            # anything else is treated as a file
            client = retriever.from_file_ior(ior_ref)

        parser = GenBank.FeatureParser()
        return GenBank.Dictionary(client, parser)

    def _get(self, key):
        """Return the entry for key, connecting to the server on first use."""
        if self.corba_dict is None:
            self.corba_dict = self._get_corba_client(self.ior_ref,
                                                     self.retriever)
        return self.corba_dict[key]

    def _convert_to(self, data, to_io):
        """Return data unchanged; only SeqRecord.io is accepted as to_io."""
        from Bio import SeqRecord
        if to_io != SeqRecord.io:
            raise ValueError("format %s not supported" % to_io.name)
        return data
465
class IndexedFileDB(DBObject):
    """Return SeqRecord objects from an indexed file.

    This module deals with both flat file and BerkeleyDB indexes.
    These indexed files can be created by any of the compliant indexing
    implementations from Biopython, BioPerl, BioJava, etc...

    """

    def __init__(self, name, dbname, doc = ""):
        """Initialize with information about loading the database.

        dbname is the name of the database to open. This will likely
        be a filesystem path to a database directory.
        """
        DBObject.__init__(self, name=name, doc=doc)
        self.db = self._load_database(dbname)

    def _load_database(self, name):
        """Get a connection with the given database.
        """
        from Bio import Mindy
        db = Mindy.open(dbname = name)
        return db

    def _get_check_names(self, given_name, db):
        """Get a list of all namespaces to search for the file under.

        If given_name is a valid key, then it is returned as the only
        thing to check. Otherwise, we go forward and check all possible
        namespaces.
        """
        if given_name is not None and given_name in db.keys():
            return [given_name]
        else:
            return db.keys()

    def _get(self, key):
        """Do the database retrieval of the sequence, returning a handle.

        key must be a two-item sequence of (namespace, key); anything
        else raises ValueError.  Raises KeyError when no namespace yields
        a hit and AssertionError when the lookup gives more than one hit.
        """
        # XXX jchang: how does this namespace/key stuff work?  can we
        # get rid of namespace?
        import operator
        import StringIO
        if not operator.isSequenceType(key) or len(key) != 2:
            raise ValueError("Key should be tuple of (namespace, key)")
        namespace, key = key
        names_to_check = self._get_check_names(namespace, self.db)
        # Start with "no hits" so an empty list of namespaces is handled.
        location = []
        for check_name in names_to_check:
            location = self.db.lookup(**{check_name : key})
            if len(location) >= 1:
                break
        if len(location) == 0:
            # Report a miss as KeyError so that get() can supply its default.
            raise KeyError("Could not find key: %s" % (key,))
        if len(location) != 1:
            raise AssertionError("Got multiple hits: %s" % (location,))
        # StringIO is the module here; the class lives inside it.
        return StringIO.StringIO(location[0].text)

    def _convert_to(self, handle, to_io):
        """Read handle with to_io and return a single record.

        If to_io.read() gives back a FormatIO.FormatIOIterator it must
        yield exactly one record; AssertionError is raised for zero or
        for more than one.  Any other return value is passed through.
        """
        from Bio import FormatIO
        x = to_io.read(handle)
        if not isinstance(x, FormatIO.FormatIOIterator):
            return x
        records = []
        for rec in x:
            if records:
                raise AssertionError("Multiple records returned")
            records.append(rec)
        if not records:
            # Previously this fell through to an unbound local variable.
            raise AssertionError("No records returned")
        return records[0]
532
533 -def _my_urlencode(params):
534 # urllib only handles key=value pairs. However, some CGI 535 # scripts also contain parameters that are passed without the 536 # key= part. Thus, search through the params for empty 537 # strings (or None), and handle these myself. 538 539 # params could be a dictionary of key->value or a list of 540 # (key,value) pairs. If it's a dictionary, convert it to a list. 541 import operator 542 import urllib 543 544 if operator.isMappingType(params) and hasattr(params, "items"): 545 params = params.items() 546 547 paramlist = [] 548 for key, value in params: 549 if key: 550 paramlist.append(urllib.urlencode([(key, value)])) 551 else: 552 paramlist.append(urllib.quote_plus(value)) 553 return '&'.join(paramlist)
554