Package Bio :: Package Entrez
[hide private]
[frames | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15   
 16  Functions: 
 17  efetch       Retrieves records in the requested format from a list of one or 
 18               more primary IDs or from the user's environment 
 19  epost        Posts a file containing a list of primary IDs for future use in 
 20               the user's environment to use with subsequent search strategies 
 21  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 22               and ESummary) and term translations and optionally retains 
 23               results for future use in the user's environment. 
 24  elink        Checks for the existence of an external or Related Articles link 
 25               from a list of one or more primary IDs.  Retrieves primary IDs 
 26               and relevancy scores for links to Entrez databases or Related 
 27               Articles;  creates a hyperlink to the primary LinkOut provider 
 28               for a specific ID and database, or lists LinkOut URLs 
 29               and Attributes for multiple IDs. 
 30  einfo        Provides field index term counts, last update, and available 
 31               links for each database. 
 32  esummary     Retrieves document summaries from a list of primary IDs or from 
 33               the user's environment. 
 34  egquery      Provides Entrez database counts in XML for a single search 
 35               using Global Query. 
 36  espell       Retrieves spelling suggestions. 
 37   
 38  read         Parses the XML results returned by any of the above functions. 
 39               Typical usage is: 
 40               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 41               >>> record = Entrez.read(handle) 
 42               where record is now a Python dictionary or list. 
 43   
 44  _open        Internally used function. 
 45   
 46  """ 
 47  import urllib, time 
 48  import os.path 
 49  from Bio import File 
 50   
def query(cmd, db, cgi='http://www.ncbi.nlm.nih.gov/sites/entrez',
          **keywds):
    """Warn that Bio.Entrez.query is no longer supported (DEPRECATED).

    This function used to query the Entrez web site directly; see
    http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helplinks.chapter.linkshelp
    for the meaning of the parameters it once forwarded.

    All arguments (cmd, db, cgi and any extra keywords) are accepted only
    for backwards compatibility and are ignored.  The function emits a
    DeprecationWarning and returns None; no network access is performed.
    """
    from warnings import warn
    message = ("Bio.Entrez.query is deprecated, since it breaks NCBI's rule "
               "to only use the E-Utilities URL.")
    warn(message, DeprecationWarning)
64 65 # XXX retmode?
def epost(db, cgi=None, **keywds):
    """Post a list of identifiers to Entrez for later use.

    EPost uploads a list of UIs into the user's environment on the NCBI
    servers so that subsequent search strategies can refer to them.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    db is sent as the "db" parameter; any further keyword arguments are
    passed on unchanged.  Supplying cgi is deprecated: it triggers a
    DeprecationWarning and the value is ignored in favour of the NCBI
    E-Utilities URL.

    Returns a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        # A caller-supplied URL is never used; it only produces a warning.
        from warnings import warn
        warn("Using a URL other than NCBI's main url for the E-Utilities "
             "is deprecated.", DeprecationWarning)
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    return _open(endpoint, dict(db=db, **keywds))
86
def efetch(db, cgi=None, **keywds):
    """Fetch Entrez records and return them as a handle.

    EFetch retrieves records in the requested format, either for a list
    of one or more UIs or from the user's environment.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    db is sent as the "db" parameter; any further keyword arguments are
    passed on unchanged.  Supplying cgi is deprecated: it triggers a
    DeprecationWarning and the value is ignored in favour of the NCBI
    E-Utilities URL.

    Returns a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="genbank")
    print handle.read()
    """
    if cgi:
        # A caller-supplied URL is never used; it only produces a warning.
        from warnings import warn
        warn("Using a URL other than NCBI's main url for the E-Utilities "
             "is deprecated.", DeprecationWarning)
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    return _open(endpoint, dict(db=db, **keywds))
113
def esearch(db, term, cgi=None, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch looks up primary IDs (for use with EFetch, ELink and
    ESummary) together with term translations, and can optionally keep
    the results in the user's environment for later use.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    db and term are sent as the "db" and "term" parameters; any further
    keyword arguments are passed on unchanged.  Supplying cgi is
    deprecated: it triggers a DeprecationWarning and the value is ignored
    in favour of the NCBI E-Utilities URL.

    Returns a handle to the results, which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
    record = Entrez.read(handle)
    print record["Count"]
    print record["IdList"]
    """
    if cgi:
        # A caller-supplied URL is never used; it only produces a warning.
        from warnings import warn
        warn("Using a URL other than NCBI's main url for the E-Utilities "
             "is deprecated.", DeprecationWarning)
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    return _open(endpoint, dict(db=db, term=term, **keywds))
144 168
def einfo(cgi=None, **keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo reports field names, index term counts, the last update and
    the available links for each Entrez database.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    All keyword arguments are passed on unchanged.  Supplying cgi is
    deprecated: it triggers a DeprecationWarning and the value is ignored
    in favour of the NCBI E-Utilities URL.

    Returns a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.einfo())
    print record['DbList']
    """
    if cgi:
        # A caller-supplied URL is never used; it only produces a warning.
        from warnings import warn
        warn("Using a URL other than NCBI's main url for the E-Utilities "
             "is deprecated.", DeprecationWarning)
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    return _open(endpoint, dict(keywds))
195
def esummary(cgi=None, **keywds):
    """Retrieve document summaries as a results handle.

    ESummary fetches document summaries for a list of primary IDs or
    from the user's environment.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    All keyword arguments are passed on unchanged.  Supplying cgi is
    deprecated: it triggers a DeprecationWarning and the value is ignored
    in favour of the NCBI E-Utilities URL.

    Returns a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        # A caller-supplied URL is never used; it only produces a warning.
        from warnings import warn
        warn("Using a URL other than NCBI's main url for the E-Utilities "
             "is deprecated.", DeprecationWarning)
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    return _open(endpoint, dict(keywds))
216
def egquery(cgi=None, **keywds):
    """Return Entrez database counts for a global search.

    EGQuery reports, in XML, the number of hits in each Entrez database
    for a single search using Global Query.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    All keyword arguments are passed on unchanged.  Supplying cgi is
    deprecated: it triggers a DeprecationWarning and the value is ignored
    in favour of the NCBI E-Utilities URL.

    Returns a handle to the results in XML format.

    Raises an IOError exception if there's a network error.
    """
    if cgi:
        # A caller-supplied URL is never used; it only produces a warning.
        from warnings import warn
        warn("Using a URL other than NCBI's main url for the E-Utilities "
             "is deprecated.", DeprecationWarning)
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    return _open(endpoint, dict(keywds))
237
def espell(cgi=None, **keywds):
    """Retrieve spelling suggestions, returned in a results handle.

    ESpell returns spelling suggestions for a search term, if any are
    available.

    The parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    All keyword arguments are passed on unchanged.  Supplying cgi is
    deprecated: it triggers a DeprecationWarning and the value is ignored
    in favour of the NCBI E-Utilities URL.

    Returns a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

    from Bio import Entrez
    record = Entrez.read(Entrez.espell(term="biopythooon"))
    print record["Query"]
    print record["CorrectedQuery"]
    """
    if cgi:
        # A caller-supplied URL is never used; it only produces a warning.
        from warnings import warn
        warn("Using a URL other than NCBI's main url for the E-Utilities "
             "is deprecated.", DeprecationWarning)
    endpoint = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    return _open(endpoint, dict(keywds))
264
def read(handle):
    """Parse an XML file from the NCBI Entrez Utilities into Python objects.

    handle is a file-like object holding XML produced by NCBI's Entrez
    Utilities.  It is handed to a DataHandler that is configured with the
    "DTDs" directory inside this package, and the record the handler
    produces is returned.

    The result is a multilevel structure of Python lists and
    dictionaries.  Most XML returned by the Entrez Utilities can be
    parsed, provided its DTD is available; Biopython ships the DTDs for
    the most commonly used utilities.

    Although the structure looks like it is built from plain lists,
    dictionaries, strings and so on, each element is really an instance
    of a class derived from the base type.  That makes it possible to
    keep an element's attributes (if any) in my_element.attributes and
    its tag name in my_element.tag.
    """
    from Parser import DataHandler
    # The DTD files live in a subdirectory next to this package's modules.
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).run(handle)
285
def _entrez_error_message(data):
    """Return the IOError text for a known Entrez error page, else None (PRIVATE).

    data is the text of the first few lines of a response.  The checks
    are made in a fixed order and the first match wins.
    """
    if "500 Proxy Error" in data:
        # Sometimes Entrez returns a Proxy Error instead of results
        return "500 Proxy Error (NCBI busy?)"
    if "502 Proxy Error" in data:
        return "502 Proxy Error (NCBI busy?)"
    if "WWW Error 500 Diagnostic" in data:
        return "WWW Error 500 Diagnostic (NCBI busy?)"
    if data.startswith("Error:"):
        # e.g. 'Error: Your session has expired. Please repeat your search.\n'
        return data.strip()
    if data.startswith("The resource is temporarily unavailable"):
        # This can occur with an invalid query_key
        # Perhaps this should be a ValueError?
        return "The resource is temporarily unavailable"
    if data.startswith("download dataset is empty"):
        # This can occur when the identifier, or the WebEnv and query_key,
        # are omitted.  Perhaps this should be a ValueError?
        return "download dataset is empty"
    if data.startswith("ERROR"):
        # XXX Possible bug here, because it is not known whether this
        # really occurs on the first line.  This needs checking!
        return "ERROR, possibly because id not available?"
    # Should 404, timeouts, etc. be checked for as well?
    return None


def _open(cgi, params=None):
    """Build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.  cgi is the URL of the cgi script to access.
    params is a dictionary with the options to pass to it (None means no
    options).  The dictionary is copied, so the caller's object is never
    modified.  A "tool" entry of "biopython" is added unless the caller
    supplied one.

    This function also enforces the "three second rule" to avoid abusing
    the NCBI servers: it sleeps as needed so that consecutive calls are
    at least three seconds apart.  The time of the last query is kept in
    the function attribute _open.previous.

    Returns the response wrapped in a File.UndoHandle, positioned at the
    start of the data.

    Raises an IOError if the first five lines of the response look like
    one of the known Entrez error pages; the network handle is closed
    before the exception is raised.
    """
    # Copy instead of mutating: the original used a shared mutable default
    # and also wrote the "tool" key into dictionaries owned by the caller.
    if params is None:
        params = {}
    else:
        params = dict(params)

    # NCBI requirement: At least three seconds between queries
    delay = 3.0
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Tell Entrez that we are Biopython
    params.setdefault("tool", "biopython")

    # Open a handle to Entrez.
    options = urllib.urlencode(params, doseq=True)
    handle = urllib.urlopen(cgi + "?" + options)

    # Wrap the handle inside an UndoHandle.
    uhandle = File.UndoHandle(handle)

    # Check for errors in the first 5 lines, then push the lines back (in
    # reverse order) so the caller still sees the complete response.
    lines = [uhandle.readline() for _ in range(5)]
    for line in reversed(lines):
        uhandle.saveline(line)

    message = _entrez_error_message(''.join(lines))
    if message is not None:
        # Do not leak the connection when reporting a failure.
        handle.close()
        raise IOError(message)
    return uhandle


# Time of the most recent query; 0 means no query has been made yet.
_open.previous = 0