Package Bio :: Module PubMed
[hide private]
[frames] | [no frames]

Source Code for Module Bio.PubMed

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with PubMed from the NCBI (DEPRECATED). 
  8   
  9  This module has been deprecated and is likely to be removed in a future 
 10  release of Biopython.  Please use Bio.Entrez instead, which is described 
 11  in the Biopython Tutorial. 
 12   
 13  See also: 
 14  http://www.ncbi.nlm.nih.gov/PubMed/ 
 15   
 16  Online documentation for linking to PubMed is available at: 
 17  http://www.ncbi.nlm.nih.gov/PubMed/linking.html 
 18   
 19   
 20  Classes: 
 21  Dictionary     Access PubMed articles using a dictionary interface. 
 22   
 23  Functions: 
 24  search_for     Search PubMed. 
 25  find_related   Find related articles in PubMed. 
 26  download_many  Download many articles from PubMed in batch mode. 
 27   
 28  """ 
 29   
 30  import warnings 
 31  warnings.warn("Bio.PubMed has been deprecated, and we intend to remove it in" \ 
 32                +" a future release of Biopython.  Please use Bio.Entrez"\ 
 33                +" instead as described in the Tutorial.  If you need help" \ 
 34                +" with this transition, or wish to continue to use this code,"\ 
 35                +" please get in contact via the mailing lists.", \ 
 36                DeprecationWarning) 
 37   
 38  import re 
 39  import sgmllib 
 40   
 41  from Bio import File 
 42  from Bio import Entrez 
 43  from Bio import Medline 
 44   
class Dictionary:
    """Access PubMed using a read-only dictionary interface (DEPRECATED).

    Please use the Bio.Entrez.efetch(...) function instead as described in the
    Biopython Tutorial.
    """
    def __init__(self, parser=None):
        """Dictionary(parser=None)

        Create a new Dictionary to access PubMed.  parser is an optional
        parser (e.g. Medline.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.

        """
        # Stored and consulted by __getitem__; no other state is kept.
        self.parser = parser

    def __len__(self):
        # PubMed is effectively unbounded; there is no sensible length.
        raise NotImplementedError("PubMed contains lots of entries")

    # The mapping is strictly read-only: every mutating dict operation
    # is rejected outright.
    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    # Enumerating all of PubMed is never what the caller actually wants.
    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """S.has_key(id) -> bool"""
        # Implemented on top of __getitem__; any fetch failure surfaces
        # as KeyError, which we translate to a false result.
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        """Return the entry for id, or failobj if it cannot be fetched."""
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """S.__getitem__(id) -> object

        Return the Medline entry.  id is either the Medline Unique ID
        or the Pubmed ID of the article.  Raises a KeyError if there's an
        error.

        """
        try:
            handle = Entrez.efetch(
                db="pubmed", id=id, retmode='text', rettype='medlars')
        # "except E as x" (not the Python 2-only "except E, x") keeps this
        # working on Python 2.6+ and Python 3.
        except IOError as x:
            # raise a KeyError instead of an IOError
            # XXX I really should distinguish between a real IOError and
            # if the id is not in the database.
            raise KeyError(x)
        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
def search_for(search, reldate=None, mindate=None, maxdate=None,
               batchsize=100, callback_fn=None, start_id=0, max_ids=None):
    """Search PubMed, returns a list of IDs (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Search PubMed and return a list of the PMID's that match the
    criteria.  search is the search string used to search the
    database.  reldate is the number of dates prior to the current
    date to restrict the search.  mindate and maxdate are the dates to
    restrict the search, e.g. 2002/01/01.  batchsize specifies the
    number of ids to request at one time (default 100; NCBI allows up
    to 10000).  callback_fn is an optional callback function that will
    be called with each PMID as results are retrieved.  start_id
    specifies the index of the first id to retrieve and max_ids
    specifies the maximum number of id's to retrieve.

    XXX The date parameters don't seem to be working with NCBI's
    script.  Please let me know if you can get it to work.

    """
    params = {
        'db': 'pubmed',
        'term': search,
        'reldate': reldate,
        'mindate': mindate,
        'maxdate': maxdate
    }
    # Note that Bio.Entrez can now cope with None arguments (it ignores them)

    ids = []
    while max_ids is None or len(ids) < max_ids:
        start = start_id + len(ids)
        # Never request more ids than are still needed to reach max_ids.
        # (Renamed from "max", which shadowed the builtin of that name.)
        batch = batchsize
        if max_ids is not None and batch > max_ids - len(ids):
            batch = max_ids - len(ids)

        params['retstart'] = start
        params['retmax'] = batch
        h = Entrez.esearch(**params)
        record = Entrez.read(h)
        idlist = record["IdList"]
        ids.extend(idlist)
        if callback_fn is not None:
            # Call the callback function with each of the new ID's.
            for id in idlist:
                callback_fn(id)
        if len(idlist) < batch:  # no more id's to read
            break
    return ids
163 199 def start_id(self, attributes): 200 self.in_id = 1 201 def end_id(self): 202 self.in_id = 0 203 def start_link(self, attributes): 204 self.in_link = 1 205 def end_link(self): 206 self.in_link = 0 207 _not_pmid_re = re.compile(r'\D') 208 def handle_data(self, data): 209 if not self.in_link or not self.in_id: 210 return 211 # Everything here should be a PMID. Check and make sure 212 # data really is one. A PMID should be a string consisting 213 # of only integers. Should I check to make sure it 214 # meets a certain minimum length? 215 if self._not_pmid_re.search(data): 216 raise ValueError(\ 217 "I expected an ID, but '%s' doesn't look like one." % \ 218 repr(data)) 219 self.ids.append(data) 220 221 parser = ResultParser() 222 if type(pmid) is type([]): 223 pmid = ','.join(pmid) 224 h = Entrez.elink(dbfrom='pubmed', id=pmid) 225 parser.feed(h.read()) 226 return parser.ids 227
def download_many(ids, callback_fn, broken_fn=None,
                  batchsize=500, parser=None):
    """Download multiple PubMed records, no return value (DEPRECATED).

    Please use Bio.Entrez instead as described in the Biopython Tutorial.

    Download many records from PubMed.  ids is a list of either the
    Medline Unique ID or the PubMed ID's of the articles.  Each time a
    record is downloaded, callback_fn is called with the id and the text
    of the record.  broken_fn is an optional function that is called with
    the id of records that were not able to be downloaded.  batchsize is
    the number of records to request each time (1 to 500 inclusive).

    Raises ValueError if batchsize is out of range.

    """
    # parser is an undocumented parameter that allows people to
    # specify an optional parser to handle each record.  This is
    # dangerous because the results may be malformed, and exceptions
    # in the parser may disrupt the whole download process.
    if batchsize > 500 or batchsize < 1:
        raise ValueError("batchsize must be between 1 and 500")
    current_batchsize = batchsize

    # Loop until all the ids are processed.  We want to process as
    # many as possible with each request.  Unfortunately, errors can
    # occur.  Some id may be incorrect, or the server may be
    # unresponsive.  In addition, one broken id out of a list of id's
    # can cause a non-specific error.  Thus, the strategy I'm going to
    # take, is to start by downloading as many as I can.  If the
    # request fails, I'm going to half the number of records I try to
    # get.  If there's only one more record, then I'll report it as
    # broken and move on.  If the request succeeds, I'll double the
    # number of records until I get back up to the batchsize.
    nsuccesses = 0
    while ids:
        if current_batchsize > len(ids):
            current_batchsize = len(ids)

        id_str = ','.join(ids[:current_batchsize])

        try:
            # Query PubMed.  If one or more of the id's are broken,
            # this will raise an IOError.
            handle = Entrez.efetch(
                db="pubmed", id=id_str, retmode='text', rettype='medlars')

            # I'm going to check to make sure PubMed returned the same
            # number of id's as I requested.  If it didn't then I'm going
            # to raise an exception.  This could take a lot of memory if
            # the batchsize is large.
            results = handle.read()
            num_ids = 0
            for x in Medline.Iterator(File.StringHandle(results)):
                num_ids = num_ids + 1
            if num_ids != current_batchsize:
                # Caught by the handler below to trigger the retry logic.
                raise IOError("PubMed returned fewer records than requested")
            handle = File.StringHandle(results)
        except IOError:  # Query did not work.
            if current_batchsize == 1:
                # There was only 1 id in the query.  Report it as
                # broken and move on.
                id = ids.pop(0)
                if broken_fn is not None:
                    broken_fn(id)
            else:
                # I don't know which one is broken.  Try again with
                # fewer id's.  Floor division keeps the batch size an
                # int under Python 3 true division (plain "/" would
                # produce a float and break the list slicing below).
                current_batchsize = current_batchsize // 2
                nsuccesses = 0
            continue
        nsuccesses = nsuccesses + 1

        # Iterate through the results and pass the records to the
        # callback.
        idnum = 0
        for rec in Medline.Iterator(handle, parser):
            callback_fn(ids[idnum], rec)
            idnum = idnum + 1

        ids = ids[current_batchsize:]

        # If I'm not downloading the maximum number of articles,
        # double the number for next time.
        if nsuccesses >= 2 and current_batchsize < batchsize:
            current_batchsize = current_batchsize * 2
            if current_batchsize > batchsize:
                current_batchsize = batchsize
314