Package Bio :: Package Entrez :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module Bio.Entrez.Parser

  1  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parser for XML results returned by NCBI's Entrez Utilities. This 
  7  parser is used by the read() function in Bio.Entrez, and is not intended 
  8  to be used directly. 
  9  """ 
 10   
 11  # The question is how to represent an XML file as Python objects. Some 
 12  # XML files returned by NCBI look like lists, others look like dictionaries, 
 13  # and others look like a mix of lists and dictionaries. 
 14  # 
 15  # My approach is to classify each possible element in the XML as a plain 
 16  # string, an integer, a list, a dictionary, or a structure. The latter is a 
 17  # dictionary where the same key can occur multiple times; in Python, it is 
 18  # represented as a dictionary where that key occurs once, pointing to a list 
 19  # of values found in the XML file. 
 20  # 
 21  # The parser then goes through the XML and creates the appropriate Python 
 22  # object for each element. The different levels encountered in the XML are 
 23  # preserved on the Python side. So a subelement of a subelement of an element 
 24  # is a value in a dictionary that is stored in a list which is a value in 
 25  # some other dictionary (or a value in a list which itself belongs to a list 
 26  # which is a value in a dictionary, and so on). Attributes encountered in  
 27  # the XML are stored as a dictionary in a member .attributes of each element, 
 28  # and the tag name is saved in a member .tag. 
 29  # 
 30  # To decide which kind of Python object corresponds to each element in the 
 31  # XML, the parser analyzes the DTD referred to at the top of (almost) every 
 32  # XML file returned by the Entrez Utilities. This is preferred over a hand- 
 33  # written solution, since the number of DTDs is rather large and their 
 34  # contents may change over time. About half the code in this parser deals 
 35  # with parsing the DTD, and the other half with the XML itself. 
 36   
 37   
 38  import os.path 
 39  from xml.parsers import expat 
 40   
 41  # The following five classes are used to add a member .attributes to integers, 
 42  # strings, unicode strings, lists, and dictionaries, respectively. 
 43   
class IntegerElement(int):
    """An int that can also carry parser metadata.

    Being a subclass (rather than a plain int), instances accept extra
    attributes such as .tag and .attributes set by the parser.
    """
45
class StringElement(str):
    """A str that can also carry parser metadata.

    Being a subclass (rather than a plain str), instances accept extra
    attributes such as .tag and .attributes set by the parser.
    """
47
# The builtin ``unicode`` only exists on Python 2.  Fall back to ``str`` (which
# is the unicode text type on Python 3) so that this definition does not raise
# NameError at import time there.  On Python 2 the base class is unchanged.
try:
    _unicode_base = unicode
except NameError:
    _unicode_base = str


class UnicodeElement(_unicode_base):
    """A unicode string that can also carry parser metadata.

    Being a subclass (rather than the plain builtin type), instances accept
    extra attributes such as .tag and .attributes set by the parser.
    """
49
class ListElement(list):
    """A list that can also carry parser metadata.

    Being a subclass (rather than a plain list), instances accept extra
    attributes such as .tag and .attributes set by the parser.
    """
51
class DictionaryElement(dict):
    """A dict that can also carry parser metadata.

    Being a subclass (rather than a plain dict), instances accept extra
    attributes such as .tag and .attributes set by the parser.
    """
# A StructureElement is like a dictionary, but some of its keys can have
# multiple values associated with them. These values are stored in a list
# under each key.
class StructureElement(dict):
    """Dictionary in which selected keys accumulate their values in a list.

    The keys passed to the constructor are pre-populated with empty lists;
    assigning to one of them appends to that list instead of replacing it.
    Assigning to any other key behaves like a normal dictionary assignment.
    """

    def __init__(self, keys):
        """Create the structure; each key in *keys* starts as an empty list."""
        super(StructureElement, self).__init__()
        for list_key in keys:
            # Bypass our own __setitem__, which would try to append.
            super(StructureElement, self).__setitem__(list_key, [])
        self.listkeys = keys

    def __setitem__(self, key, value):
        """Append *value* for multi-valued keys, otherwise store it."""
        if key not in self.listkeys:
            super(StructureElement, self).__setitem__(key, value)
            return
        self[key].append(value)
68
class DataHandler:
    """Expat callback handler that converts Entrez XML into Python objects.

    While the DTD referenced by the XML is parsed, elementDecl() sorts every
    declared element name into one of the classification containers below.
    While the XML itself is parsed, startElement()/endElement() consult those
    containers to decide which Python object to create for each element.

    Attributes:
     - stack         objects for the container elements currently open
     - errors        names of elements that signal an error
     - integers      names of elements converted to IntegerElement
     - strings       names of elements converted to String/UnicodeElement
     - lists         names of elements converted to ListElement
     - dictionaries  names of elements converted to DictionaryElement
     - structures    maps an element name to its multi-valued keys
     - items         names handled as ESummary-style <Item> elements
     - dtd_dir       directory in which DTD files are looked up
     - parser        the active expat parser, or None
     - content       character data collected for the current element
     - attributes    attributes of the leaf element being read, or None
     - object        the most recently completed container element (only
                     defined once such an element has been closed)
    """

    def __init__(self, dtd_dir):
        """Create a handler that looks for DTD files in dtd_dir."""
        self.stack = []
        self.errors = []
        self.integers = []
        self.strings = []
        self.lists = []
        self.dictionaries = []
        self.structures = {}
        self.items = []
        self.dtd_dir = dtd_dir
        self.parser = None
        self.content = ""
        # Always defined, so that endElement can test it even when no leaf
        # element has set it yet.
        self.attributes = None

    def _create_parser(self):
        """Create an expat parser wired to this handler; store it in .parser."""
        parser = expat.ParserCreate()
        # Needed for expat to call external_entity_ref_handler for the DTD.
        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        parser.StartElementHandler = self.startElement
        parser.EndElementHandler = self.endElement
        parser.CharacterDataHandler = self.characters
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        self.parser = parser

    def run(self, handle):
        """Set up the parser and let it parse the XML results.

        Returns the object built for the top-level XML element. The parser
        is released even if parsing raises an exception.
        """
        self._create_parser()
        try:
            self.parser.ParseFile(handle)
        finally:
            self.parser = None
        return self.object

    def parse(self, handle):
        """Parse the XML in blocks, yielding top-level records as completed.

        This is a generator. The XML must represent a list; each finished
        entry of that list is yielded as soon as the next entry has started.

        Raises ValueError if the top-level element turns out to be a
        dictionary or structure rather than a list, and expat.ExpatError if
        the XML is empty, truncated or otherwise malformed.
        """
        BLOCK = 1024
        not_a_list = ("The XML file does not represent a list. "
                      "Please use Entrez.read instead of Entrez.parse")
        self._create_parser()

        while True:
            # Read in another block of the file...
            text = handle.read(BLOCK)
            if not text:
                # We have reached the end of the XML file. Finalize the
                # parser first, so that incomplete XML is reported as an
                # error before any left-over records are handed out.
                self.parser.Parse("", True)
                self.parser = None
                if isinstance(self.object, dict):
                    raise ValueError(not_a_list)
                for record in self.object:
                    yield record
                return

            self.parser.Parse(text, False)

            if not self.stack:
                # Haven't read enough from the XML file yet
                continue

            records = self.stack[0]
            if isinstance(records, dict):
                # Indexing a dictionary with 0 would raise a KeyError, so
                # check the type explicitly to give a helpful message.
                raise ValueError(not_a_list)
            # The last record may still be under construction, so only the
            # records before it are finished.
            while len(records) > 1:
                yield records.pop(0)

    def startElement(self, name, attrs):
        """Expat callback for an opening tag.

        Container elements (lists, dictionaries, structures and non-leaf
        Items) get a new object that is attached to its parent and pushed on
        the stack. For string, integer and error elements only the attributes
        are remembered; their value is created in endElement.
        """
        self.content = ""
        if name in self.lists:
            element = ListElement()
        elif name in self.dictionaries:
            element = DictionaryElement()
        elif name in self.structures:
            element = StructureElement(self.structures[name])
        elif name in self.items:  # Only appears in ESummary
            name = str(attrs["Name"])  # convert from Unicode
            del attrs["Name"]
            itemtype = str(attrs["Type"])  # convert from Unicode
            del attrs["Type"]
            if itemtype == "Structure":
                element = DictionaryElement()
            elif name in ("ArticleIds", "History"):
                element = StructureElement(["pubmed", "medline"])
            elif itemtype == "List":
                element = ListElement()
            else:
                # Placeholder carrying the item name and type; the actual
                # value is created in endElement.
                element = StringElement()
            element.itemname = name
            element.itemtype = itemtype
        elif (name in self.strings or name in self.errors
              or name in self.integers):
            self.attributes = attrs
            return
        else:
            # Element not found in DTD; this will not be stored in the record
            element = ""
        # An empty StringElement also compares equal to "", so leaf Items
        # skip this part as well; they are attached to their parent in
        # endElement instead.
        if element != "":
            element.tag = name
            if attrs:
                element.attributes = dict(attrs)
            if self.stack:
                current = self.stack[-1]
                try:
                    current.append(element)
                except AttributeError:
                    # The parent is dictionary-like rather than a list.
                    current[name] = element
        self.stack.append(element)

    def _string_element(self, value):
        """Return value as a StringElement, or UnicodeElement if needed."""
        # Convert Unicode strings to plain strings if possible
        try:
            return StringElement(value)
        except UnicodeEncodeError:
            return UnicodeElement(value)

    def endElement(self, name):
        """Expat callback for a closing tag.

        Converts the collected character data of leaf elements into the
        appropriate Python object and stores it in the enclosing container.
        Container elements are popped off the stack and remembered in
        self.object.

        Raises RuntimeError if an error element with non-empty text is found.
        """
        value = self.content
        if name in self.errors:
            if value == "":
                return
            raise RuntimeError(value)
        elif name in self.integers:
            value = IntegerElement(value)
        elif name in self.strings:
            value = self._string_element(value)
        elif name in self.items:
            self.object = self.stack.pop()
            if self.object.itemtype in ("List", "Structure"):
                # Already attached to its parent in startElement.
                return
            elif self.object.itemtype == "Integer":
                value = IntegerElement(value)
            else:
                value = self._string_element(value)
            name = self.object.itemname
        else:
            self.object = self.stack.pop()
            return
        value.tag = name
        if self.attributes:
            value.attributes = dict(self.attributes)
            # Reset rather than delete, so that the attribute always exists
            # the next time a leaf element is closed.
            self.attributes = None
        current = self.stack[-1]
        try:
            current.append(value)
        except AttributeError:
            # The parent is dictionary-like rather than a list.
            current[name] = value

    def characters(self, content):
        """Expat callback; accumulate character data for the current element."""
        self.content += content

    def elementDecl(self, name, model):
        """This callback function is called for each element declaration:
        <!ELEMENT       name          (...)>
        encountered in a DTD. The purpose of this function is to determine
        whether this element should be regarded as a string, integer, list
        dictionary, structure, or error.

        The model argument is the content model tuple
        (type, quantifier, name, children) as provided by expat.
        """
        ctype = expat.model
        optional_or_once = (ctype.XML_CQUANT_NONE, ctype.XML_CQUANT_OPT)
        repeated = (ctype.XML_CQUANT_PLUS, ctype.XML_CQUANT_REP)
        groups = (ctype.XML_CTYPE_SEQ, ctype.XML_CTYPE_CHOICE)

        if name.upper() == "ERROR":
            self.errors.append(name)
            return
        if name == 'Item' and model == (ctype.XML_CTYPE_MIXED,
                                        ctype.XML_CQUANT_REP,
                                        None, ((ctype.XML_CTYPE_NAME,
                                                ctype.XML_CQUANT_NONE,
                                                'Item',
                                                ()
                                                ),
                                               )
                                        ):
            # Special case. As far as I can tell, this only occurs in the
            # eSummary DTD.
            self.items.append(name)
            return
        # First, remove ignorable parentheses around declarations
        while (model[0] in groups
               and model[1] in optional_or_once
               and len(model[3]) == 1):
            model = model[3][0]
        # PCDATA declarations correspond to strings
        if model[0] in (ctype.XML_CTYPE_MIXED, ctype.XML_CTYPE_EMPTY):
            self.strings.append(name)
            return
        # List-type elements
        if model[0] in groups and model[1] in repeated:
            self.lists.append(name)
            return
        # This is the tricky case. Check which keys can occur multiple
        # times. If only one key is possible, and it can occur multiple
        # times, then this is a list. If more than one key is possible,
        # but none of them can occur multiple times, then this is a
        # dictionary. Otherwise, this is a structure.
        # In 'single' and 'multiple', we keep track which keys can occur
        # only once, and which can occur multiple times.
        single = []
        multiple = []

        # The 'count' function is called recursively to make sure all the
        # children in this model are counted. Error keys are ignored;
        # they raise an exception in Python.
        def count(node):
            quantifier, child_name, children = node[1:]
            if child_name is None:
                # An unnamed node is a group of children.
                if quantifier in repeated:
                    # The group as a whole repeats, so each of its direct
                    # children can occur multiple times.
                    for child in children:
                        multiple.append(child[2])
                else:
                    for child in children:
                        count(child)
            elif child_name.upper() != "ERROR":
                if quantifier in optional_or_once:
                    single.append(child_name)
                elif quantifier in repeated:
                    multiple.append(child_name)

        count(model)
        if not single and len(multiple) == 1:
            self.lists.append(name)
        elif not multiple:
            self.dictionaries.append(name)
        else:
            self.structures[name] = multiple

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """The purpose of this function is to load the DTD locally, instead
        of downloading it from the URL specified in the XML. Using the local
        DTD results in much faster parsing. Only the file name part of
        systemId is used; the file is looked up in self.dtd_dir. If the DTD
        is not found there, a RuntimeError explaining how to install it is
        raised. If new DTDs appear, putting them in Bio/Entrez/DTDs will
        allow the parser to see them.

        Returns 1 to tell expat that the entity was handled."""
        filename = os.path.basename(systemId)
        path = os.path.join(self.dtd_dir, filename)
        try:
            # Binary mode: expat does its own decoding of the DTD.
            handle = open(path, "rb")
        except IOError:
            message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. In such a
case, you can download the DTD file from NCBI and install it manually.

Usually, you can find missing DTD files at either
    http://www.ncbi.nlm.nih.gov/dtd/
or
    http://eutils.ncbi.nlm.nih.gov/entrez/query/DTD/
If you cannot find %s there, you may also try to search
for it with a search engine such as Google.

Please save %s in the directory
%s
in order for Bio.Entrez to find it.
Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers by sending an email to
biopython-dev@biopython.org to inform us about this missing DTD, so that we
can include it with the next release of Biopython.
""" % (filename, filename, filename, self.dtd_dir, filename)
            raise RuntimeError(message)

        try:
            parser = self.parser.ExternalEntityParserCreate(context)
            parser.ElementDeclHandler = self.elementDecl
            parser.ParseFile(handle)
        finally:
            # Do not leak the file handle, even if the DTD is malformed.
            handle.close()
        return 1
336