1
2
3
4
5
6 """Parser for XML results returned by NCBI's Entrez Utilities. This
7 parser is used by the read() function in Bio.Entrez, and is not intended
8 to be used directly.
9 """
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 import os.path
39 from xml.parsers import expat
40
41
42
43
45
47
49
51
53
54
55
56
68
70
def __init__(self, dtd_dir):
    """Start with empty parsing state and remember the local DTD directory.

    Arguments:
     - dtd_dir - directory onto which external_entity_ref_handler joins
       DTD file names when it looks for a local copy of a DTD.
    """
    # NOTE(review): the ``def`` line is missing from the reviewed listing;
    # the name and signature are inferred from the body - please confirm.
    self.dtd_dir = dtd_dir

    # Working stack used while elements are being processed.
    self.stack = []

    # Element names grouped by kind.  elementDecl fills errors, strings,
    # lists, dictionaries, items and structures; nothing in the code
    # reviewed here writes to integers.
    self.errors, self.integers, self.strings = [], [], []
    self.lists, self.dictionaries, self.items = [], [], []
    # Maps an element name to the child names that may occur repeatedly.
    self.structures = {}
81
def run(self, handle):
    """Parse a complete XML document from handle and return the result.

    A new expat parser is created with parameter-entity parsing switched
    on, and this object's startElement, endElement, characters and
    external_entity_ref_handler methods are installed as its callbacks.

    Arguments:
     - handle - file-like object holding the XML; it is handed directly
       to expat's ParseFile.

    Returns self.object.  That attribute is not assigned in this method;
    it is expected to have been set by the callbacks while parsing.

    Any exception raised by expat or by a callback propagates unchanged.
    """
    parser = expat.ParserCreate()
    parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
    parser.StartElementHandler = self.startElement
    parser.EndElementHandler = self.endElement
    parser.CharacterDataHandler = self.characters
    parser.ExternalEntityRefHandler = self.external_entity_ref_handler
    # external_entity_ref_handler creates its sub-parser from self.parser,
    # so the parser must be published before parsing starts.
    self.parser = parser
    try:
        parser.ParseFile(handle)
    finally:
        # Release the parser even when parsing fails: it holds bound
        # methods of this object, so keeping it would leave a reference
        # cycle and a stale parser behind.
        self.parser = None
    return self.object
93
def parse(self, handle):
    """Parse XML from handle block by block, yielding records as they come.

    The input is read in blocks of 1024 units and fed to a new expat
    parser that uses this object's startElement, endElement, characters
    and external_entity_ref_handler methods as callbacks.  After each
    block, every entry of self.stack[0] except the last one is removed
    from it and yielded.  When the input is exhausted, the entries of
    self.object are yielded and the parser is finalised and released.

    Arguments:
     - handle - object with a read(size) method returning XML data.

    Raises ValueError if self.stack[0] cannot be indexed like a list.
    """
    # NOTE(review): the ``def`` line is missing from the reviewed listing;
    # the name and signature are inferred from the body - please confirm.
    BLOCK = 1024
    self.parser = expat.ParserCreate()
    self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
    self.parser.StartElementHandler = self.startElement
    self.parser.EndElementHandler = self.endElement
    self.parser.CharacterDataHandler = self.characters
    self.parser.ExternalEntityRefHandler = self.external_entity_ref_handler

    while True:
        text = handle.read(BLOCK)
        if not text:
            # End of input: hand out whatever is left, then let expat
            # finish (this is where an incomplete document is reported).
            for record in self.object:
                yield record
            self.parser.Parse("", True)
            self.parser = None
            return

        self.parser.Parse(text, False)

        if not self.stack:
            # Nothing is open on the stack after this block; read on.
            continue

        records = self.stack[0]
        # The last entry is held back - presumably because it may still
        # be under construction; confirm against endElement.
        while len(records) > 1:
            try:
                record = records[0]
            except TypeError:
                # Call form of raise: valid on both Python 2 and 3.
                raise ValueError("The XML file does not represent a list. Please use Entrez.read instead of Entrez.parse")
            records[:] = records[1:]
            yield record
129
def startElement(self, name, attrs):
    """Handle an opening tag: create a container for it and push it.

    The accumulated character data is reset, and then the element name
    decides what happens:

     - names in self.lists, self.dictionaries or self.structures get a
       ListElement, DictionaryElement or StructureElement;
     - names in self.items take their real name and type from the "Name"
       and "Type" attributes (both are removed from attrs) and get a
       container chosen from those two values;
     - names in self.strings, self.errors or self.integers only have
       their attributes remembered in self.attributes; nothing is pushed;
     - any other name pushes an empty string as a placeholder.

    A container that does not compare equal to "" is tagged, given a copy
    of the attributes (if any) and attached to the element on top of the
    stack.  Whatever was created is then pushed onto self.stack.

    Arguments:
     - name  - element name as reported by expat.
     - attrs - mapping of attribute names to values.
    """
    # NOTE(review): the ``def`` line and the indentation are missing from
    # the reviewed listing; the signature follows expat's
    # StartElementHandler(name, attributes) and the nesting below is
    # reconstructed - please confirm.
    self.content = ""
    if name in self.lists:
        element = ListElement()
    elif name in self.dictionaries:
        element = DictionaryElement()
    elif name in self.structures:
        element = StructureElement(self.structures[name])
    elif name in self.items:
        name = str(attrs["Name"])
        del attrs["Name"]
        itemtype = str(attrs["Type"])
        del attrs["Type"]
        if itemtype == "Structure":
            element = DictionaryElement()
        elif name in ("ArticleIds", "History"):
            element = StructureElement(["pubmed", "medline"])
        elif itemtype == "List":
            element = ListElement()
        else:
            element = StringElement()
        element.itemname = name
        element.itemtype = itemtype
    elif (name in self.strings
          or name in self.errors
          or name in self.integers):
        # Scalar elements: keep the attributes for later and do not push
        # anything onto the stack.
        self.attributes = attrs
        return
    else:
        # Unknown element: push a bare "" placeholder.
        element = ""
    # Deliberately an equality test against "": anything that compares
    # equal to the empty string is neither tagged nor attached here.
    if element != "":
        element.tag = name
        if attrs:
            element.attributes = dict(attrs)
        if self.stack:
            current = self.stack[-1]
            try:
                current.append(element)
            except AttributeError:
                # The parent has no append(); store under the name.
                current[name] = element
    self.stack.append(element)
170
211
def characters(self, content):
    """Append a chunk of character data to the text collected so far.

    Arguments:
     - content - text to add to the end of self.content.
    """
    # NOTE(review): the ``def`` line is missing from the reviewed listing;
    # the signature follows expat's CharacterDataHandler(data) callback,
    # with the parameter named after its use in the body - please confirm.
    self.content = self.content + content
214
def elementDecl(self, name, model):
    """Classify an element from its DTD declaration.

    This callback is called for each element declaration
        <!ELEMENT name (...)>
    encountered in a DTD.  The element name is recorded in exactly one
    of self.errors, self.items, self.strings, self.lists,
    self.dictionaries or self.structures.

    Arguments:
     - name  - the declared element name.
     - model - expat content model: a tuple
       (type, quantifier, name, children).
    """
    # NOTE(review): the ``def`` line is missing from the reviewed listing;
    # the signature follows expat's ElementDeclHandler(name, model)
    # callback - please confirm.
    groupings = (expat.model.XML_CTYPE_SEQ, expat.model.XML_CTYPE_CHOICE)
    at_most_once = (expat.model.XML_CQUANT_NONE, expat.model.XML_CQUANT_OPT)
    repeatable = (expat.model.XML_CQUANT_PLUS, expat.model.XML_CQUANT_REP)

    if name.upper() == "ERROR":
        self.errors.append(name)
        return

    # Special case: an element called Item declared as (#PCDATA | Item)*.
    item_model = (expat.model.XML_CTYPE_MIXED,
                  expat.model.XML_CQUANT_REP,
                  None,
                  ((expat.model.XML_CTYPE_NAME,
                    expat.model.XML_CQUANT_NONE,
                    'Item',
                    ()),))
    if name == 'Item' and model == item_model:
        self.items.append(name)
        return

    # Strip groups that wrap a single child and are not repeatable.
    while (model[0] in groupings
           and model[1] in at_most_once
           and len(model[3]) == 1):
        model = model[3][0]

    # Mixed or empty content is treated as a string.
    if model[0] in (expat.model.XML_CTYPE_MIXED,
                    expat.model.XML_CTYPE_EMPTY):
        self.strings.append(name)
        return

    # A repeatable group is a list.
    if model[0] in groupings and model[1] in repeatable:
        self.lists.append(name)
        return

    # Otherwise sort the child names into those that occur at most once
    # and those that may repeat.
    single = []
    multiple = []

    def count(submodel):
        """Recursively sort the children of submodel (ERROR is skipped)."""
        quantifier, child_name, children = submodel[1:]
        if child_name is None:
            if quantifier in repeatable:
                # Every direct child of a repeatable group may repeat.
                for child in children:
                    multiple.append(child[2])
            else:
                for child in children:
                    count(child)
        elif child_name.upper() != "ERROR":
            if quantifier in at_most_once:
                single.append(child_name)
            elif quantifier in repeatable:
                multiple.append(child_name)

    count(model)
    if not single and len(multiple) == 1:
        # A single child name, and it may repeat: a list.
        self.lists.append(name)
    elif not multiple:
        # No child may repeat: a dictionary.
        self.dictionaries.append(name)
    else:
        # Otherwise a structure; remember the repeatable child names.
        self.structures[name] = multiple
292
def external_entity_ref_handler(self, context, base, systemId, publicId):
    """Load an external DTD from the local DTD directory and parse it.

    Only the file-name part of systemId is used; it is looked up in
    self.dtd_dir.  The file is parsed by a sub-parser created from
    self.parser, with self.elementDecl installed as its element
    declaration handler.  Nothing is downloaded: if the file is not
    available locally, an explanatory RuntimeError is raised.

    Arguments:
     - context  - opaque value passed on to ExternalEntityParserCreate.
     - base     - unused.
     - systemId - system identifier (typically a URL) of the DTD.
     - publicId - unused.

    Returns 1.
    """
    # NOTE(review): the ``def`` line is missing from the reviewed listing;
    # the signature follows expat's ExternalEntityRefHandler(context,
    # base, systemId, publicId) callback - please confirm.
    filename = os.path.basename(systemId)
    path = os.path.join(self.dtd_dir, filename)
    try:
        # Binary mode: expat does its own decoding, and ParseFile needs
        # bytes from read() on Python 3.
        handle = open(path, "rb")
    except IOError:
        message = """\
Unable to load DTD file %s.

Bio.Entrez uses NCBI's DTD files to parse XML files returned by NCBI Entrez.
Though most of NCBI's DTD files are included in the Biopython distribution,
sometimes you may find that a particular DTD file is missing. In such a
case, you can download the DTD file from NCBI and install it manually.

Usually, you can find missing DTD files at either
http://www.ncbi.nlm.nih.gov/dtd/
or
http://eutils.ncbi.nlm.nih.gov/entrez/query/DTD/
If you cannot find %s there, you may also try to search
for it with a search engine such as Google.

Please save %s in the directory
%s
in order for Bio.Entrez to find it.
Alternatively, you can save %s in the directory
Bio/Entrez/DTDs in the Biopython distribution, and reinstall Biopython.

Please also inform the Biopython developers by sending an email to
biopython-dev@biopython.org to inform us about this missing DTD, so that we
can include it with the next release of Biopython.
""" % (filename, filename, filename, self.dtd_dir, filename)
        raise RuntimeError(message)

    try:
        parser = self.parser.ExternalEntityParserCreate(context)
        parser.ElementDeclHandler = self.elementDecl
        parser.ParseFile(handle)
    finally:
        # Close the DTD file whether or not parsing succeeded.
        handle.close()
    return 1
336