OLD | NEW |
| (Empty) |
1 # -*- test-case-name: twisted.words.test.test_domish -*- | |
2 # | |
3 # Copyright (c) 2001-2007 Twisted Matrix Laboratories. | |
4 # See LICENSE for details. | |
5 | |
6 """ | |
7 DOM-like XML processing support. | |
8 | |
9 This module provides support for parsing XML into DOM-like object structures | |
10 and serializing such structures to an XML string representation, optimized | |
11 for use in streaming XML applications. | |
12 """ | |
13 | |
14 import types | |
15 | |
16 from zope.interface import implements, Interface, Attribute | |
17 | |
18 def _splitPrefix(name): | |
19 """ Internal method for splitting a prefixed Element name into its | |
20 respective parts """ | |
21 ntok = name.split(":", 1) | |
22 if len(ntok) == 2: | |
23 return ntok | |
24 else: | |
25 return (None, ntok[0]) | |
26 | |
27 # Global map of prefixes that always get injected | |
28 # into the serializers prefix map (note, that doesn't | |
29 # mean they're always _USED_) | |
30 G_PREFIXES = { "http://www.w3.org/XML/1998/namespace":"xml" } | |
31 | |
32 class _ListSerializer: | |
33 """ Internal class which serializes an Element tree into a buffer """ | |
34 def __init__(self, prefixes=None, prefixesInScope=None): | |
35 self.writelist = [] | |
36 self.prefixes = {} | |
37 if prefixes: | |
38 self.prefixes.update(prefixes) | |
39 self.prefixes.update(G_PREFIXES) | |
40 self.prefixStack = [G_PREFIXES.values()] + (prefixesInScope or []) | |
41 self.prefixCounter = 0 | |
42 | |
43 def getValue(self): | |
44 return u"".join(self.writelist) | |
45 | |
46 def getPrefix(self, uri): | |
47 if not self.prefixes.has_key(uri): | |
48 self.prefixes[uri] = "xn%d" % (self.prefixCounter) | |
49 self.prefixCounter = self.prefixCounter + 1 | |
50 return self.prefixes[uri] | |
51 | |
52 def prefixInScope(self, prefix): | |
53 stack = self.prefixStack | |
54 for i in range(-1, (len(self.prefixStack)+1) * -1, -1): | |
55 if prefix in stack[i]: | |
56 return True | |
57 return False | |
58 | |
59 def serialize(self, elem, closeElement=1, defaultUri=''): | |
60 # Optimization shortcuts | |
61 write = self.writelist.append | |
62 | |
63 # Shortcut, check to see if elem is actually a chunk o' serialized XML | |
64 if isinstance(elem, SerializedXML): | |
65 write(elem) | |
66 return | |
67 | |
68 # Shortcut, check to see if elem is actually a string (aka Cdata) | |
69 if isinstance(elem, types.StringTypes): | |
70 write(escapeToXml(elem)) | |
71 return | |
72 | |
73 # Further optimizations | |
74 parent = elem.parent | |
75 name = elem.name | |
76 uri = elem.uri | |
77 defaultUri, currentDefaultUri = elem.defaultUri, defaultUri | |
78 | |
79 for p, u in elem.localPrefixes.iteritems(): | |
80 self.prefixes[u] = p | |
81 self.prefixStack.append(elem.localPrefixes.keys()) | |
82 | |
83 # Inherit the default namespace | |
84 if defaultUri is None: | |
85 defaultUri = currentDefaultUri | |
86 | |
87 if uri is None: | |
88 uri = defaultUri | |
89 | |
90 prefix = None | |
91 if uri != defaultUri or uri in self.prefixes: | |
92 prefix = self.getPrefix(uri) | |
93 inScope = self.prefixInScope(prefix) | |
94 | |
95 # Create the starttag | |
96 | |
97 if not prefix: | |
98 write("<%s" % (name)) | |
99 else: | |
100 write("<%s:%s" % (prefix, name)) | |
101 | |
102 if not inScope: | |
103 write(" xmlns:%s='%s'" % (prefix, uri)) | |
104 self.prefixStack[-1].append(prefix) | |
105 inScope = True | |
106 | |
107 if defaultUri != currentDefaultUri and \ | |
108 (uri != defaultUri or not prefix or not inScope): | |
109 write(" xmlns='%s'" % (defaultUri)) | |
110 | |
111 for p, u in elem.localPrefixes.iteritems(): | |
112 write(" xmlns:%s='%s'" % (p, u)) | |
113 | |
114 # Serialize attributes | |
115 for k,v in elem.attributes.items(): | |
116 # If the attribute name is a tuple, it's a qualified attribute | |
117 if isinstance(k, types.TupleType): | |
118 attr_uri, attr_name = k | |
119 attr_prefix = self.getPrefix(attr_uri) | |
120 | |
121 if not self.prefixInScope(attr_prefix): | |
122 write(" xmlns:%s='%s'" % (attr_prefix, attr_uri)) | |
123 self.prefixStack[-1].append(attr_prefix) | |
124 | |
125 write(" %s:%s='%s'" % (attr_prefix, attr_name, | |
126 escapeToXml(v, 1))) | |
127 else: | |
128 write((" %s='%s'" % ( k, escapeToXml(v, 1)))) | |
129 | |
130 # Shortcut out if this is only going to return | |
131 # the element (i.e. no children) | |
132 if closeElement == 0: | |
133 write(">") | |
134 return | |
135 | |
136 # Serialize children | |
137 if len(elem.children) > 0: | |
138 write(">") | |
139 for c in elem.children: | |
140 self.serialize(c, defaultUri=defaultUri) | |
141 # Add closing tag | |
142 if not prefix: | |
143 write("</%s>" % (name)) | |
144 else: | |
145 write("</%s:%s>" % (prefix, name)) | |
146 else: | |
147 write("/>") | |
148 | |
149 self.prefixStack.pop() | |
150 | |
151 | |
152 SerializerClass = _ListSerializer | |
153 | |
def escapeToXml(text, isattrib = 0):
    """ Escape text to proper XML form, per section 2.3 in the XML
    specification.

    @type text: L{str}
    @param text: Text to escape

    @type isattrib: L{bool}
    @param isattrib: Triggers escaping of characters necessary for use as
                     attribute values

    @return: C{text} with C{&}, C{<} and C{>} replaced by entity references;
             when C{isattrib} is true, both quote characters as well.
    """
    # The ampersand must be handled first, otherwise the ampersands
    # introduced by the other replacements would be escaped again.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    if isattrib:
        text = text.replace("'", "&apos;")
        text = text.replace("\"", "&quot;")
    return text
171 | |
def unescapeFromXml(text):
    """ Replace the five predefined XML entity references by their
    characters.

    @param text: text possibly containing C{&lt;}, C{&gt;}, C{&apos;},
                 C{&quot;} or C{&amp;}.
    @return: the unescaped text.
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&apos;", "'")
    text = text.replace("&quot;", "\"")
    # The ampersand goes last so that an escaped reference such as
    # "&amp;lt;" yields the literal text "&lt;" rather than "<".
    text = text.replace("&amp;", "&")
    return text
179 | |
def generateOnlyInterface(list, int):
    """ Yield the items of C{list} that are provided by the interface C{int}.

    @param list: iterable of candidate objects.
    @param int: object whose C{providedBy(item)} decides whether an item is
                yielded.
    """
    provides = int.providedBy
    for item in list:
        if not provides(item):
            continue
        yield item
186 | |
def generateElementsQNamed(list, name, uri):
    """ Yield the items of C{list} that provide L{IElement} and whose local
    name and namespace URI both match.

    @param list: iterable of child nodes.
    @param name: local name to match.
    @param uri: namespace URI to match.
    """
    for node in list:
        if not IElement.providedBy(node):
            continue
        if node.name == name and node.uri == uri:
            yield node
192 | |
def generateElementsNamed(list, name):
    """ Yield the items of C{list} that provide L{IElement} and have the
    given local name, whatever their namespace URI.

    @param list: iterable of child nodes.
    @param name: local name to match.
    """
    for node in list:
        if not IElement.providedBy(node):
            continue
        if node.name == name:
            yield node
199 | |
200 | |
class SerializedXML(unicode):
    """ Marker type for a unicode string that holds pre-serialized XML.

    It adds nothing to C{unicode}; it only lets such chunks be told apart
    from ordinary character data with C{isinstance}.
    """
204 | |
205 | |
class Namespace:
    """ Convenience object for building qualified names in one namespace.

    Both attribute access and subscription produce a C{(uri, name)} tuple,
    so C{ns.foo} and C{ns['foo']} are equivalent.
    """
    def __init__(self, uri):
        # Namespace URI that is paired with every requested name.
        self._uri = uri

    def __getattr__(self, n):
        # Only called for attributes not found by normal lookup.
        return self._uri, n

    def __getitem__(self, n):
        return self._uri, n
214 | |
class IElement(Interface):
    """
    Interface to XML element nodes.

    See L{Element} for a detailed example of its general use.

    Warning: this Interface is not yet complete!
    """

    uri = Attribute(""" Element's namespace URI """)
    name = Attribute(""" Element's local name """)
    defaultUri = Attribute(""" Default namespace URI of child elements """)
    attributes = Attribute(""" Dictionary of element attributes """)
    children = Attribute(""" List of child nodes """)
    parent = Attribute(""" Reference to element's parent element """)
    localPrefixes = Attribute(""" Dictionary of local prefixes """)

    def toXml(prefixes=None, closeElement=1, defaultUri='',
              prefixesInScope=None):
        """ Serialize this object to a (partial) XML document.

        @param prefixes: dictionary mapping namespace URIs to suggested
                         prefix names.
        @type prefixes: L{dict}
        @param closeElement: flag that determines whether the closing tag
                             of the element is included in the serialized
                             string. C{0} generates only the element's
                             start tag; C{1} yields a complete
                             serialization.
        @type closeElement: L{int}
        @param defaultUri: initial default namespace URI. This is most
                           useful for partial rendering, where the logical
                           parent element (whose start tag was already
                           serialized) declares a default namespace that
                           should be inherited.
        @type defaultUri: L{str}
        @param prefixesInScope: list of prefixes that are assumed to be
                                declared by ancestors.
        @type prefixesInScope: L{list}
        @return: (partial) serialized XML
        @rtype: L{unicode}
        """

    def addElement(name, defaultUri = None, content = None):
        """ Create an element and add it as a child.

        The new element is appended to this element's children and gets
        this element as its parent.

        @param name: element name. Either a L{unicode} object holding the
                     local name, or a tuple of (uri, local_name) for a
                     fully qualified name. In the former case, the
                     namespace URI is inherited from this element.
        @type name: L{unicode} or L{tuple} of (L{unicode}, L{unicode})
        @param defaultUri: default namespace URI for child elements. If
                           C{None}, this is inherited from this element.
        @type defaultUri: L{unicode}
        @param content: text contained by the new element.
        @type content: L{unicode}
        @return: the created element
        @rtype: object providing L{IElement}
        """

    def addChild(node):
        """ Add a node as a child of this element.

        The C{node} is appended to the list of children of this element,
        and has this element set as its parent when C{node} provides
        L{IElement}.

        @param node: the child node.
        @type node: L{unicode} or object implementing L{IElement}
        """
288 | |
class Element(object):
    """ Represents an XML element node.

    An Element contains a series of attributes (name/value pairs), content
    (character data), and other child Element objects. When building a document
    with markup (such as HTML or XML), use this object as the starting point.

    Element objects fully support XML Namespaces. The fully qualified name of
    the XML Element it represents is stored in the C{uri} and C{name}
    attributes, where C{uri} holds the namespace URI. There is also a default
    namespace, for child elements. This is stored in the C{defaultUri}
    attribute. Note that C{''} means the empty namespace.

    Serialization of Elements through C{toXml()} will use these attributes
    for generating proper serialized XML. When both C{uri} and C{defaultUri}
    are not None in the Element and all of its descendents, serialization
    proceeds as expected:

    >>> from twisted.words.xish import domish
    >>> root = domish.Element(('myns', 'root'))
    >>> root.addElement('child', content='test')
    <twisted.words.xish.domish.Element object at 0x83002ac>
    >>> root.toXml()
    u"<root xmlns='myns'><child>test</child></root>"

    For partial serialization, needed for streaming XML, a special value for
    namespace URIs can be used: C{None}.

    Using C{None} as the value for C{uri} means: this element is in whatever
    namespace inherited by the closest logical ancestor when the complete XML
    document has been serialized. The serialized start tag will have a
    non-prefixed name, and no xmlns declaration will be generated.

    Similarly, C{None} for C{defaultUri} means: the default namespace for my
    child elements is inherited from the logical ancestors of this element,
    when the complete XML document has been serialized.

    To illustrate, an example from a Jabber stream. Assume the start tag of the
    root element of the stream has already been serialized, along with several
    complete child elements, and sent off, looking like this::

      <stream:stream xmlns:stream='http://etherx.jabber.org/streams'
                     xmlns='jabber:client' to='example.com'>
        ...

    Now suppose we want to send a complete element represented by an
    object C{message} created like:

    >>> message = domish.Element((None, 'message'))
    >>> message['to'] = 'user@example.com'
    >>> message.addElement('body', content='Hi!')
    <twisted.words.xish.domish.Element object at 0x8276e8c>
    >>> message.toXml()
    u"<message to='user@example.com'><body>Hi!</body></message>"

    As you can see, this XML snippet has no xmlns declaration. When sent
    off, it inherits the C{jabber:client} namespace from the root element.
    Note that this renders the same as using C{''} instead of C{None}:

    >>> presence = domish.Element(('', 'presence'))
    >>> presence.toXml()
    u"<presence/>"

    However, if this object has a parent defined, the difference becomes
    clear:

    >>> child = message.addElement(('http://example.com/', 'envelope'))
    >>> child.addChild(presence)
    <twisted.words.xish.domish.Element object at 0x8276fac>
    >>> message.toXml()
    u"<message to='user@example.com'><body>Hi!</body><envelope xmlns='http://example.com/'><presence xmlns=''/></envelope></message>"

    As you can see, the <presence/> element is now in the empty namespace, not
    in the default namespace of the parent or the streams'.

    @type uri: L{unicode} or None
    @ivar uri: URI of this Element's name

    @type name: L{unicode}
    @ivar name: Name of this Element

    @type defaultUri: L{unicode} or None
    @ivar defaultUri: Default namespace URI for child elements

    @type children: L{list}
    @ivar children: List of child Elements and content

    @type parent: L{Element}
    @ivar parent: Reference to the parent Element, if any.

    @type attributes: L{dict}
    @ivar attributes: Dictionary of attributes associated with this Element.

    @type localPrefixes: L{dict}
    @ivar localPrefixes: Dictionary of namespace declarations on this
                         element. The key is the prefix to bind the
                         namespace uri to.
    """

    implements(IElement)

    # Class-wide counter backing addUniqueId().
    _idCounter = 0

    def __init__(self, qname, defaultUri=None, attribs=None,
                 localPrefixes=None):
        """
        @param qname: Tuple of (uri, name)
        @param defaultUri: The default URI of the element; defaults to the URI
                           specified in L{qname}, unless that URI is bound to
                           one of the C{localPrefixes}
        @param attribs: Dictionary of attributes
        @param localPrefixes: Dictionary of namespace declarations on this
                              element. The key is the prefix to bind the
                              namespace uri to.
        """
        self.localPrefixes = localPrefixes or {}
        self.uri, self.name = qname
        if defaultUri is None and \
           self.uri not in self.localPrefixes.values():
            self.defaultUri = self.uri
        else:
            self.defaultUri = defaultUri
        self.attributes = attribs or {}
        self.children = []
        self.parent = None

    def __getattr__(self, key):
        """ Return the first child element named C{key}.

        Only invoked for attributes not found by normal lookup. Returns
        C{None} when no child matches, except for names starting with an
        underscore, which raise C{AttributeError}.
        """
        # Check child list for first Element with a name matching the key
        for n in self.children:
            if IElement.providedBy(n) and n.name == key:
                return n

        # Tweak the behaviour so that it's more friendly about not
        # finding elements -- we need to document this somewhere :)
        if key.startswith('_'):
            raise AttributeError(key)
        else:
            return None

    def __getitem__(self, key):
        """ Return the value of attribute C{key}; C{KeyError} if absent. """
        return self.attributes[self._dqa(key)]

    def __delitem__(self, key):
        """ Remove attribute C{key}; C{KeyError} if absent. """
        del self.attributes[self._dqa(key)]

    def __setitem__(self, key, value):
        """ Set attribute C{key} to C{value}. """
        self.attributes[self._dqa(key)] = value

    def __str__(self):
        """ Retrieve the first CData (content) node
        """
        for n in self.children:
            if isinstance(n, types.StringTypes):
                return n
        return ""

    def _dqa(self, attr):
        """ Dequalify an attribute key as needed.

        A C{(uri, name)} tuple with an empty or C{None} uri is reduced to
        the bare name; any other key is returned unchanged.
        """
        if isinstance(attr, tuple) and not attr[0]:
            return attr[1]
        else:
            return attr

    def getAttribute(self, attribname, default = None):
        """ Retrieve the value of attribname, if it exists.

        @param attribname: attribute name, or C{(uri, name)} tuple.
        @param default: value returned when the attribute is not set.
        """
        # Dequalify the key like every other attribute accessor does, so
        # that ('', 'id') and 'id' address the same attribute.
        return self.attributes.get(self._dqa(attribname), default)

    def hasAttribute(self, attrib):
        """ Determine if the specified attribute exists """
        return self._dqa(attrib) in self.attributes

    def compareAttribute(self, attrib, value):
        """ Safely compare the value of an attribute against a provided value.

        C{None}-safe.
        """
        return self.attributes.get(self._dqa(attrib), None) == value

    def swapAttributeValues(self, left, right):
        """ Swap the values of two attributes.

        Both keys are used as given (not dequalified) and must exist.
        """
        d = self.attributes
        d[left], d[right] = d[right], d[left]

    def addChild(self, node):
        """ Add a child to this Element.

        C{node.parent} is set to this element when C{node} provides
        L{IElement}.

        @return: the added node.
        """
        if IElement.providedBy(node):
            node.parent = self
        self.children.append(node)
        return node

    def addContent(self, text):
        """ Add some text data to this Element.

        The text is merged into the last child when that is a string,
        otherwise it is appended as a new child.

        @return: the resulting last child.
        """
        c = self.children
        if len(c) > 0 and isinstance(c[-1], types.StringTypes):
            c[-1] = c[-1] + text
        else:
            c.append(text)
        return c[-1]

    def addElement(self, name, defaultUri = None, content = None):
        """ Create an element and add it as a child.

        @param name: local name, or C{(uri, name)} tuple. For a plain
                     name the new element's URI is C{defaultUri} when
                     given, else this element's C{defaultUri}.
        @param defaultUri: default namespace URI of the new element. When
                           C{None}, it is the URI from a qualified C{name},
                           or this element's C{defaultUri} for a plain name.
        @param content: optional text; added as first child when true.
        @return: the created L{Element}.
        """
        if isinstance(name, tuple):
            if defaultUri is None:
                defaultUri = name[0]
            result = Element(name, defaultUri)
        else:
            if defaultUri is None:
                defaultUri = self.defaultUri
            result = Element((defaultUri, name), defaultUri)

        self.children.append(result)
        result.parent = self

        if content:
            result.children.append(content)

        return result

    def addRawXml(self, rawxmlstring):
        """ Add a pre-serialized chunk o' XML as a child of this Element. """
        self.children.append(SerializedXML(rawxmlstring))

    def addUniqueId(self):
        """ Add a unique (across a given Python session) id attribute to this
            Element.
        """
        self.attributes["id"] = "H_%d" % Element._idCounter
        Element._idCounter = Element._idCounter + 1

    def elements(self):
        """ Iterate across all children of this Element that are Elements. """
        return generateOnlyInterface(self.children, IElement)

    def toXml(self, prefixes=None, closeElement=1, defaultUri='',
              prefixesInScope=None):
        """ Serialize this Element and all children to a string.

        @param prefixes: dictionary mapping namespace URIs to suggested
                         prefixes.
        @param closeElement: C{0} to generate only the start tag.
        @param defaultUri: initial default namespace URI.
        @param prefixesInScope: list of prefixes assumed to be declared by
                                ancestors.
        @return: the value produced by a fresh C{SerializerClass} instance.
        """
        s = SerializerClass(prefixes=prefixes, prefixesInScope=prefixesInScope)
        s.serialize(self, closeElement=closeElement, defaultUri=defaultUri)
        return s.getValue()

    def firstChildElement(self):
        """ Return the first child providing L{IElement}, or C{None}. """
        for c in self.children:
            if IElement.providedBy(c):
                return c
        return None
534 | |
535 | |
class ParserError(Exception):
    """ Raised by the element streams when the XML input cannot be parsed. """
539 | |
def elementStream():
    """ Preferred method to construct an ElementStream

    An L{ExpatElementStream} is returned when it can be created. If creating
    it raises C{ImportError}, a C{SuxElementStream} is returned instead.

    @raise Exception: when neither parser is available.
    """
    try:
        return ExpatElementStream()
    except ImportError:
        # Expat is unavailable; fall through to the Sux based parser.
        pass

    if SuxElementStream is None:
        raise Exception("No parsers available :(")
    return SuxElementStream()
553 | |
try:
    from twisted.web import sux
except ImportError:
    # Only a missing sux module marks the parser as unavailable; any other
    # failure while importing it is a real error and must surface.
    SuxElementStream = None
else:
    class SuxElementStream(sux.XMLParser):
        """ Element stream built on the sux parser.

        C{DocumentStartEvent}, C{ElementEvent} and C{DocumentEndEvent} start
        out as C{None} and are called unconditionally while parsing, so they
        must be set to callables before L{parse} is used.

        @ivar currElem: element currently being built, or C{None}.
        @ivar rootElem: root element of the document, C{None} before the
                        document starts and after it has ended.
        @ivar defaultNsStack: default namespace URI of every open element.
        @ivar prefixStack: prefix -> URI declarations of every open element.
        """
        def __init__(self):
            self.connectionMade()
            self.DocumentStartEvent = None
            self.ElementEvent = None
            self.DocumentEndEvent = None
            self.currElem = None
            self.rootElem = None
            self.documentStarted = False
            self.defaultNsStack = []
            self.prefixStack = []

        def parse(self, buffer):
            """ Feed C{buffer} to the parser.

            @raise ParserError: when sux reports a parse error.
            """
            try:
                self.dataReceived(buffer)
            except sux.ParseError as e:
                raise ParserError(str(e))


        def findUri(self, prefix):
            """ Return the URI bound to C{prefix} by the innermost open
            element declaring it, or C{None} when it is not declared.
            """
            # Walk prefix stack backwards, looking for the uri
            # matching the specified prefix
            for scope in self.prefixStack[::-1]:
                if prefix in scope:
                    return scope[prefix]
            return None

        def gotTagStart(self, name, attributes):
            """ Handle a start tag: resolve namespaces, build the
            L{Element} and attach it to the tree under construction.

            Namespace declarations are removed from C{attributes}.
            """
            defaultUri = None
            localPrefixes = {}
            attribs = {}
            uri = None

            # Pass 1 - Identify namespace decls
            # (iterate over a copy: declarations are deleted on the way)
            for k, v in list(attributes.items()):
                if k.startswith("xmlns"):
                    x, p = _splitPrefix(k)
                    if x is None: # I.e. default declaration
                        defaultUri = v
                    else:
                        localPrefixes[p] = v
                    del attributes[k]

            # Push namespace decls onto prefix stack
            self.prefixStack.append(localPrefixes)

            # Determine default namespace for this element; if there
            # is one
            if defaultUri is None:
                if len(self.defaultNsStack) > 0:
                    defaultUri = self.defaultNsStack[-1]
                else:
                    defaultUri = ''

            # Fix up name
            prefix, name = _splitPrefix(name)
            if prefix is None: # This element is in the default namespace
                uri = defaultUri
            else:
                # Find the URI for the prefix
                uri = self.findUri(prefix)

            # Pass 2 - Fix up and escape attributes
            # NOTE(review): only namespace-qualified attribute values are
            # passed through unescapeFromXml here; confirm whether
            # unqualified values should be unescaped as well.
            for k, v in attributes.items():
                p, n = _splitPrefix(k)
                if p is None:
                    attribs[n] = v
                else:
                    attribs[(self.findUri(p)), n] = unescapeFromXml(v)

            # Construct the actual Element object
            e = Element((uri, name), defaultUri, attribs, localPrefixes)

            # Save current default namespace
            self.defaultNsStack.append(defaultUri)

            # Document already started
            if self.documentStarted:
                # Starting a new packet
                if self.currElem is None:
                    self.currElem = e
                # Adding to existing element
                else:
                    self.currElem = self.currElem.addChild(e)
            # New document
            else:
                self.rootElem = e
                self.documentStarted = True
                self.DocumentStartEvent(e)

        def gotText(self, data):
            """ Add text to the current element, if there is one. """
            if self.currElem is not None:
                self.currElem.addContent(data)

        def gotCData(self, data):
            """ Add CDATA content to the current element, if there is one. """
            if self.currElem is not None:
                self.currElem.addContent(data)

        def gotComment(self, data):
            # Ignore comments for the moment
            pass

        # The predefined entities, keyed by reference name.
        entities = { "amp" : "&",
                     "lt"  : "<",
                     "gt"  : ">",
                     "apos": "'",
                     "quot": "\"" }

        def gotEntityReference(self, entityRef):
            """ Add the character for a known entity reference to the
            current element. Unknown references are ignored.
            """
            # Like text, a reference seen while no element is being built
            # (directly inside the root element) has nowhere to go.
            if self.currElem is None:
                return

            # If this is an entity we know about, add it as content
            # to the current element
            if entityRef in SuxElementStream.entities:
                self.currElem.addContent(SuxElementStream.entities[entityRef])

        def gotTagEnd(self, name):
            """ Handle an end tag: verify that it matches the open element
            and fire C{ElementEvent} or C{DocumentEndEvent} as appropriate.

            @raise ParserError: when the tag does not match the open
                                element, or the document already ended.
            """
            # Ensure the document hasn't already ended
            if self.rootElem is None:
                # XXX: Write more legible explanation
                raise ParserError("Element closed after end of document.")

            # Fix up name
            prefix, name = _splitPrefix(name)
            if prefix is None:
                uri = self.defaultNsStack[-1]
            else:
                uri = self.findUri(prefix)

            # End of document
            if self.currElem is None:
                # Ensure element name and uri matches
                if self.rootElem.name != name or self.rootElem.uri != uri:
                    raise ParserError("Mismatched root elements")
                self.DocumentEndEvent()
                self.rootElem = None

            # Other elements
            else:
                # Ensure the tag being closed matches the name of the current
                # element
                if self.currElem.name != name or self.currElem.uri != uri:
                    # XXX: Write more legible explanation
                    raise ParserError("Malformed element close")

                # Pop prefix and default NS stack
                self.prefixStack.pop()
                self.defaultNsStack.pop()

                # An element without a parent is a direct child of the
                # root: hand it over as a complete element.
                if self.currElem.parent is None:
                    self.currElem.parent = self.rootElem
                    self.ElementEvent(self.currElem)
                    self.currElem = None

                # Anything else is just some element wrapping up
                else:
                    self.currElem = self.currElem.parent
717 | |
718 | |
class ExpatElementStream:
    """ Element stream built on the pyexpat parser.

    C{DocumentStartEvent}, C{ElementEvent} and C{DocumentEndEvent} start out
    as C{None} and are called unconditionally while parsing, so they must be
    set to callables before L{parse} is used.
    """
    def __init__(self):
        # Imported here so that a missing pyexpat surfaces as ImportError
        # on construction instead of at module import time.
        import pyexpat
        self.error = pyexpat.error

        self.DocumentStartEvent = None
        self.ElementEvent = None
        self.DocumentEndEvent = None

        # Parser state
        self.currElem = None
        self.defaultNsStack = ['']
        self.documentStarted = 0
        self.localPrefixes = {}

        # A space separates namespace URI and local name in the names
        # that expat hands to the handlers.
        parser = pyexpat.ParserCreate("UTF-8", " ")
        parser.StartElementHandler = self._onStartElement
        parser.EndElementHandler = self._onEndElement
        parser.CharacterDataHandler = self._onCdata
        parser.StartNamespaceDeclHandler = self._onStartNamespace
        parser.EndNamespaceDeclHandler = self._onEndNamespace
        self.parser = parser

    def parse(self, buffer):
        """ Feed C{buffer} to expat; its errors become L{ParserError}. """
        try:
            self.parser.Parse(buffer)
        except self.error as e:
            raise ParserError(str(e))

    def _onStartElement(self, name, attrs):
        # Generate a qname from the provided name; a name without
        # namespace part is placed in the empty namespace
        qname = name.split(" ")
        if len(qname) == 1:
            qname = ('', name)

        # Re-key namespaced attributes as (uri, name) tuples
        for key, value in list(attrs.items()):
            if " " in key:
                parts = key.split(" ")
                del attrs[key]
                attrs[(parts[0], parts[1])] = value

        # Construct the new element; the prefixes collected since the
        # previous start tag belong to it
        e = Element(qname, self.defaultNsStack[-1], attrs, self.localPrefixes)
        self.localPrefixes = {}

        # New document
        if self.documentStarted != 1:
            self.documentStarted = 1
            self.DocumentStartEvent(e)
            return

        # Document already started: hook the element into the tree
        parent = self.currElem
        if parent is not None:
            parent.children.append(e)
            e.parent = parent
        self.currElem = e

    def _onEndElement(self, _):
        current = self.currElem

        # No current element: the document itself is ending
        if current is None:
            self.DocumentEndEvent()
            return

        # No parent: a top-level element is complete
        if current.parent is None:
            self.ElementEvent(current)
            self.currElem = None
            return

        # Anything else is just some element in the current
        # packet wrapping up
        self.currElem = current.parent

    def _onCdata(self, data):
        if self.currElem is not None:
            self.currElem.addContent(data)

    def _onStartNamespace(self, prefix, uri):
        # A declaration without prefix sets the default namespace;
        # prefixed ones are kept for the element about to start
        if prefix is None:
            self.defaultNsStack.append(uri)
        else:
            self.localPrefixes[prefix] = uri

    def _onEndNamespace(self, prefix):
        # Only default namespace declarations were pushed on the stack
        if prefix is None:
            self.defaultNsStack.pop()
804 | |
805 ## class FileParser(ElementStream): | |
806 ## def __init__(self): | |
807 ## ElementStream.__init__(self) | |
808 ## self.DocumentStartEvent = self.docStart | |
809 ## self.ElementEvent = self.elem | |
810 ## self.DocumentEndEvent = self.docEnd | |
811 ## self.done = 0 | |
812 | |
813 ## def docStart(self, elem): | |
814 ## self.document = elem | |
815 | |
816 ## def elem(self, elem): | |
817 ## self.document.addChild(elem) | |
818 | |
819 ## def docEnd(self): | |
820 ## self.done = 1 | |
821 | |
822 ## def parse(self, filename): | |
823 ## for l in open(filename).readlines(): | |
824 ## self.parser.Parse(l) | |
825 ## assert self.done == 1 | |
826 ## return self.document | |
827 | |
828 ## def parseFile(filename): | |
829 ## return FileParser().parse(filename) | |
830 | |
831 | |
OLD | NEW |