| OLD | NEW |
| (Empty) |
| 1 # -*- test-case-name: twisted.words.test.test_domish -*- | |
| 2 # | |
| 3 # Copyright (c) 2001-2007 Twisted Matrix Laboratories. | |
| 4 # See LICENSE for details. | |
| 5 | |
| 6 """ | |
| 7 DOM-like XML processing support. | |
| 8 | |
| 9 This module provides support for parsing XML into DOM-like object structures | |
| 10 and serializing such structures to an XML string representation, optimized | |
| 11 for use in streaming XML applications. | |
| 12 """ | |
| 13 | |
import sys
import types

from zope.interface import implements, Interface, Attribute
| 17 | |
| 18 def _splitPrefix(name): | |
| 19 """ Internal method for splitting a prefixed Element name into its | |
| 20 respective parts """ | |
| 21 ntok = name.split(":", 1) | |
| 22 if len(ntok) == 2: | |
| 23 return ntok | |
| 24 else: | |
| 25 return (None, ntok[0]) | |
| 26 | |
# Global map of namespace URI -> prefix entries that always get injected
# into the serializer's prefix map (note that this doesn't
# mean they're always _USED_)
G_PREFIXES = { "http://www.w3.org/XML/1998/namespace":"xml" }
| 31 | |
| 32 class _ListSerializer: | |
| 33 """ Internal class which serializes an Element tree into a buffer """ | |
| 34 def __init__(self, prefixes=None, prefixesInScope=None): | |
| 35 self.writelist = [] | |
| 36 self.prefixes = {} | |
| 37 if prefixes: | |
| 38 self.prefixes.update(prefixes) | |
| 39 self.prefixes.update(G_PREFIXES) | |
| 40 self.prefixStack = [G_PREFIXES.values()] + (prefixesInScope or []) | |
| 41 self.prefixCounter = 0 | |
| 42 | |
| 43 def getValue(self): | |
| 44 return u"".join(self.writelist) | |
| 45 | |
| 46 def getPrefix(self, uri): | |
| 47 if not self.prefixes.has_key(uri): | |
| 48 self.prefixes[uri] = "xn%d" % (self.prefixCounter) | |
| 49 self.prefixCounter = self.prefixCounter + 1 | |
| 50 return self.prefixes[uri] | |
| 51 | |
| 52 def prefixInScope(self, prefix): | |
| 53 stack = self.prefixStack | |
| 54 for i in range(-1, (len(self.prefixStack)+1) * -1, -1): | |
| 55 if prefix in stack[i]: | |
| 56 return True | |
| 57 return False | |
| 58 | |
| 59 def serialize(self, elem, closeElement=1, defaultUri=''): | |
| 60 # Optimization shortcuts | |
| 61 write = self.writelist.append | |
| 62 | |
| 63 # Shortcut, check to see if elem is actually a chunk o' serialized XML | |
| 64 if isinstance(elem, SerializedXML): | |
| 65 write(elem) | |
| 66 return | |
| 67 | |
| 68 # Shortcut, check to see if elem is actually a string (aka Cdata) | |
| 69 if isinstance(elem, types.StringTypes): | |
| 70 write(escapeToXml(elem)) | |
| 71 return | |
| 72 | |
| 73 # Further optimizations | |
| 74 parent = elem.parent | |
| 75 name = elem.name | |
| 76 uri = elem.uri | |
| 77 defaultUri, currentDefaultUri = elem.defaultUri, defaultUri | |
| 78 | |
| 79 for p, u in elem.localPrefixes.iteritems(): | |
| 80 self.prefixes[u] = p | |
| 81 self.prefixStack.append(elem.localPrefixes.keys()) | |
| 82 | |
| 83 # Inherit the default namespace | |
| 84 if defaultUri is None: | |
| 85 defaultUri = currentDefaultUri | |
| 86 | |
| 87 if uri is None: | |
| 88 uri = defaultUri | |
| 89 | |
| 90 prefix = None | |
| 91 if uri != defaultUri or uri in self.prefixes: | |
| 92 prefix = self.getPrefix(uri) | |
| 93 inScope = self.prefixInScope(prefix) | |
| 94 | |
| 95 # Create the starttag | |
| 96 | |
| 97 if not prefix: | |
| 98 write("<%s" % (name)) | |
| 99 else: | |
| 100 write("<%s:%s" % (prefix, name)) | |
| 101 | |
| 102 if not inScope: | |
| 103 write(" xmlns:%s='%s'" % (prefix, uri)) | |
| 104 self.prefixStack[-1].append(prefix) | |
| 105 inScope = True | |
| 106 | |
| 107 if defaultUri != currentDefaultUri and \ | |
| 108 (uri != defaultUri or not prefix or not inScope): | |
| 109 write(" xmlns='%s'" % (defaultUri)) | |
| 110 | |
| 111 for p, u in elem.localPrefixes.iteritems(): | |
| 112 write(" xmlns:%s='%s'" % (p, u)) | |
| 113 | |
| 114 # Serialize attributes | |
| 115 for k,v in elem.attributes.items(): | |
| 116 # If the attribute name is a tuple, it's a qualified attribute | |
| 117 if isinstance(k, types.TupleType): | |
| 118 attr_uri, attr_name = k | |
| 119 attr_prefix = self.getPrefix(attr_uri) | |
| 120 | |
| 121 if not self.prefixInScope(attr_prefix): | |
| 122 write(" xmlns:%s='%s'" % (attr_prefix, attr_uri)) | |
| 123 self.prefixStack[-1].append(attr_prefix) | |
| 124 | |
| 125 write(" %s:%s='%s'" % (attr_prefix, attr_name, | |
| 126 escapeToXml(v, 1))) | |
| 127 else: | |
| 128 write((" %s='%s'" % ( k, escapeToXml(v, 1)))) | |
| 129 | |
| 130 # Shortcut out if this is only going to return | |
| 131 # the element (i.e. no children) | |
| 132 if closeElement == 0: | |
| 133 write(">") | |
| 134 return | |
| 135 | |
| 136 # Serialize children | |
| 137 if len(elem.children) > 0: | |
| 138 write(">") | |
| 139 for c in elem.children: | |
| 140 self.serialize(c, defaultUri=defaultUri) | |
| 141 # Add closing tag | |
| 142 if not prefix: | |
| 143 write("</%s>" % (name)) | |
| 144 else: | |
| 145 write("</%s:%s>" % (prefix, name)) | |
| 146 else: | |
| 147 write("/>") | |
| 148 | |
| 149 self.prefixStack.pop() | |
| 150 | |
| 151 | |
# Serializer implementation instantiated by Element.toXml()
SerializerClass = _ListSerializer
| 153 | |
def escapeToXml(text, isattrib = 0):
    """ Escape text to proper XML form, per section 2.3 in the XML specification.

    @type text: L{str}
    @param text: Text to escape

    @type isattrib: L{bool}
    @param isattrib: Triggers escaping of characters necessary for use as
                     attribute values

    @return: C{text} with C{&}, C{<} and C{>} replaced by entity references
             and, when C{isattrib} is 1, both quote characters as well.
    """
    # The ampersand goes first, otherwise the ampersands introduced by the
    # other replacements would be escaped a second time.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    if isattrib == 1:
        text = text.replace("'", "&apos;")
        text = text.replace("\"", "&quot;")
    return text
| 171 | |
def unescapeFromXml(text):
    """ Replace the five predefined XML entity references by their characters.

    @param text: Text containing C{&lt;}, C{&gt;}, C{&apos;}, C{&quot;}
                 and/or C{&amp;} references.
    @return: the text with those references replaced.
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&apos;", "'")
    text = text.replace("&quot;", "\"")
    # The ampersand goes last, so that an escaped reference such as
    # "&amp;lt;" yields "&lt;" instead of being unescaped twice.
    text = text.replace("&amp;", "&")
    return text
| 179 | |
def generateOnlyInterface(list, int):
    """ Yield, in order, the items of C{list} for which
    C{int.providedBy(item)} is true.
    """
    for candidate in list:
        if not int.providedBy(candidate):
            continue
        yield candidate
| 186 | |
def generateElementsQNamed(list, name, uri):
    """ Yield the items of C{list} that provide L{IElement} and whose
    C{name} and C{uri} both equal the given values.
    """
    for candidate in list:
        if not IElement.providedBy(candidate):
            continue
        if candidate.name == name and candidate.uri == uri:
            yield candidate
| 192 | |
def generateElementsNamed(list, name):
    """ Yield the items of C{list} that provide L{IElement} and whose
    C{name} equals the given name, whatever their URI.
    """
    for candidate in list:
        if not IElement.providedBy(candidate):
            continue
        if candidate.name == name:
            yield candidate
| 199 | |
| 200 | |
| 201 class SerializedXML(unicode): | |
| 202 """ Marker class for pre-serialized XML in the DOM. """ | |
| 203 pass | |
| 204 | |
| 205 | |
class Namespace:
    """
    Convenience object for building qualified names.

    Both attribute access and item access on an instance return the tuple
    C{(uri, name)} for the namespace URI given at construction.
    """

    def __init__(self, uri):
        self._uri = uri

    def __getitem__(self, n):
        qualified = (self._uri, n)
        return qualified

    def __getattr__(self, n):
        # Only reached for names that normal lookup does not find
        qualified = (self._uri, n)
        return qualified
| 214 | |
class IElement(Interface):
    """
    Interface to XML element nodes.

    See L{Element} for a detailed example of its general use.

    Warning: this Interface is not yet complete!
    """

    uri = Attribute(""" Element's namespace URI """)
    name = Attribute(""" Element's local name """)
    defaultUri = Attribute(""" Default namespace URI of child elements """)
    attributes = Attribute(""" Dictionary of element attributes """)
    children = Attribute(""" List of child nodes """)
    parent = Attribute(""" Reference to element's parent element """)
    localPrefixes = Attribute(""" Dictionary of local prefixes """)

    def toXml(prefixes=None, closeElement=1, defaultUri='',
              prefixesInScope=None):
        """ Serializes object to a (partial) XML document

        @param prefixes: dictionary that maps namespace URIs to suggested
                         prefix names.
        @type prefixes: L{dict}
        @param closeElement: flag that determines whether to include the
                             closing tag of the element in the serialized
                             string. A value of C{0} only generates the
                             element's start tag. A value of C{1} yields a
                             complete serialization.
        @type closeElement: L{int}
        @param defaultUri: Initial default namespace URI. This is most useful
                           for partial rendering, where the logical parent
                           element (of which the starttag was already
                           serialized) declares a default namespace that should
                           be inherited.
        @type defaultUri: L{str}
        @param prefixesInScope: list of prefixes that are assumed to be
                                declared by ancestors.
        @type prefixesInScope: L{list}
        @return: (partial) serialized XML
        @rtype: L{unicode}
        """

    def addElement(name, defaultUri = None, content = None):
        """ Create an element and add as child.

        The new element is added to this element as a child, and will have
        this element as its parent.

        @param name: element name. This can be either a L{unicode} object that
                     contains the local name, or a tuple of (uri, local_name)
                     for a fully qualified name. In the former case,
                     the namespace URI is inherited from this element.
        @type name: L{unicode} or L{tuple} of (L{unicode}, L{unicode})
        @param defaultUri: default namespace URI for child elements. If
                           C{None}, this is inherited from this element.
        @type defaultUri: L{unicode}
        @param content: text contained by the new element.
        @type content: L{unicode}
        @return: the created element
        @rtype: object providing L{IElement}
        """

    def addChild(node):
        """ Adds a node as child of this element.

        The C{node} will be added to the list of children of this element,
        and will have this element set as its parent when C{node} provides
        L{IElement}.

        @param node: the child node.
        @type node: L{unicode} or object implementing L{IElement}
        """
| 288 | |
class Element(object):
    """ Represents an XML element node.

    An Element contains a series of attributes (name/value pairs), content
    (character data), and other child Element objects. When building a document
    with markup (such as HTML or XML), use this object as the starting point.

    Element objects fully support XML Namespaces. The fully qualified name of
    the XML Element it represents is stored in the C{uri} and C{name}
    attributes, where C{uri} holds the namespace URI. There is also a default
    namespace, for child elements. This is stored in the C{defaultUri}
    attribute. Note that C{''} means the empty namespace.

    Serialization of Elements through C{toXml()} will use these attributes
    for generating proper serialized XML. When both C{uri} and C{defaultUri}
    are not None in the Element and all of its descendents, serialization
    proceeds as expected:

    >>> from twisted.words.xish import domish
    >>> root = domish.Element(('myns', 'root'))
    >>> root.addElement('child', content='test')
    <twisted.words.xish.domish.Element object at 0x83002ac>
    >>> root.toXml()
    u"<root xmlns='myns'><child>test</child></root>"

    For partial serialization, needed for streaming XML, a special value for
    namespace URIs can be used: C{None}.

    Using C{None} as the value for C{uri} means: this element is in whatever
    namespace inherited by the closest logical ancestor when the complete XML
    document has been serialized. The serialized start tag will have a
    non-prefixed name, and no xmlns declaration will be generated.

    Similarly, C{None} for C{defaultUri} means: the default namespace for my
    child elements is inherited from the logical ancestors of this element,
    when the complete XML document has been serialized.

    To illustrate, an example from a Jabber stream. Assume the start tag of the
    root element of the stream has already been serialized, along with several
    complete child elements, and sent off, looking like this::

      <stream:stream xmlns:stream='http://etherx.jabber.org/streams'
                     xmlns='jabber:client' to='example.com'>
        ...

    Now suppose we want to send a complete element represented by an
    object C{message} created like:

    >>> message = domish.Element((None, 'message'))
    >>> message['to'] = 'user@example.com'
    >>> message.addElement('body', content='Hi!')
    <twisted.words.xish.domish.Element object at 0x8276e8c>
    >>> message.toXml()
    u"<message to='user@example.com'><body>Hi!</body></message>"

    As you can see, this XML snippet has no xmlns declaration. When sent
    off, it inherits the C{jabber:client} namespace from the root element.
    Note that this renders the same as using C{''} instead of C{None}:

    >>> presence = domish.Element(('', 'presence'))
    >>> presence.toXml()
    u"<presence/>"

    However, if this object has a parent defined, the difference becomes
    clear:

    >>> child = message.addElement(('http://example.com/', 'envelope'))
    >>> child.addChild(presence)
    <twisted.words.xish.domish.Element object at 0x8276fac>
    >>> message.toXml()
    u"<message to='user@example.com'><body>Hi!</body><envelope xmlns='http://example.com/'><presence xmlns=''/></envelope></message>"

    As you can see, the <presence/> element is now in the empty namespace, not
    in the default namespace of the parent or the streams'.

    @type uri: L{unicode} or None
    @ivar uri: URI of this Element's name

    @type name: L{unicode}
    @ivar name: Name of this Element

    @type defaultUri: L{unicode} or None
    @ivar defaultUri: URI this Element exists within

    @type children: L{list}
    @ivar children: List of child Elements and content

    @type parent: L{Element}
    @ivar parent: Reference to the parent Element, if any.

    @type attributes: L{dict}
    @ivar attributes: Dictionary of attributes associated with this Element.

    @type localPrefixes: L{dict}
    @ivar localPrefixes: Dictionary of namespace declarations on this
                         element. The key is the prefix to bind the
                         namespace uri to.
    """

    implements(IElement)

    _idCounter = 0

    def __init__(self, qname, defaultUri=None, attribs=None,
                       localPrefixes=None):
        """
        @param qname: Tuple of (uri, name)
        @param defaultUri: The default URI of the element; defaults to the URI
                           specified in L{qname}
        @param attribs: Dictionary of attributes
        @param localPrefixes: Dictionary of namespace declarations on this
                              element. The key is the prefix to bind the
                              namespace uri to.
        """
        self.localPrefixes = localPrefixes or {}
        self.uri, self.name = qname
        # The element's own URI only becomes the default namespace when it
        # is not already bound to one of the local prefixes.
        if defaultUri is None and \
           self.uri not in self.localPrefixes.values():
            self.defaultUri = self.uri
        else:
            self.defaultUri = defaultUri
        self.attributes = attribs or {}
        self.children = []
        self.parent = None

    def __getattr__(self, key):
        """
        Return the first child element named C{key}.

        When there is no such child, C{None} is returned, except for keys
        starting with an underscore, which raise C{AttributeError}.
        """
        # Check child list for first Element with a name matching the key
        for n in self.children:
            if IElement.providedBy(n) and n.name == key:
                return n

        # Tweak the behaviour so that it's more friendly about not
        # finding elements -- we need to document this somewhere :)
        if key.startswith('_'):
            raise AttributeError(key)
        else:
            return None

    def __getitem__(self, key):
        """ Return the value of attribute C{key}; C{KeyError} if absent. """
        return self.attributes[self._dqa(key)]

    def __delitem__(self, key):
        """ Remove attribute C{key}; C{KeyError} if absent. """
        del self.attributes[self._dqa(key)]

    def __setitem__(self, key, value):
        """ Set attribute C{key} to C{value}. """
        self.attributes[self._dqa(key)] = value

    def __str__(self):
        """ Retrieve the first CData (content) node
        """
        for n in self.children:
            if isinstance(n, types.StringTypes):
                return n
        return ""

    def _dqa(self, attr):
        """ Dequalify an attribute key as needed

        A tuple key whose first item (the URI) is empty or C{None} is
        reduced to its second item; any other key is returned unchanged.
        """
        if isinstance(attr, tuple) and not attr[0]:
            return attr[1]
        else:
            return attr

    def getAttribute(self, attribname, default = None):
        """ Retrieve the value of attribname, if it exists

        The key is dequalified like in the other attribute accessors;
        C{default} is returned when the attribute is absent.
        """
        return self.attributes.get(self._dqa(attribname), default)

    def hasAttribute(self, attrib):
        """ Determine if the specified attribute exists """
        return self._dqa(attrib) in self.attributes

    def compareAttribute(self, attrib, value):
        """ Safely compare the value of an attribute against a provided value.

        C{None}-safe.
        """
        return self.attributes.get(self._dqa(attrib), None) == value

    def swapAttributeValues(self, left, right):
        """ Swap the values of two attributes.

        Both keys are dequalified first. C{KeyError} is raised when either
        attribute is missing.
        """
        d = self.attributes
        left = self._dqa(left)
        right = self._dqa(right)
        d[left], d[right] = d[right], d[left]

    def addChild(self, node):
        """ Add a child to this Element.

        C{node.parent} is set to this element when C{node} provides
        L{IElement}.

        @return: the node that was added.
        """
        if IElement.providedBy(node):
            node.parent = self
        self.children.append(node)
        return self.children[-1]

    def addContent(self, text):
        """ Add some text data to this Element.

        The text is appended to the last child when that is a string,
        otherwise it becomes a new child.

        @return: the resulting last child.
        """
        c = self.children
        if len(c) > 0 and isinstance(c[-1], types.StringTypes):
            c[-1] = c[-1] + text
        else:
            c.append(text)
        return c[-1]

    def addElement(self, name, defaultUri = None, content = None):
        """ Create an element and add it as child.

        @param name: local name, or a tuple of (uri, local_name).
        @param defaultUri: default namespace URI for the new element. When
                           C{None}, the URI in a tuple C{name} is used,
                           or else this element's C{defaultUri}.
        @param content: optional text to add to the new element; skipped
                        when empty.
        @return: the created element
        """
        if isinstance(name, tuple):
            if defaultUri is None:
                defaultUri = name[0]
            result = Element(name, defaultUri)
        else:
            if defaultUri is None:
                defaultUri = self.defaultUri
            result = Element((defaultUri, name), defaultUri)

        self.children.append(result)
        result.parent = self

        if content:
            result.children.append(content)

        return result

    def addRawXml(self, rawxmlstring):
        """ Add a pre-serialized chunk o' XML as a child of this Element. """
        self.children.append(SerializedXML(rawxmlstring))

    def addUniqueId(self):
        """ Add a unique (across a given Python session) id attribute to this
            Element.
        """
        self.attributes["id"] = "H_%d" % Element._idCounter
        Element._idCounter = Element._idCounter + 1

    def elements(self):
        """ Iterate across all children of this Element that are Elements. """
        return generateOnlyInterface(self.children, IElement)

    def toXml(self, prefixes=None, closeElement=1, defaultUri='',
                    prefixesInScope=None):
        """ Serialize this Element and all children to a string. """
        s = SerializerClass(prefixes=prefixes, prefixesInScope=prefixesInScope)
        s.serialize(self, closeElement=closeElement, defaultUri=defaultUri)
        return s.getValue()

    def firstChildElement(self):
        """ Return the first child that provides L{IElement}, or C{None}. """
        for c in self.children:
            if IElement.providedBy(c):
                return c
        return None
| 534 | |
| 535 | |
class ParserError(Exception):
    """
    Raised by the element streams in this module when parsing fails.
    """
| 539 | |
def elementStream():
    """ Preferred way to obtain an element stream.

    An L{ExpatElementStream} is returned when it can be constructed. When
    that raises C{ImportError}, a L{SuxElementStream} is returned instead,
    or an C{Exception} is raised if that class is not available either.
    """
    try:
        return ExpatElementStream()
    except ImportError:
        if SuxElementStream is not None:
            return SuxElementStream()
        raise Exception("No parsers available :(")
| 553 | |
try:
    from twisted.web import sux
except ImportError:
    # sux is optional; elementStream() tests for this None marker.
    SuxElementStream = None
else:
    class SuxElementStream(sux.XMLParser):
        """
        Element stream built on the sux parser.

        The first element of the document is reported through
        C{DocumentStartEvent}, every completed direct child of it through
        C{ElementEvent}, and the end tag of the first element through
        C{DocumentEndEvent}. These three callbacks start out as C{None} and
        must be assigned before parsing.
        """

        def __init__(self):
            self.connectionMade()
            self.DocumentStartEvent = None
            self.ElementEvent = None
            self.DocumentEndEvent = None
            self.currElem = None
            self.rootElem = None
            self.documentStarted = False
            # Default namespace URI of every open element, innermost last
            self.defaultNsStack = []
            # Prefix -> URI dictionary of every open element, innermost last
            self.prefixStack = []

        def parse(self, buffer):
            """
            Feed C{buffer} to the parser.

            @raise ParserError: when sux reports a parse error.
            """
            try:
                self.dataReceived(buffer)
            except sux.ParseError:
                # sys.exc_info() keeps this handler valid on every
                # Python version.
                raise ParserError(str(sys.exc_info()[1]))

        def findUri(self, prefix):
            """
            Return the URI bound to C{prefix} in the innermost scope that
            declares it, or C{None} when no scope does.
            """
            # Walk prefix stack backwards, looking for the uri
            # matching the specified prefix
            stack = self.prefixStack
            for i in range(len(stack) - 1, -1, -1):
                if prefix in stack[i]:
                    return stack[i][prefix]
            return None

        def gotTagStart(self, name, attributes):
            """
            Build an L{Element} for a start tag and hook it into the tree.

            Namespace declarations are removed from C{attributes} in place.
            """
            defaultUri = None
            localPrefixes = {}
            attribs = {}

            # Pass 1 - Identify namespace decls. Only "xmlns" itself and
            # "xmlns:<prefix>" are declarations; an attribute that merely
            # starts with those letters is an ordinary attribute.
            for k, v in list(attributes.items()):
                if k == "xmlns" or k.startswith("xmlns:"):
                    x, p = _splitPrefix(k)
                    if x is None: # I.e.  default declaration
                        defaultUri = v
                    else:
                        localPrefixes[p] = v
                    del attributes[k]

            # Push namespace decls onto prefix stack
            self.prefixStack.append(localPrefixes)

            # Determine default namespace for this element; if there
            # is one
            if defaultUri is None:
                if len(self.defaultNsStack) > 0:
                    defaultUri = self.defaultNsStack[-1]
                else:
                    defaultUri = ''

            # Fix up name
            prefix, name = _splitPrefix(name)
            if prefix is None: # This element is in the default namespace
                uri = defaultUri
            else:
                # Find the URI for the prefix
                uri = self.findUri(prefix)

            # Pass 2 - Fix up and escape attributes
            for k, v in attributes.items():
                p, n = _splitPrefix(k)
                if p is None:
                    attribs[n] = v
                else:
                    attribs[(self.findUri(p)), n] = unescapeFromXml(v)

            # Construct the actual Element object
            e = Element((uri, name), defaultUri, attribs, localPrefixes)

            # Save current default namespace
            self.defaultNsStack.append(defaultUri)

            # Document already started
            if self.documentStarted:
                # Starting a new packet
                if self.currElem is None:
                    self.currElem = e
                # Adding to existing element
                else:
                    self.currElem = self.currElem.addChild(e)
            # New document
            else:
                self.rootElem = e
                self.documentStarted = True
                self.DocumentStartEvent(e)

        def gotText(self, data):
            """ Add text to the current element; ignored when there is none. """
            if self.currElem is not None:
                self.currElem.addContent(data)

        def gotCData(self, data):
            """ Add CDATA to the current element; ignored when there is none. """
            if self.currElem is not None:
                self.currElem.addContent(data)

        def gotComment(self, data):
            # Ignore comments for the moment
            pass

        entities = { "amp" : "&",
                     "lt"  : "<",
                     "gt"  : ">",
                     "apos": "'",
                     "quot": "\"" }

        def gotEntityReference(self, entityRef):
            """
            Add the character for a known entity to the current element.

            Unknown entities are ignored, and so is any reference seen
            while there is no current element, like for text.
            """
            if self.currElem is None:
                return
            # If this is an entity we know about, add it as content
            # to the current element
            if entityRef in SuxElementStream.entities:
                self.currElem.addContent(SuxElementStream.entities[entityRef])

        def gotTagEnd(self, name):
            """
            Handle an end tag.

            @raise ParserError: when the document has already ended, or the
                                tag does not match the element it closes.
            """
            # Ensure the document hasn't already ended
            if self.rootElem is None:
                # XXX: Write more legible explanation
                raise ParserError("Element closed after end of document.")

            # Fix up name
            prefix, name = _splitPrefix(name)
            if prefix is None:
                uri = self.defaultNsStack[-1]
            else:
                uri = self.findUri(prefix)

            # End of document
            if self.currElem is None:
                # Ensure element name and uri matches
                if self.rootElem.name != name or self.rootElem.uri != uri:
                    raise ParserError("Mismatched root elements")
                self.DocumentEndEvent()
                self.rootElem = None

            # Other elements
            else:
                # Ensure the tag being closed matches the name of the current
                # element
                if self.currElem.name != name or self.currElem.uri != uri:
                    # XXX: Write more legible explanation
                    raise ParserError("Malformed element close")

                # Pop prefix and default NS stack
                self.prefixStack.pop()
                self.defaultNsStack.pop()

                # Check for parent null parent of current elem;
                # that's the top of the stack
                if self.currElem.parent is None:
                    self.currElem.parent = self.rootElem
                    self.ElementEvent(self.currElem)
                    self.currElem = None

                # Anything else is just some element wrapping up
                else:
                    self.currElem = self.currElem.parent
| 717 | |
| 718 | |
class ExpatElementStream:
    """
    Element stream built on the pyexpat parser.

    The first element of the document is reported through
    C{DocumentStartEvent}, every completed direct child of it through
    C{ElementEvent}, and the end tag of the first element through
    C{DocumentEndEvent}. These three callbacks start out as C{None} and
    must be assigned before parsing.
    """

    def __init__(self):
        # Imported here so that a missing pyexpat surfaces as ImportError
        # on construction, which elementStream() catches.
        import pyexpat
        self.DocumentStartEvent = None
        self.ElementEvent = None
        self.DocumentEndEvent = None
        self.error = pyexpat.error
        # A space separates namespace URI and local name in reported names
        self.parser = pyexpat.ParserCreate("UTF-8", " ")
        self.parser.StartElementHandler = self._onStartElement
        self.parser.EndElementHandler = self._onEndElement
        self.parser.CharacterDataHandler = self._onCdata
        self.parser.StartNamespaceDeclHandler = self._onStartNamespace
        self.parser.EndNamespaceDeclHandler = self._onEndNamespace
        self.currElem = None
        self.defaultNsStack = ['']
        self.documentStarted = 0
        # Prefix declarations collected for the next element to start
        self.localPrefixes = {}

    def parse(self, buffer):
        """
        Feed C{buffer} to the parser.

        @raise ParserError: when pyexpat reports an error.
        """
        try:
            self.parser.Parse(buffer)
        except self.error:
            # sys.exc_info() keeps this handler valid on every
            # Python version.
            raise ParserError(str(sys.exc_info()[1]))

    def _onStartElement(self, name, attrs):
        """ Build an L{Element} for a start tag and hook it into the tree. """
        # Generate a qname tuple from the provided name
        qname = name.split(" ")
        if len(qname) == 1:
            qname = ('', name)

        # Process attributes. Iterate over a snapshot, because qualified
        # keys are replaced in the dictionary while looping.
        for k, v in list(attrs.items()):
            if k.find(" ") != -1:
                aqname = k.split(" ")
                attrs[(aqname[0], aqname[1])] = v
                del attrs[k]

        # Construct the new element
        e = Element(qname, self.defaultNsStack[-1], attrs, self.localPrefixes)
        self.localPrefixes = {}

        # Document already started
        if self.documentStarted == 1:
            if self.currElem is not None:
                self.currElem.children.append(e)
                e.parent = self.currElem
            self.currElem = e

        # New document
        else:
            self.documentStarted = 1
            self.DocumentStartEvent(e)

    def _onEndElement(self, _):
        """ Handle an end tag; the reported name is not used. """
        # Check for null current elem; end of doc
        if self.currElem is None:
            self.DocumentEndEvent()

        # Check for parent that is None; that's
        # the top of the stack
        elif self.currElem.parent is None:
            self.ElementEvent(self.currElem)
            self.currElem = None

        # Anything else is just some element in the current
        # packet wrapping up
        else:
            self.currElem = self.currElem.parent

    def _onCdata(self, data):
        """ Add text to the current element; ignored when there is none. """
        if self.currElem is not None:
            self.currElem.addContent(data)

    def _onStartNamespace(self, prefix, uri):
        """ Record a namespace declaration. """
        # If this is the default namespace, put
        # it on the stack
        if prefix is None:
            self.defaultNsStack.append(uri)
        else:
            self.localPrefixes[prefix] = uri

    def _onEndNamespace(self, prefix):
        """ Leave the scope of a default namespace declaration. """
        # Remove last element on the stack
        if prefix is None:
            self.defaultNsStack.pop()
| 804 | |
| 805 ## class FileParser(ElementStream): | |
| 806 ## def __init__(self): | |
| 807 ## ElementStream.__init__(self) | |
| 808 ## self.DocumentStartEvent = self.docStart | |
| 809 ## self.ElementEvent = self.elem | |
| 810 ## self.DocumentEndEvent = self.docEnd | |
| 811 ## self.done = 0 | |
| 812 | |
| 813 ## def docStart(self, elem): | |
| 814 ## self.document = elem | |
| 815 | |
| 816 ## def elem(self, elem): | |
| 817 ## self.document.addChild(elem) | |
| 818 | |
| 819 ## def docEnd(self): | |
| 820 ## self.done = 1 | |
| 821 | |
| 822 ## def parse(self, filename): | |
| 823 ## for l in open(filename).readlines(): | |
| 824 ## self.parser.Parse(l) | |
| 825 ## assert self.done == 1 | |
| 826 ## return self.document | |
| 827 | |
| 828 ## def parseFile(filename): | |
| 829 ## return FileParser().parse(filename) | |
| 830 | |
| 831 | |
| OLD | NEW |