OLD | NEW |
| (Empty) |
1 # -*- test-case-name: twisted.web.test.test_xml -*- | |
2 # | |
3 # Copyright (c) 2001-2004 Twisted Matrix Laboratories. | |
4 # See LICENSE for details. | |
5 | |
6 # | |
7 | |
8 """Micro Document Object Model: a partial DOM implementation with SUX. | |
9 | |
10 This is an implementation of what we consider to be the useful subset of the | |
11 DOM. The chief advantage of this library is that, not being burdened with | |
12 standards compliance, it can remain very stable between versions. We can also | |
13 implement utility 'pythonic' ways to access and mutate the XML tree. | |
14 | |
15 Since this has not been subjected to a serious trial by fire, it is not recommended | |
16 to use this outside of Twisted applications. However, it seems to work just | |
17 fine for the documentation generator, which parses a fairly representative | |
18 sample of XML. | |
19 | |
20 Microdom mainly focuses on working with HTML and XHTML. | |
21 """ | |
22 | |
23 from __future__ import nested_scopes | |
24 | |
25 # System Imports | |
26 import re | |
27 from cStringIO import StringIO | |
28 | |
29 # Twisted Imports | |
30 from twisted.web.sux import XMLParser, ParseError | |
31 from twisted.python.util import InsensitiveDict | |
32 | |
33 # create NodeList class | |
34 from types import ListType as NodeList | |
35 from types import StringTypes, UnicodeType | |
36 | |
def getElementsByTagName(iNode, name):
    """
    Collect every node at or below C{iNode} whose C{nodeName} equals C{name}.

    The subtree is walked depth-first in document order: a node is examined
    before its children, and children are examined left to right.  C{iNode}
    itself takes part in the search.

    @param iNode: root of the subtree to search; it and every descendant must
        provide C{nodeName} and C{childNodes}.
    @param name: tag name to look for; the comparison is case-sensitive.
    @return: list of the matching nodes, in document order.
    """
    matches = []
    # Pending nodes live on a stack whose top is the next node in document
    # order.  Popping from the end is O(1), unlike popping from the front.
    pending = [iNode]
    while pending:
        node = pending.pop()
        if node.nodeName == name:
            matches.append(node)
        # Push children right-to-left so the leftmost child is popped first.
        children = list(node.childNodes)
        children.reverse()
        pending.extend(children)
    return matches
47 | |
def getElementsByTagNameNoCase(iNode, name):
    """
    Collect every node at or below C{iNode} whose C{nodeName} equals C{name}
    when both are lower-cased.

    The subtree is walked depth-first in document order: a node is examined
    before its children, and children are examined left to right.  C{iNode}
    itself takes part in the search.

    @param iNode: root of the subtree to search; it and every descendant must
        provide a string C{nodeName} and C{childNodes}.
    @param name: tag name to look for; letter case is ignored.
    @return: list of the matching nodes, in document order.
    """
    wanted = name.lower()
    matches = []
    # Pending nodes live on a stack whose top is the next node in document
    # order.  Popping from the end is O(1), unlike popping from the front.
    pending = [iNode]
    while pending:
        node = pending.pop()
        if node.nodeName.lower() == wanted:
            matches.append(node)
        # Push children right-to-left so the leftmost child is popped first.
        children = list(node.childNodes)
        children.reverse()
        pending.extend(children)
    return matches
59 | |
# Each pair is (literal character, entity that replaces it).
# Order is important: '&' has to be escaped first (and so unescaped last),
# otherwise the ampersands introduced by the other entities would be
# escaped a second time.
HTML_ESCAPE_CHARS = (('&', '&amp;'),  # don't add any entities before this one
                     ('<', '&lt;'),
                     ('>', '&gt;'),
                     ('"', '&quot;'))
REV_HTML_ESCAPE_CHARS = list(HTML_ESCAPE_CHARS)
REV_HTML_ESCAPE_CHARS.reverse()

# XML additionally defines an entity for the apostrophe.
XML_ESCAPE_CHARS = HTML_ESCAPE_CHARS + (("'", '&apos;'),)
REV_XML_ESCAPE_CHARS = list(XML_ESCAPE_CHARS)
REV_XML_ESCAPE_CHARS.reverse()
71 | |
def unescape(text, chars=REV_HTML_ESCAPE_CHARS):
    """
    Perform the exact opposite of L{escape}.

    @param text: string in which entities should be turned back into the
        characters they stand for.
    @param chars: sequence of C{(character, entity)} pairs, applied in the
        given order.
    @return: C{text} with every occurrence of each entity replaced by its
        character.
    """
    result = text
    for pair in chars:
        plain, entity = pair
        result = result.replace(entity, plain)
    return result
77 | |
def escape(text, chars=HTML_ESCAPE_CHARS):
    """
    Escape a few XML special chars with XML entities.

    @param text: string to escape.
    @param chars: sequence of C{(character, entity)} pairs, applied in the
        given order.
    @return: C{text} with every occurrence of each character replaced by its
        entity.
    """
    result = text
    for pair in chars:
        plain, entity = pair
        result = result.replace(plain, entity)
    return result
83 | |
84 | |
class MismatchedTags(Exception):
    """
    Error describing a closing tag that differs from the one expected.

    The instance records the file name, the expected and the encountered tag
    names, the position where the element began and the position of the
    offending end tag.
    """

    def __init__(self, filename, expect, got, endLine, endCol, begLine, begCol):
        # Note the argument order: the end position comes before the
        # begin position.
        self.filename = filename
        self.expect = expect
        self.got = got
        self.begLine = begLine
        self.begCol = begCol
        self.endLine = endLine
        self.endCol = endCol

    def __str__(self):
        template = ("expected </%s>, got </%s> line: %s col: %s, "
                    "began line: %s col: %s")
        values = (self.expect, self.got, self.endLine, self.endCol,
                  self.begLine, self.begCol)
        return template % values
95 | |
96 | |
class Node(object):
    """
    Base class of the microdom tree: a node with an optional parent and an
    ordered list of children.

    @ivar parentNode: the node this node is attached to, or C{None}.
    @ivar childNodes: list of the child nodes, in document order.
    """
    nodeName = "Node"

    def __init__(self, parentNode=None):
        self.parentNode = parentNode
        self.childNodes = []

    def isEqualToNode(self, n):
        """
        Compare the children of this node with the children of C{n},
        pairwise and recursively.

        @return: C{1} if both nodes have the same number of children and
            every pair of children compares equal, C{0} otherwise.
        """
        # zip() silently stops at the shorter list, so a differing number
        # of children has to be ruled out explicitly.
        if len(self.childNodes) != len(n.childNodes):
            return 0
        for a, b in zip(self.childNodes, n.childNodes):
            if not a.isEqualToNode(b):
                return 0
        return 1

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Serialize this node to C{stream}.  Must be provided by subclasses.

        @raise NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def toxml(self, indent='', addindent='', newl='', strip=0, nsprefixes={},
              namespace=''):
        """
        Serialize this node with L{writexml} into an in-memory stream.

        @return: everything L{writexml} wrote, as a single string.
        """
        s = StringIO()
        self.writexml(s, indent, addindent, newl, strip, nsprefixes, namespace)
        rv = s.getvalue()
        return rv

    def writeprettyxml(self, stream, indent='', addindent=' ', newl='\n',
                       strip=0):
        """
        Call L{writexml} with defaults that indent and break lines.
        """
        return self.writexml(stream, indent, addindent, newl, strip)

    def toprettyxml(self, indent='', addindent=' ', newl='\n', strip=0):
        """
        Call L{toxml} with defaults that indent and break lines.
        """
        return self.toxml(indent, addindent, newl, strip)

    def cloneNode(self, deep=0, parent=None):
        """
        Return a copy of this node.  Must be provided by subclasses.

        @raise NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def hasChildNodes(self):
        """
        @return: C{1} if this node has at least one child, else C{0}.
        """
        if self.childNodes:
            return 1
        else:
            return 0

    def appendChild(self, child):
        """
        Add C{child} after the existing children and make this node its
        parent.
        """
        assert isinstance(child, Node)
        self.childNodes.append(child)
        child.parentNode = self

    def insertBefore(self, new, ref):
        """
        Insert C{new} immediately before the existing child C{ref}.

        @raise ValueError: if C{ref} is not a child of this node.
        @return: C{new}.
        """
        i = self.childNodes.index(ref)
        new.parentNode = self
        self.childNodes.insert(i, new)
        return new

    def removeChild(self, child):
        """
        Detach C{child} from this node if it is one of its children; its
        C{parentNode} is cleared in either case.

        @return: C{child}.
        """
        if child in self.childNodes:
            self.childNodes.remove(child)
            child.parentNode = None
        return child

    def replaceChild(self, newChild, oldChild):
        """
        Put C{newChild} in the position currently held by C{oldChild}.

        @raise AssertionError: if C{newChild} is not a L{Node} or if
            C{oldChild.parentNode} is not this node.
        @raise ValueError: if C{oldChild} is not in C{childNodes}.
        """
        assert isinstance(newChild, Node)
        #if newChild.parentNode:
        #    newChild.parentNode.removeChild(newChild)
        # The condition and the message are two operands of ``assert``;
        # wrapping both in one pair of parentheses would assert a non-empty
        # tuple, which is always true.
        assert oldChild.parentNode is self, (
            'oldChild (%s): oldChild.parentNode (%s) != self (%s)'
            % (oldChild, oldChild.parentNode, self))
        self.childNodes[self.childNodes.index(oldChild)] = newChild
        oldChild.parentNode = None
        newChild.parentNode = self

    def lastChild(self):
        """
        @return: the last child, or C{None} if this node has no children.
        """
        if len(self.childNodes):
            return self.childNodes[-1]
        return None

    def firstChild(self):
        """
        @return: the first child, or C{None} if this node has no children.
        """
        if len(self.childNodes):
            return self.childNodes[0]
        return None
171 | |
172 #def get_ownerDocument(self): | |
173 # """This doesn't really get the owner document; microdom nodes | |
174 # don't even have one necessarily. This gets the root node, | |
175 # which is usually what you really meant. | |
176 # *NOT DOM COMPLIANT.* | |
177 # """ | |
178 # node=self | |
179 # while (node.parentNode): node=node.parentNode | |
180 # return node | |
181 #ownerDocument=node.get_ownerDocument() | |
182 # leaving commented for discussion; see also domhelpers.getParents(node) | |
183 | |
class Document(Node):
    """
    Root of a microdom tree.  It holds at most one child, the document
    element, plus an optional doctype string.

    @ivar doctype: text written inside C{<!DOCTYPE ...>}, or C{None}.
    """

    doctype = None

    def __init__(self, documentElement=None):
        Node.__init__(self)
        if documentElement:
            self.appendChild(documentElement)

    def cloneNode(self, deep=0, parent=None):
        """
        Build a new L{Document} with the same doctype.

        With C{deep} true the document element is cloned recursively;
        otherwise the very same element object is attached to the new
        document.
        """
        copy = Document()
        copy.doctype = self.doctype
        root = self.documentElement
        if deep:
            root = root.cloneNode(1, self)
        copy.appendChild(root)
        return copy

    def isEqualToDocument(self, n):
        """
        @return: a true value if the doctypes are equal and the child nodes
            compare equal through C{isEqualToNode}.
        """
        if self.doctype == n.doctype:
            return self.isEqualToNode(n)
        return False

    def get_documentElement(self):
        """
        @return: the first (and only) child of the document.
        @raise IndexError: if the document is empty.
        """
        return self.childNodes[0]
    documentElement = property(get_documentElement)

    def appendChild(self, c):
        """
        Attach C{c} as the document element; a document accepts only one.
        """
        assert not self.childNodes, "Only one element per document."
        Node.appendChild(self, c)

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Write the XML declaration, the doctype (if any) and then the
        document element to C{stream}.
        """
        write = stream.write
        write('<?xml version="1.0"?>' + newl)
        if self.doctype:
            write("<!DOCTYPE " + self.doctype + ">" + newl)
        root = self.documentElement
        root.writexml(stream, indent, addindent, newl, strip,
                      nsprefixes, namespace)

    # of dubious utility (?)
    def createElement(self, name, **kw):
        """Return a new, unattached L{Element} called C{name}."""
        return Element(name, **kw)

    def createTextNode(self, text):
        """Return a new, unattached L{Text} node holding C{text}."""
        return Text(text)

    def createComment(self, text):
        """Return a new, unattached L{Comment} node holding C{text}."""
        return Comment(text)

    def getElementsByTagName(self, name):
        """
        Search the whole document for nodes named C{name}; letter case is
        ignored when the document element is case-insensitive.
        """
        if not self.documentElement.caseInsensitive:
            return getElementsByTagName(self, name)
        return getElementsByTagNameNoCase(self, name)

    def getElementById(self, id):
        """
        Breadth-first search for the first node whose C{id} attribute equals
        C{id}.

        @return: the matching node, or C{None} when there is none.
        """
        queue = self.childNodes[:]
        while queue:
            candidate = queue.pop(0)
            if candidate.childNodes:
                queue.extend(candidate.childNodes)
            if not hasattr(candidate, 'getAttribute'):
                continue
            if candidate.getAttribute("id") == id:
                return candidate
        return None
245 | |
246 | |
class EntityReference(Node):
    """
    A reference to a named entity, kept verbatim.

    @ivar eref: entity name without the surrounding C{&} and C{;}.
    @ivar nodeValue: the full reference text, C{"&" + eref + ";"}; also
        available as C{data}.
    """

    def __init__(self, eref, parentNode=None):
        Node.__init__(self, parentNode)
        self.eref = eref
        reference = "&" + eref + ";"
        self.data = reference
        self.nodeValue = reference

    def isEqualToEntityReference(self, n):
        """
        @return: a true value if C{n} is an L{EntityReference} with the same
            C{eref} and C{nodeValue}; C{0} if it is another kind of object.
        """
        if isinstance(n, EntityReference):
            return (self.eref == n.eref) and (self.nodeValue == n.nodeValue)
        return 0

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """Write the reference text unchanged to C{stream}."""
        stream.write(self.nodeValue)

    def cloneNode(self, deep=0, parent=None):
        """Return a new reference to the same entity, attached to C{parent}."""
        return EntityReference(self.eref, parent)
265 | |
266 | |
class CharacterData(Node):
    """
    Base class for nodes that carry a piece of data.

    The data is stored under three attribute names: C{value}, C{data} and
    C{nodeValue}.
    """

    def __init__(self, data, parentNode=None):
        Node.__init__(self, parentNode)
        self.nodeValue = data
        self.data = data
        self.value = data

    def isEqualToCharacterData(self, n):
        """
        @return: whether C{n.value} equals this node's C{value}.
        """
        return self.value == n.value
275 | |
276 | |
class Comment(CharacterData):
    """A comment node."""

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Write the comment as C{<!--...-->}; unicode data is encoded as
        UTF-8 first.
        """
        text = self.data
        if isinstance(text, UnicodeType):
            text = text.encode('utf8')
        stream.write("<!--%s-->" % text)

    def cloneNode(self, deep=0, parent=None):
        """Return a new comment with the same value, attached to C{parent}."""
        return Comment(self.nodeValue, parent)
289 | |
290 | |
class Text(CharacterData):
    """
    A text node.

    @ivar raw: when true the value is written as-is; otherwise it is passed
        through L{escape} on output.
    """

    def __init__(self, data, parentNode=None, raw=0):
        CharacterData.__init__(self, data, parentNode)
        self.raw = raw

    def cloneNode(self, deep=0, parent=None):
        """Return a new text node with the same value and C{raw} flag."""
        return Text(self.nodeValue, parent, self.raw)

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Write the text to C{stream}.

        Non-string values are converted with C{str}.  Unless the node is
        raw, runs of whitespace are collapsed to single spaces when C{strip}
        is true and the result is escaped.  Unicode is encoded as UTF-8.
        """
        text = self.nodeValue
        if not isinstance(text, StringTypes):
            text = str(text)
        if not self.raw:
            if strip:
                text = ' '.join(text.split())
            text = escape(text)
        if isinstance(text, UnicodeType):
            text = text.encode('utf8')
        stream.write(text)

    def __repr__(self):
        return "Text(%r)" % (self.nodeValue,)
319 | |
320 | |
class CDATASection(CharacterData):
    """A CDATA section; its value is written without any escaping."""

    def cloneNode(self, deep=0, parent=None):
        """Return a new section with the same value, attached to C{parent}."""
        return CDATASection(self.nodeValue, parent)

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """Write the value wrapped in C{<![CDATA[} and C{]]>}."""
        write = stream.write
        for piece in ("<![CDATA[", self.nodeValue, "]]>"):
            write(piece)
330 | |
def _genprefix():
    """Yield the endless sequence of prefixes C{'p0'}, C{'p1'}, C{'p2'}..."""
    i = 0
    while True:
        yield 'p' + str(i)
        i = i + 1

# Single module-wide generator, so every prefix is handed out only once.
_prefixes = _genprefix()

def genprefix():
    """
    Return the next prefix from the module-wide sequence.

    @return: C{'p0'} on the first call, C{'p1'} on the second, and so on.
    """
    # A for loop drives the iterator protocol without referring to the
    # generator's ``next`` method, whose name differs between Python
    # versions.  The generator never ends, so the loop always returns.
    for prefix in _prefixes:
        return prefix
337 | |
class _Attr(CharacterData):
    """
    Support class for getAttributeNode.

    It adds nothing to L{CharacterData}; the attribute's value is simply
    the node's data.
    """
340 | |
class Element(Node):
    """
    An element: a tag name, a mapping of attributes and child nodes.

    @ivar preserveCase: when false, tag names are lower-cased.
    @ivar caseInsensitive: when true, the attributes live in an
        C{InsensitiveDict} and tag name searches and comparisons ignore
        case.
    @ivar nsprefixes: mapping of namespace to prefix declared on this
        element, or C{None}.
    """

    preserveCase = 0
    caseInsensitive = 1
    nsprefixes = None

    def __init__(self, tagName, attributes=None, parentNode=None,
                 filename=None, markpos=None,
                 caseInsensitive=1, preserveCase=0,
                 namespace=None):
        """
        @param tagName: name of the element; lower-cased unless
            C{preserveCase} is true.
        @param attributes: optional mapping of attribute values.  The
            mapping is adopted and its values are unescaped in place.
        @param parentNode: parent node, or C{None}.
        @param filename: file name kept for use in C{repr}/C{str} output.
        @param markpos: (line, column) pair kept for the same purpose.
        @param namespace: namespace of the element, or C{None}.
        """
        Node.__init__(self, parentNode)
        self.caseInsensitive = caseInsensitive
        self.preserveCase = preserveCase or not caseInsensitive
        # The decision to lower-case depends on the argument, not on the
        # derived self.preserveCase.
        if not preserveCase:
            tagName = tagName.lower()

        if attributes is None:
            self.attributes = {}
        else:
            self.attributes = attributes
            for key, value in self.attributes.items():
                self.attributes[key] = unescape(value)
        if caseInsensitive:
            self.attributes = InsensitiveDict(self.attributes,
                                              preserve=preserveCase)

        self.tagName = tagName
        self.nodeName = tagName
        self.endTagName = tagName
        self._filename = filename
        self._markpos = markpos
        self.namespace = namespace

    def addPrefixes(self, pfxs):
        """
        Record the namespace-to-prefix mapping C{pfxs} on this element.  The
        first mapping is adopted as-is; later ones are merged into it.
        """
        if self.nsprefixes is not None:
            self.nsprefixes.update(pfxs)
        else:
            self.nsprefixes = pfxs

    def endTag(self, endTagName):
        """
        Remember the name used by the closing tag, lower-cased unless case
        is preserved.
        """
        if not self.preserveCase:
            endTagName = endTagName.lower()
        self.endTagName = endTagName

    def isEqualToElement(self, n):
        """
        @return: a true value if C{n} has equal attributes and the same node
            name (ignoring case for a case-insensitive element).
        """
        if not (self.attributes == n.attributes):
            return False
        if self.caseInsensitive:
            return self.nodeName.lower() == n.nodeName.lower()
        return self.nodeName == n.nodeName

    def cloneNode(self, deep=0, parent=None):
        """
        Return a copy of this element attached to C{parent}.  Attributes are
        copied; children are cloned recursively only when C{deep} is true.
        """
        clone = Element(self.tagName,
                        parentNode=parent,
                        namespace=self.namespace,
                        preserveCase=self.preserveCase,
                        caseInsensitive=self.caseInsensitive)
        clone.attributes.update(self.attributes)
        clonedChildren = []
        if deep:
            for child in self.childNodes:
                clonedChildren.append(child.cloneNode(1, clone))
        clone.childNodes = clonedChildren
        return clone

    def getElementsByTagName(self, name):
        """
        Search this element and its descendants for nodes named C{name};
        case is ignored for a case-insensitive element.
        """
        if not self.caseInsensitive:
            return getElementsByTagName(self, name)
        return getElementsByTagNameNoCase(self, name)

    def hasAttributes(self):
        """@return: always C{1}."""
        return 1

    def getAttribute(self, name, default=None):
        """@return: the value of attribute C{name}, or C{default}."""
        return self.attributes.get(name, default)

    def getAttributeNS(self, ns, name, default=None):
        """
        Look up an attribute by namespace and name.

        The attribute stored under the key C{(ns, name)} wins; otherwise,
        if C{ns} is this element's own namespace, the plain attribute
        C{name} is used.

        @return: the attribute value, or C{default}.
        """
        qualified = (ns, name)
        if qualified in self.attributes:
            return self.attributes[qualified]
        if ns != self.namespace:
            return default
        return self.attributes.get(name, default)

    def getAttributeNode(self, name):
        """@return: an L{_Attr} wrapping the value of attribute C{name}."""
        return _Attr(self.getAttribute(name), self)

    def setAttribute(self, name, attr):
        """Set attribute C{name} to the value C{attr}."""
        self.attributes[name] = attr

    def removeAttribute(self, name):
        """Delete attribute C{name} if it is present."""
        if name in self.attributes:
            del self.attributes[name]

    def hasAttribute(self, name):
        """@return: whether attribute C{name} is present."""
        return name in self.attributes

    def writexml(self, stream, indent='', addindent='', newl='', strip=0,
                 nsprefixes={}, namespace=''):
        """
        Serialize this element and its children to C{stream}.

        @param indent: indentation written before block elements.
        @param addindent: extra indentation for each nesting level.
        @param newl: line separator written around block elements.
        @param strip: passed on to the children.
        @param nsprefixes: namespace-to-prefix mapping already declared by
            the ancestors; it is not modified.
        @param namespace: namespace of the parent element.
        """
        # write beginning
        ALLOWSINGLETON = ('img', 'br', 'hr', 'base', 'meta', 'link', 'param',
                          'area', 'input', 'col', 'basefont', 'isindex',
                          'frame')
        BLOCKELEMENTS = ('html', 'head', 'body', 'noscript', 'ins', 'del',
                         'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'script',
                         'ul', 'ol', 'dl', 'pre', 'hr', 'blockquote',
                         'address', 'p', 'div', 'fieldset', 'table', 'tr',
                         'form', 'object', 'fieldset', 'applet', 'map')
        FORMATNICELY = ('tr', 'ul', 'ol', 'head')

        # this should never be necessary unless people start
        # changing .tagName on the fly(?)
        if not self.preserveCase:
            self.endTagName = self.tagName
        write = stream.write
        join = ''.join
        isBlock = self.tagName in BLOCKELEMENTS

        # Prefixes declared here that the ancestors have not declared yet.
        if self.nsprefixes:
            newprefixes = self.nsprefixes.copy()
            for ns in nsprefixes.keys():
                if ns in newprefixes:
                    del newprefixes[ns]
        else:
            newprefixes = {}

        # Pieces of the start tag, joined and written in one go below.
        if isBlock:
            begin = [newl, indent, '<']
        else:
            begin = ['<']

        def writeattr(attrName, attrValue):
            begin.extend((' ', attrName, '="', escape(attrValue), '"'))

        if namespace != self.namespace and self.namespace is not None:
            if self.namespace in nsprefixes:
                prefix = nsprefixes[self.namespace]
                begin.append(prefix + ':' + self.tagName)
            else:
                begin.append(self.tagName)
                writeattr("xmlns", self.namespace)
        else:
            begin.append(self.tagName)

        for attr, val in self.attributes.iteritems():
            if isinstance(attr, tuple):
                # (namespace, name) key: needs a prefix, generated if the
                # namespace has none yet.
                ns, key = attr
                if ns in nsprefixes:
                    prefix = nsprefixes[ns]
                else:
                    prefix = genprefix()
                    newprefixes[ns] = prefix
                assert val is not None
                writeattr(prefix + ':' + key, val)
            else:
                assert val is not None
                writeattr(attr, val)

        if newprefixes:
            for ns, prefix in newprefixes.iteritems():
                if prefix:
                    writeattr('xmlns:' + prefix, ns)
            newprefixes.update(nsprefixes)
            downprefixes = newprefixes
        else:
            downprefixes = nsprefixes

        write(join(begin))
        if self.childNodes:
            write(">")
            newindent = indent + addindent
            separateChildren = isBlock and self.tagName in FORMATNICELY
            for child in self.childNodes:
                if separateChildren:
                    write(join((newl, newindent)))
                child.writexml(stream, newindent, addindent, newl, strip,
                               downprefixes, self.namespace)
            if isBlock:
                write(join((newl, indent)))
            write(join(("</", self.endTagName, '>')))
        elif self.tagName.lower() not in ALLOWSINGLETON:
            write(join(('></', self.endTagName, '>')))
        else:
            write(" />")

    def __repr__(self):
        parts = [repr(self.nodeName)]
        if self.attributes:
            parts.append("attributes=%r" % (self.attributes,))
        if self._filename:
            parts.append("filename=%r" % (self._filename,))
        if self._markpos:
            parts.append("markpos=%r" % (self._markpos,))
        return "Element(" + ", ".join(parts) + ')'

    def __str__(self):
        pieces = ["<" + self.nodeName]
        located = self._filename or self._markpos
        if located:
            pieces.append(" (")
        if self._filename:
            pieces.append(repr(self._filename))
        if self._markpos:
            pieces.append(" line %s column %s" % self._markpos)
        if located:
            pieces.append(")")
        for item in self.attributes.items():
            pieces.append(" %s=%r" % item)
        if self.hasChildNodes():
            pieces.append(" >...</%s>" % self.nodeName)
        else:
            pieces.append(" />")
        return "".join(pieces)
540 | |
def _unescapeDict(d):
    """
    Return a new dict with the keys of C{d} and every value passed through
    L{unescape}.  C{d} itself is left untouched.
    """
    return dict([(key, unescape(value)) for key, value in d.items()])
546 | |
def _reverseDict(d):
    """
    Return a new dict that maps every value of C{d} back to its key.

    When several keys share a value, the one met last while iterating over
    C{d} wins.
    """
    return dict([(value, key) for key, value in d.items()])
552 | |
class MicroDOMParser(XMLParser):
    """
    L{XMLParser} subclass whose C{got*} callbacks assemble L{Element},
    L{Text}, L{Comment}, L{EntityReference} and L{CDATASection} nodes into
    trees.  Finished top-level nodes are collected in C{self.documents}.
    """

    # <dash> glyph: a quick scan thru the DTD says BODY, AREA, LINK, IMG, HR,
    # P, DT, DD, LI, INPUT, OPTION, THEAD, TFOOT, TBODY, COLGROUP, COL, TR, TH,
    # TD, HEAD, BASE, META, HTML all have optional closing tags

    # Tags that gotTagStart closes right after opening them (lenient mode).
    soonClosers = 'area link br img hr input base meta'.split()
    # Maps an open tag to the tags whose start implicitly closes it
    # (lenient mode).
    laterClosers = {'p': ['p', 'dt'],
                    'dt': ['dt','dd'],
                    'dd': ['dt', 'dd'],
                    'li': ['li'],
                    'tbody': ['thead', 'tfoot', 'tbody'],
                    'thead': ['thead', 'tfoot', 'tbody'],
                    'tfoot': ['thead', 'tfoot', 'tbody'],
                    'colgroup': ['colgroup'],
                    'col': ['col'],
                    'tr': ['tr'],
                    'td': ['td'],
                    'th': ['th'],
                    'head': ['body'],
                    'title': ['head', 'body'], # this looks wrong...
                    'option': ['option'],
                    }


    def __init__(self, beExtremelyLenient=0, caseInsensitive=1, preserveCase=0,
                 soonClosers=soonClosers, laterClosers=laterClosers):
        """
        @param beExtremelyLenient: when true, mismatched or missing tags are
            repaired instead of raising L{MismatchedTags}.
        @param caseInsensitive: passed on to every L{Element} created.
        @param preserveCase: passed on to every L{Element} created; forced
            on when C{caseInsensitive} is false.
        @param soonClosers: see the class attribute of the same name.
        @param laterClosers: see the class attribute of the same name.
        """
        # Elements that are currently open, outermost first.
        self.elementstack = []
        d = {'xmlns': 'xmlns', '': None}
        dr = _reverseDict(d)
        # Each entry is (prefix -> namespace, element that introduced the
        # scope or None, reverse mapping namespace -> prefix).
        self.nsstack = [(d,None,dr)]
        self.documents = []
        self._mddoctype = None
        self.beExtremelyLenient = beExtremelyLenient
        self.caseInsensitive = caseInsensitive
        self.preserveCase = preserveCase or not caseInsensitive
        self.soonClosers = soonClosers
        self.laterClosers = laterClosers
        # self.indentlevel = 0

    def shouldPreserveSpace(self):
        """
        @return: C{1} if any open element is a C{pre} tag or carries
            C{xml:space="preserve"}, else C{0}.
        """
        for edx in xrange(len(self.elementstack)):
            # Index -0 is index 0, so the outermost element is looked at
            # first and then the rest from the innermost outwards; every
            # open element is still examined exactly once.
            el = self.elementstack[-edx]
            if el.tagName == 'pre' or el.getAttribute("xml:space", '') == 'preserve':
                return 1
        return 0

    def _getparent(self):
        """
        @return: the innermost open element, or C{None} if none is open.
        """
        if self.elementstack:
            return self.elementstack[-1]
        else:
            return None

    # Leading "//" or "/*" with surrounding whitespace, as found in front of
    # a commented-out script body.
    COMMENT = re.compile(r"\s*/[/*]\s*")

    def _fixScriptElement(self, el):
        """
        In lenient mode, re-parse the single text child of C{el}; if it
        turns out to be exactly one comment or CDATA section, replace the
        text with that node (keeping any leading C{//} or C{/*} as text).
        """
        # this deals with case where there is comment or CDATA inside
        # <script> tag and we want to do the right thing with it
        if not self.beExtremelyLenient or not len(el.childNodes) == 1:
            return
        c = el.firstChild()
        if isinstance(c, Text):
            # deal with nasty people who do stuff like:
            #   <script> // <!--
            #      x = 1;
            #   // --></script>
            # tidy does this, for example.
            prefix = ""
            oldvalue = c.value
            match = self.COMMENT.match(oldvalue)
            if match:
                prefix = match.group()
                oldvalue = oldvalue[len(prefix):]

            # now see if contents are actual node and comment or CDATA
            try:
                e = parseString("<a>%s</a>" % oldvalue).childNodes[0]
            except (ParseError, MismatchedTags):
                return
            if len(e.childNodes) != 1:
                return
            e = e.firstChild()
            if isinstance(e, (CDATASection, Comment)):
                # The new children are appended to the list directly, so
                # their parentNode is not updated to el here.
                el.childNodes = []
                if prefix:
                    el.childNodes.append(Text(prefix))
                el.childNodes.append(e)

    def gotDoctype(self, doctype):
        """Remember the doctype for the document built by L{parse}."""
        self._mddoctype = doctype

    def gotTagStart(self, name, attributes):
        """
        Open a new element called C{name}.

        In lenient mode the current parent is closed first if C{name} is one
        of its C{laterClosers}.  C{xmlns} attributes are turned into
        namespace declarations, prefixed attributes are re-keyed as
        C{(namespace, name)}, and the new element is pushed on the stack
        and appended to its parent.  In lenient mode a C{soonClosers} tag
        is closed again at once.
        """
        # print ' '*self.indentlevel, 'start tag',name
        # self.indentlevel += 1
        parent = self._getparent()
        if (self.beExtremelyLenient and isinstance(parent, Element)):
            parentName = parent.tagName
            myName = name
            if self.caseInsensitive:
                parentName = parentName.lower()
                myName = myName.lower()
            if myName in self.laterClosers.get(parentName, []):
                self.gotTagEnd(parent.tagName)
                parent = self._getparent()
        attributes = _unescapeDict(attributes)
        namespaces = self.nsstack[-1][0]
        # prefix -> namespace declarations made by this very tag
        newspaces = {}
        # NOTE(review): entries are deleted while looping over items();
        # this relies on items() returning a list copy -- confirm before
        # porting to an interpreter where it returns a live view.
        for k, v in attributes.items():
            if k.startswith('xmlns'):
                spacenames = k.split(':',1)
                if len(spacenames) == 2:
                    newspaces[spacenames[1]] = v
                else:
                    newspaces[''] = v
                del attributes[k]
        if newspaces:
            # copy, so the enclosing scope's mapping stays unchanged
            namespaces = namespaces.copy()
            namespaces.update(newspaces)
        for k, v in attributes.items():
            ksplit = k.split(':', 1)
            if len(ksplit) == 2:
                pfx, tv = ksplit
                if pfx != 'xml' and namespaces.has_key(pfx):
                    attributes[namespaces[pfx], tv] = v
                    del attributes[k]
        el = Element(name, attributes, parent,
                     self.filename, self.saveMark(),
                     caseInsensitive=self.caseInsensitive,
                     preserveCase=self.preserveCase,
                     namespace=namespaces.get(''))
        revspaces = _reverseDict(newspaces)
        el.addPrefixes(revspaces)

        if newspaces:
            # this element opens a new namespace scope
            rscopy = self.nsstack[-1][2].copy()
            rscopy.update(revspaces)
            self.nsstack.append((namespaces, el, rscopy))
        self.elementstack.append(el)
        if parent:
            parent.appendChild(el)
        if (self.beExtremelyLenient and el.tagName in self.soonClosers):
            self.gotTagEnd(name)

    def _gotStandalone(self, factory, data):
        """
        Create a node with C{factory(data, parent)} and attach it to the
        innermost open element.  Without an open element the node is kept
        as a top-level document in lenient mode and dropped otherwise.
        """
        parent = self._getparent()
        te = factory(data, parent)
        if parent:
            parent.appendChild(te)
        elif self.beExtremelyLenient:
            self.documents.append(te)

    def gotText(self, data):
        """
        Add a L{Text} node, skipping whitespace-only text unless space is
        being preserved.
        """
        if data.strip() or self.shouldPreserveSpace():
            self._gotStandalone(Text, data)

    def gotComment(self, data):
        """Add a L{Comment} node."""
        self._gotStandalone(Comment, data)

    def gotEntityReference(self, entityRef):
        """Add an L{EntityReference} node."""
        self._gotStandalone(EntityReference, entityRef)

    def gotCData(self, cdata):
        """Add a L{CDATASection} node."""
        self._gotStandalone(CDATASection, cdata)

    def gotTagEnd(self, name):
        """
        Close the innermost open element with the end tag C{name}.

        @raise MismatchedTags: in strict mode, when no element is open or
            C{name} does not match the innermost open element.
        """
        # print ' '*self.indentlevel, 'end tag',name
        # self.indentlevel -= 1
        if not self.elementstack:
            if self.beExtremelyLenient:
                return
            raise MismatchedTags(*((self.filename, "NOTHING", name)
                                   +self.saveMark()+(0,0)))
        el = self.elementstack.pop()
        pfxdix = self.nsstack[-1][2]
        # leave the namespace scope if this element opened it
        if self.nsstack[-1][1] is el:
            nstuple = self.nsstack.pop()
        else:
            nstuple = None
        if self.caseInsensitive:
            tn = el.tagName.lower()
            cname = name.lower()
        else:
            tn = el.tagName
            cname = name

        nsplit = name.split(':',1)
        if len(nsplit) == 2:
            pfx, newname = nsplit
            # NOTE(review): pfxdix is the third tuple member, which
            # gotTagStart builds with _reverseDict (namespace -> prefix),
            # yet it is queried with a prefix here -- confirm the intent.
            ns = pfxdix.get(pfx,None)
            if ns is not None:
                if el.namespace != ns:
                    if not self.beExtremelyLenient:
                        raise MismatchedTags(*((self.filename, el.tagName, name)
                                               +self.saveMark()+el._markpos))
        if not (tn == cname):
            if self.beExtremelyLenient:
                if self.elementstack:
                    lastEl = self.elementstack[0]
                    # look outwards for an open element this tag closes
                    for idx in xrange(len(self.elementstack)):
                        if self.elementstack[-(idx+1)].tagName == cname:
                            self.elementstack[-(idx+1)].endTag(name)
                            break
                    else:
                        # this was a garbage close tag; wait for a real one
                        self.elementstack.append(el)
                        if nstuple is not None:
                            self.nsstack.append(nstuple)
                        return
                    # drop the matched element and everything nested in it
                    del self.elementstack[-(idx+1):]
                    if not self.elementstack:
                        self.documents.append(lastEl)
                        return
            else:
                raise MismatchedTags(*((self.filename, el.tagName, name)
                                       +self.saveMark()+el._markpos))
        el.endTag(name)
        if not self.elementstack:
            # nothing left open: el is a finished top-level element
            self.documents.append(el)
        if self.beExtremelyLenient and el.tagName == "script":
            self._fixScriptElement(el)

    def connectionLost(self, reason):
        """
        Finish parsing.  Elements still open are kept as a document in
        lenient mode.

        @raise MismatchedTags: in strict mode, when an element is still
            open.
        """
        XMLParser.connectionLost(self, reason) # This can cause more events!
        if self.elementstack:
            if self.beExtremelyLenient:
                self.documents.append(self.elementstack[0])
            else:
                # The "expect" field receives the element object itself
                # here, not its tag name.
                raise MismatchedTags(*((self.filename, self.elementstack[-1],
                                        "END_OF_FILE")
                                       +self.saveMark()
                                       +self.elementstack[-1]._markpos))
784 | |
785 | |
def parse(readable, *args, **kwargs):
    """
    Parse HTML or XML readable.

    @param readable: an object with a C{read} method, or anything else,
        which is then passed to C{open} in binary mode as a file name.
    @param args: positional arguments for L{MicroDOMParser}.
    @param kwargs: keyword arguments for L{MicroDOMParser}.
    @return: a L{Document} holding the parsed tree and the doctype.
    @raise ParseError: if the input yields no top-level node.
    """
    # Only a file opened here is closed here; a caller-supplied object
    # stays under the caller's control.
    openedHere = not hasattr(readable, "read")
    if openedHere:
        readable = open(readable, "rb")
    try:
        mdp = MicroDOMParser(*args, **kwargs)
        mdp.filename = getattr(readable, "name", "<xmlfile />")
        mdp.makeConnection(None)
        if hasattr(readable, "getvalue"):
            # in-memory buffer: hand over everything at once
            mdp.dataReceived(readable.getvalue())
        else:
            r = readable.read(1024)
            while r:
                mdp.dataReceived(r)
                r = readable.read(1024)
    finally:
        if openedHere:
            readable.close()
    mdp.connectionLost(None)

    if not mdp.documents:
        raise ParseError(mdp.filename, 0, 0, "No top-level Nodes in document")

    if mdp.beExtremelyLenient:
        if len(mdp.documents) == 1:
            d = mdp.documents[0]
            if not isinstance(d, Element):
                # a lone non-element node gets an <html> wrapper
                el = Element("html")
                el.appendChild(d)
                d = el
        else:
            # several top-level nodes are gathered under one <html> element
            d = Element("html")
            for child in mdp.documents:
                d.appendChild(child)
    else:
        d = mdp.documents[0]
    doc = Document(d)
    doc.doctype = mdp._mddoctype
    return doc
821 | |
def parseString(st, *args, **kw):
    """
    Parse the string C{st} with L{parse}; extra arguments are passed on.

    A unicode string is encoded as UTF-16 before parsing.
    """
    if isinstance(st, UnicodeType):
        # this isn't particularly ideal, but it does work.
        st = st.encode('UTF-16')
    return parse(StringIO(st), *args, **kw)
827 | |
828 | |
def parseXML(readable):
    """
    Parse an XML readable object, keeping names case-sensitive and their
    case preserved.
    """
    document = parse(readable, caseInsensitive=0, preserveCase=1)
    return document
832 | |
833 | |
def parseXMLString(st):
    """
    Parse an XML string, keeping names case-sensitive and their case
    preserved.
    """
    document = parseString(st, caseInsensitive=0, preserveCase=1)
    return document
837 | |
838 | |
839 # Utility | |
840 | |
class lmx:
    """
    Easy creation of XML.

    An instance wraps one node.  Accessing any attribute whose name does not
    start with an underscore yields a callable that adds a child element of
    that name; item access reads and writes attributes of the wrapped node.
    """

    def __init__(self, node='div'):
        """
        @param node: the node to wrap, or a tag name from which a new
            L{Element} is created.
        """
        if isinstance(node, StringTypes):
            node = Element(node)
        self.node = node

    def __getattr__(self, name):
        if name[0] == '_':
            raise AttributeError("no private attrs")

        def addChild(**kw):
            return self.add(name, **kw)
        return addChild

    def __setitem__(self, key, val):
        self.node.setAttribute(key, val)

    def __getitem__(self, key):
        return self.node.getAttribute(key)

    def text(self, txt, raw=0):
        """
        Append a L{Text} node holding C{txt} to the wrapped node.

        @return: this wrapper, so that calls can be chained.
        """
        self.node.appendChild(Text(txt, raw=raw))
        return self

    def add(self, tagName, **kw):
        """
        Append a new child element called C{tagName} and set the keyword
        arguments as its attributes; one leading underscore is removed from
        each keyword name.

        @return: an L{lmx} wrapping the new element.
        """
        child = Element(tagName, caseInsensitive=0, preserveCase=0)
        self.node.appendChild(child)
        wrapper = lmx(child)
        for attrName, attrValue in kw.items():
            if attrName[0] == '_':
                attrName = attrName[1:]
            wrapper[attrName] = attrValue
        return wrapper
OLD | NEW |