OLD | NEW |
| (Empty) |
1 # -*- test-case-name: twisted.web.test.test_xml -*- | |
2 # | |
3 # Copyright (c) 2001-2004 Twisted Matrix Laboratories. | |
4 # See LICENSE for details. | |
5 | |
6 | |
7 """ | |
8 *S*mall, *U*ncomplicated *X*ML. | |
9 | |
10 This is a very simple implementation of XML/HTML as a network | |
11 protocol. It is not at all clever. Its main features are that it | |
12 does not: | |
13 | |
14 - support namespaces | |
15 - mung mnemonic entity references | |
16 - validate | |
17 - perform *any* external actions (such as fetching URLs or writing files) | |
18 under *any* circumstances | |

It does, however, have lots and lots of horrible hacks for supporting broken
HTML (as an option; they're not on by default).
21 """ | |
22 | |
23 from twisted.internet.protocol import Protocol, FileWrapper | |
24 from twisted.python.reflect import prefixedMethodNames | |
25 | |
26 | |
27 | |
28 # Elements of the three-tuples in the state table. | |
29 BEGIN_HANDLER = 0 | |
30 DO_HANDLER = 1 | |
31 END_HANDLER = 2 | |
32 | |
33 identChars = '.-_:' | |
34 lenientIdentChars = identChars + ';+#/%~' | |
35 | |
def nop(*args, **kw):
    """
    Accept any positional and keyword arguments, ignore them, and return
    C{None}.

    Used as the default filler for state handlers a parser state does not
    define (see C{zipfndict}).
    """
    return None
38 | |
39 | |
def unionlist(*args):
    """
    Return the union of several iterables as a list with no duplicates.

    Items appear in the order they are first encountered, scanning the
    arguments left to right, so the result is deterministic.  Items must be
    hashable.

    @param args: any number of iterables.
    @return: a new C{list} containing each distinct item exactly once.
    """
    seen = {}
    union = []
    for sequence in args:
        for item in sequence:
            if item not in seen:
                seen[item] = 1
                union.append(item)
    return union
46 | |
47 | |
def zipfndict(*args, **kw):
    """
    Zip several dictionaries together by key.

    Every key that occurs in at least one of the given dictionaries maps, in
    the result, to a tuple holding that key's value from each dictionary in
    argument order.  Where a dictionary lacks the key, the C{default} keyword
    argument is used instead (L{nop} when not given).

    @param args: the dictionaries to combine.
    @param kw: only C{default} is consulted.
    @return: a new C{dict} of key -> tuple, one tuple slot per dictionary.
    """
    fallback = kw.get('default', nop)
    allKeys = unionlist(*[table.keys() for table in args])
    zipped = {}
    for name in allKeys:
        row = []
        for table in args:
            row.append(table.get(name, fallback))
        zipped[name] = tuple(row)
    return zipped
54 | |
55 | |
def prefixedMethodClassDict(clazz, prefix):
    """
    Map each name reported by C{prefixedMethodNames(clazz, prefix)} to the
    attribute C{prefix + name} looked up on the class itself.

    @param clazz: the class to inspect.
    @param prefix: the method-name prefix, e.g. C{'do_'}.
    @return: a C{dict} of name -> class attribute.
    """
    methods = {}
    for name in prefixedMethodNames(clazz, prefix):
        methods[name] = getattr(clazz, prefix + name)
    return methods
58 | |
59 | |
def prefixedMethodObjDict(obj, prefix):
    """
    Map each name reported by C{prefixedMethodNames(obj.__class__, prefix)}
    to the attribute C{prefix + name} looked up on the instance, so the values
    are bound to C{obj}.

    @param obj: the instance whose methods are collected.
    @param prefix: the method-name prefix, e.g. C{'do_'}.
    @return: a C{dict} of name -> attribute of C{obj}.
    """
    methods = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        methods[name] = getattr(obj, prefix + name)
    return methods
62 | |
63 | |
class ParseError(Exception):
    """
    Raised when the parser meets input it cannot accept.

    @ivar filename: name of the document being parsed.
    @ivar line: line number at which the problem was detected.
    @ivar col: column number at which the problem was detected.
    @ivar message: human-readable description of the problem.
    """

    def __init__(self, filename, line, col, message):
        # Initialise the base class as well so that ``args`` is populated;
        # without it repr() is empty and the exception cannot be re-created
        # from its args (copy / pickle).
        Exception.__init__(self, filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self):
        return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
                                 self.message)
75 | |
class XMLParser(Protocol):
    """
    A character-at-a-time XML/HTML tokenizer driven by a state machine.

    Each state C{foo} is implemented by up to three methods:

      - C{begin_foo(byte)}: called when the state is entered, with the byte
        that caused the transition;
      - C{do_foo(byte)}: called for every following byte; it returns the name
        of the next state, or C{None} to stay in the current one;
      - C{end_foo()}: called when the state is left.

    Handlers that a state does not define default to L{nop}.  Parse events are
    reported through the C{got*} methods at the bottom of the class.

    @ivar state: name of the current state, or C{None} until data arrives.
    @ivar encodings: list of encodings applied to incoming data; created in
        L{connectionMade}.
    @ivar filename: name reported in L{ParseError}s.
    @ivar beExtremelyLenient: when true, many kinds of broken markup are
        tolerated instead of raising L{ParseError}.
    """

    state = None
    encodings = None
    filename = "<xml />"
    beExtremelyLenient = 0
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    # Per-instance cache filled in by _buildStateTable.
    _stateTable = None

    def connectionMade(self):
        """
        Reset the position counters and the list of detected encodings.
        """
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        '''Get the line number and column of the last character parsed'''
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """
        Raise a L{ParseError} for the current file name and position.
        """
        raise ParseError(*((self.filename,)+self.saveMark()+(message,)))

    def _buildStateTable(self):
        '''Return a dictionary of begin, do, end state function tuples'''
        # The table holds methods bound to this instance, so it is cached on
        # the instance.  (The previous class-level cache was looked up under
        # the unmangled name '__stateTable' but stored under the mangled one,
        # so it never hit, and the table was rebuilt on every call while the
        # class kept the last instance's bound methods alive.)
        stateTable = self._stateTable
        if stateTable is None:
            stateTable = self._stateTable = zipfndict(
                *[prefixedMethodObjDict(self, prefix)
                  for prefix in ('begin_', 'do_', 'end_')])
        return stateTable

    def _decode(self, data):
        """
        Decode a received chunk with every encoding in C{self.encodings}.

        C{self._prepend} (the byte-order mark stripped in L{dataReceived}) is
        put back in front of the chunk first, presumably so the codec can
        work out the byte order.

        @raise AssertionError: if a UTF-16/UCS-2 chunk has an odd length.
        """
        if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
            assert not len(data) & 1, 'UTF-16 must come in pairs for now'
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = unicode(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Pick the state that follows a completed tag in lenient mode.

        @return: C{'waitforendscript'} after an opening C{<script>} tag that
            has no C{src} attribute, C{'bodydata'} otherwise.
        """
        if self.endtag:
            return 'bodydata'

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == 'script' and 'src' not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return 'waitforendscript'
        return 'bodydata'

    def dataReceived(self, data):
        """
        Feed a chunk of the document through the state machine.

        On the first chunk a UTF-16 byte-order mark, if present, is stripped
        and remembered.  Line and column are tracked in local variables for
        the duration of the call; C{self.saveMark} is temporarily replaced so
        that errors raised by the handlers report the up-to-date position.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith('\xff\xfe'):
                self._prepend = '\xff\xfe'
                self.encodings.append('UTF-16')
                data = data[2:]
            elif data.startswith('\xfe\xff'):
                self._prepend = '\xfe\xff'
                self.encodings.append('UTF-16')
                data = data[2:]
            self.state = 'begin'
        if self.encodings:
            data = self._decode(data)
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function
        _saveMark = self.saveMark
        def saveMark():
            return (lineno, colno)
        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == '\n':
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.

        Nothing happens if the connection is lost before any data arrived,
        because no state has been entered yet.
        """
        if not self.state:
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        """
        Skip leading whitespace and expect the document to start with '<'.
        """
        if byte.isspace():
            return
        if byte != '<':
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return 'bodydata'
            self._parseError("First char of document [%r] wasn't <" % (byte,))
        return 'tagstart'

    def begin_comment(self, byte):
        self.commentbuf = ''

    def do_comment(self, byte):
        """
        Accumulate comment text until the closing '-->' is seen.
        """
        self.commentbuf += byte
        if self.commentbuf.endswith('-->'):
            self.gotComment(self.commentbuf[:-3])
            return 'bodydata'

    def begin_tagstart(self, byte):
        self.tagName = ''               # name of the tag
        self.tagAttributes = {}         # attributes of the tag
        self.termtag = 0                # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """
        Collect the tag name that follows '<' and decide what kind of tag
        this is (start tag, end tag, comment, CDATA, declaration).
        """
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == '!--':
                return 'comment'
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return 'waitforgt'
                return 'attrs'
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == '>':
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            else:
                self.gotTagStart(self.tagName, {})
                if self.beExtremelyLenient:
                    return self.maybeBodyData()
                return 'bodydata'
        elif byte == '/':
            if self.tagName:
                return 'afterslash'
            else:
                self.endtag = 1
        elif byte in '!?':
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == '[':
            if self.tagName == '!':
                return 'expectcdata'
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                # treat the stray '<' as literal text
                self.bodydata = '<'
                return 'unentity'
            self._parseError('Invalid tag character: %r'% byte)

    def begin_unentity(self, byte):
        self.bodydata += byte

    def do_unentity(self, byte):
        self.bodydata += byte
        return 'bodydata'

    def end_unentity(self):
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """
        Match the '[CDATA[' header one byte at a time.
        """
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = '[CDATA['
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                ## WHAT THE CRAP!?  MSWord9 generates HTML that includes these
                ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
                ## 'em as best I can.  this should really be a separate parse
                ## state but I don't even have any idea what these _are_.
                return 'waitforgt'
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ''
            return 'cdata'
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """
        Accumulate CDATA content until the closing ']]>' is seen.
        """
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return 'bodydata'

    def end_cdata(self):
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ''

    def do_attrs(self, byte):
        """
        Between attributes: dispatch on the first byte of whatever follows.
        """
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == '!DOCTYPE':
                return 'doctype'
            if self.tagName[0] in '!?':
                return 'waitforgt'
            return 'attrname'
        elif byte.isspace():
            return
        elif byte == '>':
            self.gotTagStart(self.tagName, self.tagAttributes)
            if self.beExtremelyLenient:
                return self.maybeBodyData()
            return 'bodydata'
        elif byte == '/':
            return 'afterslash'
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        self.doctype = byte

    def do_doctype(self, byte):
        if byte == '>':
            return 'bodydata'
        self.doctype += byte

    def end_doctype(self):
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """
        Ignore everything up to and including the next '>'.
        """
        if byte == '>':
            if self.endtag or not self.beExtremelyLenient:
                return 'bodydata'
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """
        Accumulate an attribute name.
        """
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return 'beforeeq'
        elif self.beExtremelyLenient:
            if byte in '"\'':
                return 'attrval'
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            # NOTE(review): '/' is part of lenientIdentChars, so it is
            # consumed by the branch above and this branch looks unreachable.
            # Left as-is; confirm the intended behaviour before reordering.
            if byte == '/':
                self._attrname_termtag = 1
                return
            if byte == '>':
                # a bare attribute with no value
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))

    def do_beforeattrval(self, byte):
        """
        After '=': expect an opening quote (or, leniently, an unquoted value).
        """
        if byte in '"\'':
            return 'attrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return 'messyattr'
            if byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == '\\':
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)

    attrname = ''
    attrval = ''

    def begin_beforeeq(self,byte):
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """
        After an attribute name and whitespace: expect '='.
        """
        if byte == '=':
            return 'beforeattrval'
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                # previous attribute had no value; a new one starts here
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                return 'attrname'
            elif byte == '>':
                self.attrval = 'True'
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return 'bodydata'
                return self.maybeBodyData()
            elif byte == '/':
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        self.quotetype = byte
        self.attrval = ''

    def do_attrval(self, byte):
        """
        Accumulate a quoted attribute value until the matching quote.
        """
        if byte == self.quotetype:
            return 'attrs'
        self.attrval += byte

    def end_attrval(self):
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ''

    def begin_messyattr(self, byte):
        self.attrval = byte

    def do_messyattr(self, byte):
        """
        Accumulate an unquoted attribute value (lenient mode only).
        """
        if byte.isspace():
            return 'attrs'
        elif byte == '>':
            endTag = 0
            if self.attrval.endswith('/'):
                # the trailing slash closes the tag rather than being data
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return 'bodydata'
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")#XXX When does this happen??
        if byte != '>':
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return 'bodydata'

    def begin_bodydata(self, byte):
        """
        Start a fresh text buffer, seeded with any byte read ahead of time.
        """
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # drop the instance attribute so the class default (None) shows
            del self._leadingBodyData
        else:
            self.bodydata = ''

    def do_bodydata(self, byte):
        if byte == '<':
            return 'tagstart'
        if byte == '&':
            return 'entityref'
        self.bodydata += byte

    def end_bodydata(self):
        self.gotText(self.bodydata)
        self.bodydata = ''

    def do_waitforendscript(self, byte):
        if byte == '<':
            return 'waitscriptendtag'
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        self.temptagdata = ''
        self.tagName = ''
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # NOTE(review): because the "not self.endtag" test comes before the
        # whitespace test, whitespace *before* the '/' falls into case 1 and
        # is treated as script text, unlike the example in 3.  Behaviour left
        # unchanged.

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == '/':
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not 'script'.startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return 'waitforendscript'
            elif self.tagName == 'script':
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return 'waitforgt'
        # 3
        elif byte.isspace():
            return 'waitscriptendtag'
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return 'waitforendscript'

    def begin_entityref(self, byte):
        self.erefbuf = ''
        self.erefextra = '' # extra bit for lenient mode

    def do_entityref(self, byte):
        """
        Accumulate an entity reference name until the terminating ';'.
        """
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return 'spacebodydata'
            self._parseError("Bad entity reference")
        elif byte != ';':
            self.erefbuf += byte
        else:
            return 'bodydata'

    def end_entityref(self):
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        self.bodydata = self.erefextra
        self.erefextra = None
    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    # The default handlers below print with a single pre-formatted argument,
    # which produces the same output whether ``print`` is a statement or a
    # function.

    def gotTagStart(self, name, attributes):
        '''Encountered an opening tag.

        Default behaviour is to print.'''
        print('begin %s %s' % (name, attributes))

    def gotText(self, data):
        '''Encountered text

        Default behaviour is to print.'''
        print('text: %s' % (repr(data),))

    def gotEntityReference(self, entityRef):
        '''Encountered mnemonic entity reference

        Default behaviour is to print.'''
        print('entityRef: &%s;' % entityRef)

    def gotComment(self, comment):
        '''Encountered comment.

        Default behaviour is to ignore.'''
        pass

    def gotCData(self, cdata):
        '''Encountered CDATA

        Default behaviour is to call the gotText method'''
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print('!DOCTYPE %s' % (repr(doctype),))

    def gotTagEnd(self, name):
        '''Encountered closing tag

        Default behaviour is to print.'''
        print('end %s' % (name,))
638 | |
# Manual smoke test: run this module directly to push a small sample document
# through a plain XMLParser, whose default got* handlers print each event.
if __name__ == '__main__':
    from cStringIO import StringIO
    # Sample input exercising a DOCTYPE, a processing instruction, nested and
    # self-closing tags, an attribute, an entity reference and a CDATA section.
    testDocument = '''

<!DOCTYPE ignore all this shit, hah its malformed!!!!@$>
<?xml version="suck it"?>
<foo>
A
<bar />
<baz boz="buz">boz &zop;</baz>
<![CDATA[ foo bar baz ]]>
</foo>
'''
    x = XMLParser()
    # Connect the parser to an in-memory file standing in for a transport;
    # makeConnection is what triggers connectionMade.
    x.makeConnection(FileWrapper(StringIO()))
    # fn = "/home/glyph/Projects/Twisted/doc/howto/ipc10paper.html"
    # NOTE(review): fn is only used by the commented-out line below.
    fn = "/home/glyph/gruesome.xml"
    # testDocument = open(fn).read()
    x.dataReceived(testDocument)
OLD | NEW |