Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(89)

Side by Side Diff: third_party/twisted_8_1/twisted/web/sux.py

Issue 12261012: Remove third_party/twisted_8_1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/build
Patch Set: Created 7 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 # -*- test-case-name: twisted.web.test.test_xml -*-
2 #
3 # Copyright (c) 2001-2004 Twisted Matrix Laboratories.
4 # See LICENSE for details.
5
6
7 """
8 *S*mall, *U*ncomplicated *X*ML.
9
10 This is a very simple implementation of XML/HTML as a network
11 protocol. It is not at all clever. Its main features are that it
12 does not:
13
14 - support namespaces
15 - mung mnemonic entity references
16 - validate
17 - perform *any* external actions (such as fetching URLs or writing files)
18 under *any* circumstances
19 It does, however, have lots and lots of horrible hacks for supporting
20 broken HTML (as an option; they're not on by default).
21 """
22
23 from twisted.internet.protocol import Protocol, FileWrapper
24 from twisted.python.reflect import prefixedMethodNames
25
26
27
28 # Elements of the three-tuples in the state table.
29 BEGIN_HANDLER = 0
30 DO_HANDLER = 1
31 END_HANDLER = 2
32
33 identChars = '.-_:'
34 lenientIdentChars = identChars + ';+#/%~'
35
def nop(*args, **kw):
    """
    Accept any positional and keyword arguments, ignore them, and return
    C{None}.
    """
    return None
38
39
def unionlist(*args):
    """
    Return a list of the distinct items found in the given iterables.

    Items are reported in the order in which they are first seen, so the
    result is deterministic.  Items must be hashable.

    @param args: any number of iterables of hashable items.
    @return: a new C{list} holding each distinct item exactly once.
    """
    # A dict is used as the membership tracker (rather than a set) to stay
    # compatible with the old Python versions this module supports.
    seen = {}
    result = []
    for seq in args:
        for item in seq:
            if item not in seen:
                seen[item] = 1
                result.append(item)
    return result
46
47
def zipfndict(*args, **kw):
    """
    Merge several dictionaries into one dictionary of tuples.

    Every key that appears in any of the given dictionaries becomes a key
    of the result.  Its value is a tuple with one entry per input
    dictionary, in argument order: that dictionary's value for the key, or
    the C{default} keyword argument (L{nop} if not given) when the
    dictionary has no such key.
    """
    fallback = kw.get('default', nop)
    allKeys = unionlist(*[fndict.keys() for fndict in args])
    merged = {}
    for key in allKeys:
        entries = []
        for fndict in args:
            entries.append(fndict.get(key, fallback))
        merged[key] = tuple(entries)
    return merged
54
55
def prefixedMethodClassDict(clazz, prefix):
    """
    Map un-prefixed method names to the attributes found on a class.

    @param clazz: the class to inspect.
    @param prefix: the method-name prefix to look for (e.g. C{'do_'}).
    @return: a C{dict} whose keys are the names reported by
        L{prefixedMethodNames} and whose values are
        C{getattr(clazz, prefix + name)}.
    """
    return dict([(name, getattr(clazz, prefix + name))
                 for name in prefixedMethodNames(clazz, prefix)])
58
59
def prefixedMethodObjDict(obj, prefix):
    """
    Map un-prefixed method names to the attributes found on an instance.

    The names are discovered on C{obj.__class__}, but the values are
    looked up on C{obj} itself, so methods come back bound to C{obj}.

    @param obj: the instance to inspect.
    @param prefix: the method-name prefix to look for (e.g. C{'do_'}).
    @return: a C{dict} mapping each name reported by
        L{prefixedMethodNames} to C{getattr(obj, prefix + name)}.
    """
    return dict([(name, getattr(obj, prefix + name))
                 for name in prefixedMethodNames(obj.__class__, prefix)])
62
63
class ParseError(Exception):
    """
    Raised when the parser meets input it cannot accept.

    @ivar filename: name of the document being parsed.
    @ivar line: line number at which the problem was detected.
    @ivar col: column number at which the problem was detected.
    @ivar message: human-readable description of the problem.
    """

    def __init__(self, filename, line, col, message):
        # Hand the fields to Exception as well so that ``args`` is
        # populated; otherwise the exception has an empty ``args`` and
        # cannot be re-created from it (e.g. when unpickled).
        Exception.__init__(self, filename, line, col, message)
        self.filename = filename
        self.line = line
        self.col = col
        self.message = message

    def __str__(self):
        return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
                                 self.message)
75
76 class XMLParser(Protocol):
77
78 state = None
79 encodings = None
80 filename = "<xml />"
81 beExtremelyLenient = 0
82 _prepend = None
83
84 # _leadingBodyData will sometimes be set before switching to the
85 # 'bodydata' state, when we "accidentally" read a byte of bodydata
86 # in a different state.
87 _leadingBodyData = None
88
89 def connectionMade(self):
90 self.lineno = 1
91 self.colno = 0
92 self.encodings = []
93
94 def saveMark(self):
95 '''Get the line number and column of the last character parsed'''
96 # This gets replaced during dataReceived, restored afterwards
97 return (self.lineno, self.colno)
98
99 def _parseError(self, message):
100 raise ParseError(*((self.filename,)+self.saveMark()+(message,)))
101
102 def _buildStateTable(self):
103 '''Return a dictionary of begin, do, end state function tuples'''
104 # _buildStateTable leaves something to be desired but it does what it
105 # does.. probably slowly, so I'm doing some evil caching so it doesn't
106 # get called more than once per class.
107 stateTable = getattr(self.__class__, '__stateTable', None)
108 if stateTable is None:
109 stateTable = self.__class__.__stateTable = zipfndict(
110 *[prefixedMethodObjDict(self, prefix)
111 for prefix in ('begin_', 'do_', 'end_')])
112 return stateTable
113
114 def _decode(self, data):
115 if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
116 assert not len(data) & 1, 'UTF-16 must come in pairs for now'
117 if self._prepend:
118 data = self._prepend + data
119 for encoding in self.encodings:
120 data = unicode(data, encoding)
121 return data
122
123 def maybeBodyData(self):
124 if self.endtag:
125 return 'bodydata'
126
127 # Get ready for fun! We're going to allow
128 # <script>if (foo < bar)</script> to work!
129 # We do this by making everything between <script> and
130 # </script> a Text
131 # BUT <script src="foo"> will be special-cased to do regular,
132 # lenient behavior, because those may not have </script>
133 # -radix
134
135 if (self.tagName == 'script'
136 and not self.tagAttributes.has_key('src')):
137 # we do this ourselves rather than having begin_waitforendscript
138 # becuase that can get called multiple times and we don't want
139 # bodydata to get reset other than the first time.
140 self.begin_bodydata(None)
141 return 'waitforendscript'
142 return 'bodydata'
143
144
145
146 def dataReceived(self, data):
147 stateTable = self._buildStateTable()
148 if not self.state:
149 # all UTF-16 starts with this string
150 if data.startswith('\xff\xfe'):
151 self._prepend = '\xff\xfe'
152 self.encodings.append('UTF-16')
153 data = data[2:]
154 elif data.startswith('\xfe\xff'):
155 self._prepend = '\xfe\xff'
156 self.encodings.append('UTF-16')
157 data = data[2:]
158 self.state = 'begin'
159 if self.encodings:
160 data = self._decode(data)
161 # bring state, lineno, colno into local scope
162 lineno, colno = self.lineno, self.colno
163 curState = self.state
164 # replace saveMark with a nested scope function
165 _saveMark = self.saveMark
166 def saveMark():
167 return (lineno, colno)
168 self.saveMark = saveMark
169 # fetch functions from the stateTable
170 beginFn, doFn, endFn = stateTable[curState]
171 try:
172 for byte in data:
173 # do newline stuff
174 if byte == '\n':
175 lineno += 1
176 colno = 0
177 else:
178 colno += 1
179 newState = doFn(byte)
180 if newState is not None and newState != curState:
181 # this is the endFn from the previous state
182 endFn()
183 curState = newState
184 beginFn, doFn, endFn = stateTable[curState]
185 beginFn(byte)
186 finally:
187 self.saveMark = _saveMark
188 self.lineno, self.colno = lineno, colno
189 # state doesn't make sense if there's an exception..
190 self.state = curState
191
192
193 def connectionLost(self, reason):
194 """
195 End the last state we were in.
196 """
197 stateTable = self._buildStateTable()
198 stateTable[self.state][END_HANDLER]()
199
200
201 # state methods
202
203 def do_begin(self, byte):
204 if byte.isspace():
205 return
206 if byte != '<':
207 if self.beExtremelyLenient:
208 self._leadingBodyData = byte
209 return 'bodydata'
210 self._parseError("First char of document [%r] wasn't <" % (byte,))
211 return 'tagstart'
212
213 def begin_comment(self, byte):
214 self.commentbuf = ''
215
216 def do_comment(self, byte):
217 self.commentbuf += byte
218 if self.commentbuf.endswith('-->'):
219 self.gotComment(self.commentbuf[:-3])
220 return 'bodydata'
221
222 def begin_tagstart(self, byte):
223 self.tagName = '' # name of the tag
224 self.tagAttributes = {} # attributes of the tag
225 self.termtag = 0 # is the tag self-terminating
226 self.endtag = 0
227
228 def do_tagstart(self, byte):
229 if byte.isalnum() or byte in identChars:
230 self.tagName += byte
231 if self.tagName == '!--':
232 return 'comment'
233 elif byte.isspace():
234 if self.tagName:
235 if self.endtag:
236 # properly strict thing to do here is probably to only
237 # accept whitespace
238 return 'waitforgt'
239 return 'attrs'
240 else:
241 self._parseError("Whitespace before tag-name")
242 elif byte == '>':
243 if self.endtag:
244 self.gotTagEnd(self.tagName)
245 return 'bodydata'
246 else:
247 self.gotTagStart(self.tagName, {})
248 return (not self.beExtremelyLenient) and 'bodydata' or self.mayb eBodyData()
249 elif byte == '/':
250 if self.tagName:
251 return 'afterslash'
252 else:
253 self.endtag = 1
254 elif byte in '!?':
255 if self.tagName:
256 if not self.beExtremelyLenient:
257 self._parseError("Invalid character in tag-name")
258 else:
259 self.tagName += byte
260 self.termtag = 1
261 elif byte == '[':
262 if self.tagName == '!':
263 return 'expectcdata'
264 else:
265 self._parseError("Invalid '[' in tag-name")
266 else:
267 if self.beExtremelyLenient:
268 self.bodydata = '<'
269 return 'unentity'
270 self._parseError('Invalid tag character: %r'% byte)
271
272 def begin_unentity(self, byte):
273 self.bodydata += byte
274
275 def do_unentity(self, byte):
276 self.bodydata += byte
277 return 'bodydata'
278
279 def end_unentity(self):
280 self.gotText(self.bodydata)
281
282 def begin_expectcdata(self, byte):
283 self.cdatabuf = byte
284
285 def do_expectcdata(self, byte):
286 self.cdatabuf += byte
287 cdb = self.cdatabuf
288 cd = '[CDATA['
289 if len(cd) > len(cdb):
290 if cd.startswith(cdb):
291 return
292 elif self.beExtremelyLenient:
293 ## WHAT THE CRAP!? MSWord9 generates HTML that includes these
294 ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
295 ## 'em as best I can. this should really be a separate parse
296 ## state but I don't even have any idea what these _are_.
297 return 'waitforgt'
298 else:
299 self._parseError("Mal-formed CDATA header")
300 if cd == cdb:
301 self.cdatabuf = ''
302 return 'cdata'
303 self._parseError("Mal-formed CDATA header")
304
305 def do_cdata(self, byte):
306 self.cdatabuf += byte
307 if self.cdatabuf.endswith("]]>"):
308 self.cdatabuf = self.cdatabuf[:-3]
309 return 'bodydata'
310
311 def end_cdata(self):
312 self.gotCData(self.cdatabuf)
313 self.cdatabuf = ''
314
315 def do_attrs(self, byte):
316 if byte.isalnum() or byte in identChars:
317 # XXX FIXME really handle !DOCTYPE at some point
318 if self.tagName == '!DOCTYPE':
319 return 'doctype'
320 if self.tagName[0] in '!?':
321 return 'waitforgt'
322 return 'attrname'
323 elif byte.isspace():
324 return
325 elif byte == '>':
326 self.gotTagStart(self.tagName, self.tagAttributes)
327 return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBod yData()
328 elif byte == '/':
329 return 'afterslash'
330 elif self.beExtremelyLenient:
331 # discard and move on? Only case I've seen of this so far was:
332 # <foo bar="baz"">
333 return
334 self._parseError("Unexpected character: %r" % byte)
335
336 def begin_doctype(self, byte):
337 self.doctype = byte
338
339 def do_doctype(self, byte):
340 if byte == '>':
341 return 'bodydata'
342 self.doctype += byte
343
344 def end_doctype(self):
345 self.gotDoctype(self.doctype)
346 self.doctype = None
347
348 def do_waitforgt(self, byte):
349 if byte == '>':
350 if self.endtag or not self.beExtremelyLenient:
351 return 'bodydata'
352 return self.maybeBodyData()
353
354 def begin_attrname(self, byte):
355 self.attrname = byte
356 self._attrname_termtag = 0
357
358 def do_attrname(self, byte):
359 if byte.isalnum() or byte in identChars:
360 self.attrname += byte
361 return
362 elif byte == '=':
363 return 'beforeattrval'
364 elif byte.isspace():
365 return 'beforeeq'
366 elif self.beExtremelyLenient:
367 if byte in '"\'':
368 return 'attrval'
369 if byte in lenientIdentChars or byte.isalnum():
370 self.attrname += byte
371 return
372 if byte == '/':
373 self._attrname_termtag = 1
374 return
375 if byte == '>':
376 self.attrval = 'True'
377 self.tagAttributes[self.attrname] = self.attrval
378 self.gotTagStart(self.tagName, self.tagAttributes)
379 if self._attrname_termtag:
380 self.gotTagEnd(self.tagName)
381 return 'bodydata'
382 return self.maybeBodyData()
383 # something is really broken. let's leave this attribute where it
384 # is and move on to the next thing
385 return
386 self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte) )
387
388 def do_beforeattrval(self, byte):
389 if byte in '"\'':
390 return 'attrval'
391 elif byte.isspace():
392 return
393 elif self.beExtremelyLenient:
394 if byte in lenientIdentChars or byte.isalnum():
395 return 'messyattr'
396 if byte == '>':
397 self.attrval = 'True'
398 self.tagAttributes[self.attrname] = self.attrval
399 self.gotTagStart(self.tagName, self.tagAttributes)
400 return self.maybeBodyData()
401 if byte == '\\':
402 # I saw this in actual HTML once:
403 # <font size=\"3\"><sup>SM</sup></font>
404 return
405 self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)
406
407 attrname = ''
408 attrval = ''
409
410 def begin_beforeeq(self,byte):
411 self._beforeeq_termtag = 0
412
413 def do_beforeeq(self, byte):
414 if byte == '=':
415 return 'beforeattrval'
416 elif byte.isspace():
417 return
418 elif self.beExtremelyLenient:
419 if byte.isalnum() or byte in identChars:
420 self.attrval = 'True'
421 self.tagAttributes[self.attrname] = self.attrval
422 return 'attrname'
423 elif byte == '>':
424 self.attrval = 'True'
425 self.tagAttributes[self.attrname] = self.attrval
426 self.gotTagStart(self.tagName, self.tagAttributes)
427 if self._beforeeq_termtag:
428 self.gotTagEnd(self.tagName)
429 return 'bodydata'
430 return self.maybeBodyData()
431 elif byte == '/':
432 self._beforeeq_termtag = 1
433 return
434 self._parseError("Invalid attribute")
435
436 def begin_attrval(self, byte):
437 self.quotetype = byte
438 self.attrval = ''
439
440 def do_attrval(self, byte):
441 if byte == self.quotetype:
442 return 'attrs'
443 self.attrval += byte
444
445 def end_attrval(self):
446 self.tagAttributes[self.attrname] = self.attrval
447 self.attrname = self.attrval = ''
448
449 def begin_messyattr(self, byte):
450 self.attrval = byte
451
452 def do_messyattr(self, byte):
453 if byte.isspace():
454 return 'attrs'
455 elif byte == '>':
456 endTag = 0
457 if self.attrval.endswith('/'):
458 endTag = 1
459 self.attrval = self.attrval[:-1]
460 self.tagAttributes[self.attrname] = self.attrval
461 self.gotTagStart(self.tagName, self.tagAttributes)
462 if endTag:
463 self.gotTagEnd(self.tagName)
464 return 'bodydata'
465 return self.maybeBodyData()
466 else:
467 self.attrval += byte
468
469 def end_messyattr(self):
470 if self.attrval:
471 self.tagAttributes[self.attrname] = self.attrval
472
473 def begin_afterslash(self, byte):
474 self._after_slash_closed = 0
475
476 def do_afterslash(self, byte):
477 # this state is only after a self-terminating slash, e.g. <foo/>
478 if self._after_slash_closed:
479 self._parseError("Mal-formed")#XXX When does this happen??
480 if byte != '>':
481 if self.beExtremelyLenient:
482 return
483 else:
484 self._parseError("No data allowed after '/'")
485 self._after_slash_closed = 1
486 self.gotTagStart(self.tagName, self.tagAttributes)
487 self.gotTagEnd(self.tagName)
488 # don't need maybeBodyData here because there better not be
489 # any javascript code after a <script/>... we'll see :(
490 return 'bodydata'
491
492 def begin_bodydata(self, byte):
493 if self._leadingBodyData:
494 self.bodydata = self._leadingBodyData
495 del self._leadingBodyData
496 else:
497 self.bodydata = ''
498
499 def do_bodydata(self, byte):
500 if byte == '<':
501 return 'tagstart'
502 if byte == '&':
503 return 'entityref'
504 self.bodydata += byte
505
506 def end_bodydata(self):
507 self.gotText(self.bodydata)
508 self.bodydata = ''
509
510 def do_waitforendscript(self, byte):
511 if byte == '<':
512 return 'waitscriptendtag'
513 self.bodydata += byte
514
515 def begin_waitscriptendtag(self, byte):
516 self.temptagdata = ''
517 self.tagName = ''
518 self.endtag = 0
519
520 def do_waitscriptendtag(self, byte):
521 # 1 enforce / as first byte read
522 # 2 enforce following bytes to be subset of "script" until
523 # tagName == "script"
524 # 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagN ame)
525 # 3 spaces can happen anywhere, they're ignored
526 # e.g. < / script >
527 # 4 anything else causes all data I've read to be moved to the
528 # bodydata, and switch back to waitforendscript state
529
530 # If it turns out this _isn't_ a </script>, we need to
531 # remember all the data we've been through so we can append it
532 # to bodydata
533 self.temptagdata += byte
534
535 # 1
536 if byte == '/':
537 self.endtag = True
538 elif not self.endtag:
539 self.bodydata += "<" + self.temptagdata
540 return 'waitforendscript'
541 # 2
542 elif byte.isalnum() or byte in identChars:
543 self.tagName += byte
544 if not 'script'.startswith(self.tagName):
545 self.bodydata += "<" + self.temptagdata
546 return 'waitforendscript'
547 elif self.tagName == 'script':
548 self.gotText(self.bodydata)
549 self.gotTagEnd(self.tagName)
550 return 'waitforgt'
551 # 3
552 elif byte.isspace():
553 return 'waitscriptendtag'
554 # 4
555 else:
556 self.bodydata += "<" + self.temptagdata
557 return 'waitforendscript'
558
559
560 def begin_entityref(self, byte):
561 self.erefbuf = ''
562 self.erefextra = '' # extra bit for lenient mode
563
564 def do_entityref(self, byte):
565 if byte.isspace() or byte == "<":
566 if self.beExtremelyLenient:
567 # '&foo' probably was '&amp;foo'
568 if self.erefbuf and self.erefbuf != "amp":
569 self.erefextra = self.erefbuf
570 self.erefbuf = "amp"
571 if byte == "<":
572 return "tagstart"
573 else:
574 self.erefextra += byte
575 return 'spacebodydata'
576 self._parseError("Bad entity reference")
577 elif byte != ';':
578 self.erefbuf += byte
579 else:
580 return 'bodydata'
581
582 def end_entityref(self):
583 self.gotEntityReference(self.erefbuf)
584
585 # hacky support for space after & in entityref in beExtremelyLenient
586 # state should only happen in that case
587 def begin_spacebodydata(self, byte):
588 self.bodydata = self.erefextra
589 self.erefextra = None
590 do_spacebodydata = do_bodydata
591 end_spacebodydata = end_bodydata
592
593 # Sorta SAX-ish API
594
595 def gotTagStart(self, name, attributes):
596 '''Encountered an opening tag.
597
598 Default behaviour is to print.'''
599 print 'begin', name, attributes
600
601 def gotText(self, data):
602 '''Encountered text
603
604 Default behaviour is to print.'''
605 print 'text:', repr(data)
606
607 def gotEntityReference(self, entityRef):
608 '''Encountered mnemonic entity reference
609
610 Default behaviour is to print.'''
611 print 'entityRef: &%s;' % entityRef
612
613 def gotComment(self, comment):
614 '''Encountered comment.
615
616 Default behaviour is to ignore.'''
617 pass
618
619 def gotCData(self, cdata):
620 '''Encountered CDATA
621
622 Default behaviour is to call the gotText method'''
623 self.gotText(cdata)
624
625 def gotDoctype(self, doctype):
626 """Encountered DOCTYPE
627
628 This is really grotty: it basically just gives you everything between
629 '<!DOCTYPE' and '>' as an argument.
630 """
631 print '!DOCTYPE', repr(doctype)
632
633 def gotTagEnd(self, name):
634 '''Encountered closing tag
635
636 Default behaviour is to print.'''
637 print 'end', name
638
if __name__ == '__main__':
    # Small smoke test: push a sample document through the default parser,
    # whose got* callbacks simply print what they are given.
    from cStringIO import StringIO
    testDocument = '''

<!DOCTYPE ignore all this shit, hah its malformed!!!!@$>
<?xml version="suck it"?>
<foo>
&#65;
<bar />
<baz boz="buz">boz &zop;</baz>
<![CDATA[ foo bar baz ]]>
</foo>
'''
    x = XMLParser()
    x.makeConnection(FileWrapper(StringIO()))
    x.dataReceived(testDocument)
OLDNEW
« no previous file with comments | « third_party/twisted_8_1/twisted/web/static.py ('k') | third_party/twisted_8_1/twisted/web/tap.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698