Index: third_party/twisted_8_1/twisted/web/sux.py |
diff --git a/third_party/twisted_8_1/twisted/web/sux.py b/third_party/twisted_8_1/twisted/web/sux.py |
deleted file mode 100644 |
index 6f8fea1dc3b1f80e6afa31b46694810e8b279ecc..0000000000000000000000000000000000000000 |
--- a/third_party/twisted_8_1/twisted/web/sux.py |
+++ /dev/null |
@@ -1,657 +0,0 @@ |
-# -*- test-case-name: twisted.web.test.test_xml -*- |
-# |
-# Copyright (c) 2001-2004 Twisted Matrix Laboratories. |
-# See LICENSE for details. |
- |
- |
-""" |
-*S*mall, *U*ncomplicated *X*ML. |
- |
-This is a very simple implementation of XML/HTML as a network |
-protocol. It is not at all clever. Its main features are that it |
-does not: |
- |
- - support namespaces |
- - mung mnemonic entity references |
- - validate |
- - perform *any* external actions (such as fetching URLs or writing files) |
- under *any* circumstances |
- - has lots and lots of horrible hacks for supporting broken HTML (as an |
- option, they're not on by default). |
-""" |
- |
-from twisted.internet.protocol import Protocol, FileWrapper |
-from twisted.python.reflect import prefixedMethodNames |
- |
- |
- |
-# Elements of the three-tuples in the state table. |
-BEGIN_HANDLER = 0 |
-DO_HANDLER = 1 |
-END_HANDLER = 2 |
- |
-identChars = '.-_:' |
-lenientIdentChars = identChars + ';+#/%~' |
- |
-def nop(*args, **kw): |
- "Do nothing." |
- |
- |
-def unionlist(*args): |
- l = [] |
- for x in args: |
- l.extend(x) |
- d = dict([(x, 1) for x in l]) |
- return d.keys() |
- |
- |
-def zipfndict(*args, **kw): |
- default = kw.get('default', nop) |
- d = {} |
- for key in unionlist(*[fndict.keys() for fndict in args]): |
- d[key] = tuple([x.get(key, default) for x in args]) |
- return d |
- |
- |
-def prefixedMethodClassDict(clazz, prefix): |
- return dict([(name, getattr(clazz, prefix + name)) for name in prefixedMethodNames(clazz, prefix)]) |
- |
- |
-def prefixedMethodObjDict(obj, prefix): |
- return dict([(name, getattr(obj, prefix + name)) for name in prefixedMethodNames(obj.__class__, prefix)]) |
- |
- |
-class ParseError(Exception): |
- |
- def __init__(self, filename, line, col, message): |
- self.filename = filename |
- self.line = line |
- self.col = col |
- self.message = message |
- |
- def __str__(self): |
- return "%s:%s:%s: %s" % (self.filename, self.line, self.col, |
- self.message) |
- |
-class XMLParser(Protocol): |
- |
- state = None |
- encodings = None |
- filename = "<xml />" |
- beExtremelyLenient = 0 |
- _prepend = None |
- |
- # _leadingBodyData will sometimes be set before switching to the |
- # 'bodydata' state, when we "accidentally" read a byte of bodydata |
- # in a different state. |
- _leadingBodyData = None |
- |
- def connectionMade(self): |
- self.lineno = 1 |
- self.colno = 0 |
- self.encodings = [] |
- |
- def saveMark(self): |
- '''Get the line number and column of the last character parsed''' |
- # This gets replaced during dataReceived, restored afterwards |
- return (self.lineno, self.colno) |
- |
- def _parseError(self, message): |
- raise ParseError(*((self.filename,)+self.saveMark()+(message,))) |
- |
- def _buildStateTable(self): |
- '''Return a dictionary of begin, do, end state function tuples''' |
- # _buildStateTable leaves something to be desired but it does what it |
- # does.. probably slowly, so I'm doing some evil caching so it doesn't |
- # get called more than once per class. |
- stateTable = getattr(self.__class__, '__stateTable', None) |
- if stateTable is None: |
- stateTable = self.__class__.__stateTable = zipfndict( |
- *[prefixedMethodObjDict(self, prefix) |
- for prefix in ('begin_', 'do_', 'end_')]) |
- return stateTable |
- |
- def _decode(self, data): |
- if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings: |
- assert not len(data) & 1, 'UTF-16 must come in pairs for now' |
- if self._prepend: |
- data = self._prepend + data |
- for encoding in self.encodings: |
- data = unicode(data, encoding) |
- return data |
- |
- def maybeBodyData(self): |
- if self.endtag: |
- return 'bodydata' |
- |
- # Get ready for fun! We're going to allow |
- # <script>if (foo < bar)</script> to work! |
- # We do this by making everything between <script> and |
- # </script> a Text |
- # BUT <script src="foo"> will be special-cased to do regular, |
- # lenient behavior, because those may not have </script> |
- # -radix |
- |
- if (self.tagName == 'script' |
- and not self.tagAttributes.has_key('src')): |
- # we do this ourselves rather than having begin_waitforendscript |
- # becuase that can get called multiple times and we don't want |
- # bodydata to get reset other than the first time. |
- self.begin_bodydata(None) |
- return 'waitforendscript' |
- return 'bodydata' |
- |
- |
- |
- def dataReceived(self, data): |
- stateTable = self._buildStateTable() |
- if not self.state: |
- # all UTF-16 starts with this string |
- if data.startswith('\xff\xfe'): |
- self._prepend = '\xff\xfe' |
- self.encodings.append('UTF-16') |
- data = data[2:] |
- elif data.startswith('\xfe\xff'): |
- self._prepend = '\xfe\xff' |
- self.encodings.append('UTF-16') |
- data = data[2:] |
- self.state = 'begin' |
- if self.encodings: |
- data = self._decode(data) |
- # bring state, lineno, colno into local scope |
- lineno, colno = self.lineno, self.colno |
- curState = self.state |
- # replace saveMark with a nested scope function |
- _saveMark = self.saveMark |
- def saveMark(): |
- return (lineno, colno) |
- self.saveMark = saveMark |
- # fetch functions from the stateTable |
- beginFn, doFn, endFn = stateTable[curState] |
- try: |
- for byte in data: |
- # do newline stuff |
- if byte == '\n': |
- lineno += 1 |
- colno = 0 |
- else: |
- colno += 1 |
- newState = doFn(byte) |
- if newState is not None and newState != curState: |
- # this is the endFn from the previous state |
- endFn() |
- curState = newState |
- beginFn, doFn, endFn = stateTable[curState] |
- beginFn(byte) |
- finally: |
- self.saveMark = _saveMark |
- self.lineno, self.colno = lineno, colno |
- # state doesn't make sense if there's an exception.. |
- self.state = curState |
- |
- |
- def connectionLost(self, reason): |
- """ |
- End the last state we were in. |
- """ |
- stateTable = self._buildStateTable() |
- stateTable[self.state][END_HANDLER]() |
- |
- |
- # state methods |
- |
- def do_begin(self, byte): |
- if byte.isspace(): |
- return |
- if byte != '<': |
- if self.beExtremelyLenient: |
- self._leadingBodyData = byte |
- return 'bodydata' |
- self._parseError("First char of document [%r] wasn't <" % (byte,)) |
- return 'tagstart' |
- |
- def begin_comment(self, byte): |
- self.commentbuf = '' |
- |
- def do_comment(self, byte): |
- self.commentbuf += byte |
- if self.commentbuf.endswith('-->'): |
- self.gotComment(self.commentbuf[:-3]) |
- return 'bodydata' |
- |
- def begin_tagstart(self, byte): |
- self.tagName = '' # name of the tag |
- self.tagAttributes = {} # attributes of the tag |
- self.termtag = 0 # is the tag self-terminating |
- self.endtag = 0 |
- |
- def do_tagstart(self, byte): |
- if byte.isalnum() or byte in identChars: |
- self.tagName += byte |
- if self.tagName == '!--': |
- return 'comment' |
- elif byte.isspace(): |
- if self.tagName: |
- if self.endtag: |
- # properly strict thing to do here is probably to only |
- # accept whitespace |
- return 'waitforgt' |
- return 'attrs' |
- else: |
- self._parseError("Whitespace before tag-name") |
- elif byte == '>': |
- if self.endtag: |
- self.gotTagEnd(self.tagName) |
- return 'bodydata' |
- else: |
- self.gotTagStart(self.tagName, {}) |
- return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData() |
- elif byte == '/': |
- if self.tagName: |
- return 'afterslash' |
- else: |
- self.endtag = 1 |
- elif byte in '!?': |
- if self.tagName: |
- if not self.beExtremelyLenient: |
- self._parseError("Invalid character in tag-name") |
- else: |
- self.tagName += byte |
- self.termtag = 1 |
- elif byte == '[': |
- if self.tagName == '!': |
- return 'expectcdata' |
- else: |
- self._parseError("Invalid '[' in tag-name") |
- else: |
- if self.beExtremelyLenient: |
- self.bodydata = '<' |
- return 'unentity' |
- self._parseError('Invalid tag character: %r'% byte) |
- |
- def begin_unentity(self, byte): |
- self.bodydata += byte |
- |
- def do_unentity(self, byte): |
- self.bodydata += byte |
- return 'bodydata' |
- |
- def end_unentity(self): |
- self.gotText(self.bodydata) |
- |
- def begin_expectcdata(self, byte): |
- self.cdatabuf = byte |
- |
- def do_expectcdata(self, byte): |
- self.cdatabuf += byte |
- cdb = self.cdatabuf |
- cd = '[CDATA[' |
- if len(cd) > len(cdb): |
- if cd.startswith(cdb): |
- return |
- elif self.beExtremelyLenient: |
- ## WHAT THE CRAP!? MSWord9 generates HTML that includes these |
- ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore |
- ## 'em as best I can. this should really be a separate parse |
- ## state but I don't even have any idea what these _are_. |
- return 'waitforgt' |
- else: |
- self._parseError("Mal-formed CDATA header") |
- if cd == cdb: |
- self.cdatabuf = '' |
- return 'cdata' |
- self._parseError("Mal-formed CDATA header") |
- |
- def do_cdata(self, byte): |
- self.cdatabuf += byte |
- if self.cdatabuf.endswith("]]>"): |
- self.cdatabuf = self.cdatabuf[:-3] |
- return 'bodydata' |
- |
- def end_cdata(self): |
- self.gotCData(self.cdatabuf) |
- self.cdatabuf = '' |
- |
- def do_attrs(self, byte): |
- if byte.isalnum() or byte in identChars: |
- # XXX FIXME really handle !DOCTYPE at some point |
- if self.tagName == '!DOCTYPE': |
- return 'doctype' |
- if self.tagName[0] in '!?': |
- return 'waitforgt' |
- return 'attrname' |
- elif byte.isspace(): |
- return |
- elif byte == '>': |
- self.gotTagStart(self.tagName, self.tagAttributes) |
- return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData() |
- elif byte == '/': |
- return 'afterslash' |
- elif self.beExtremelyLenient: |
- # discard and move on? Only case I've seen of this so far was: |
- # <foo bar="baz""> |
- return |
- self._parseError("Unexpected character: %r" % byte) |
- |
- def begin_doctype(self, byte): |
- self.doctype = byte |
- |
- def do_doctype(self, byte): |
- if byte == '>': |
- return 'bodydata' |
- self.doctype += byte |
- |
- def end_doctype(self): |
- self.gotDoctype(self.doctype) |
- self.doctype = None |
- |
- def do_waitforgt(self, byte): |
- if byte == '>': |
- if self.endtag or not self.beExtremelyLenient: |
- return 'bodydata' |
- return self.maybeBodyData() |
- |
- def begin_attrname(self, byte): |
- self.attrname = byte |
- self._attrname_termtag = 0 |
- |
- def do_attrname(self, byte): |
- if byte.isalnum() or byte in identChars: |
- self.attrname += byte |
- return |
- elif byte == '=': |
- return 'beforeattrval' |
- elif byte.isspace(): |
- return 'beforeeq' |
- elif self.beExtremelyLenient: |
- if byte in '"\'': |
- return 'attrval' |
- if byte in lenientIdentChars or byte.isalnum(): |
- self.attrname += byte |
- return |
- if byte == '/': |
- self._attrname_termtag = 1 |
- return |
- if byte == '>': |
- self.attrval = 'True' |
- self.tagAttributes[self.attrname] = self.attrval |
- self.gotTagStart(self.tagName, self.tagAttributes) |
- if self._attrname_termtag: |
- self.gotTagEnd(self.tagName) |
- return 'bodydata' |
- return self.maybeBodyData() |
- # something is really broken. let's leave this attribute where it |
- # is and move on to the next thing |
- return |
- self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte)) |
- |
- def do_beforeattrval(self, byte): |
- if byte in '"\'': |
- return 'attrval' |
- elif byte.isspace(): |
- return |
- elif self.beExtremelyLenient: |
- if byte in lenientIdentChars or byte.isalnum(): |
- return 'messyattr' |
- if byte == '>': |
- self.attrval = 'True' |
- self.tagAttributes[self.attrname] = self.attrval |
- self.gotTagStart(self.tagName, self.tagAttributes) |
- return self.maybeBodyData() |
- if byte == '\\': |
- # I saw this in actual HTML once: |
- # <font size=\"3\"><sup>SM</sup></font> |
- return |
- self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte) |
- |
- attrname = '' |
- attrval = '' |
- |
- def begin_beforeeq(self,byte): |
- self._beforeeq_termtag = 0 |
- |
- def do_beforeeq(self, byte): |
- if byte == '=': |
- return 'beforeattrval' |
- elif byte.isspace(): |
- return |
- elif self.beExtremelyLenient: |
- if byte.isalnum() or byte in identChars: |
- self.attrval = 'True' |
- self.tagAttributes[self.attrname] = self.attrval |
- return 'attrname' |
- elif byte == '>': |
- self.attrval = 'True' |
- self.tagAttributes[self.attrname] = self.attrval |
- self.gotTagStart(self.tagName, self.tagAttributes) |
- if self._beforeeq_termtag: |
- self.gotTagEnd(self.tagName) |
- return 'bodydata' |
- return self.maybeBodyData() |
- elif byte == '/': |
- self._beforeeq_termtag = 1 |
- return |
- self._parseError("Invalid attribute") |
- |
- def begin_attrval(self, byte): |
- self.quotetype = byte |
- self.attrval = '' |
- |
- def do_attrval(self, byte): |
- if byte == self.quotetype: |
- return 'attrs' |
- self.attrval += byte |
- |
- def end_attrval(self): |
- self.tagAttributes[self.attrname] = self.attrval |
- self.attrname = self.attrval = '' |
- |
- def begin_messyattr(self, byte): |
- self.attrval = byte |
- |
- def do_messyattr(self, byte): |
- if byte.isspace(): |
- return 'attrs' |
- elif byte == '>': |
- endTag = 0 |
- if self.attrval.endswith('/'): |
- endTag = 1 |
- self.attrval = self.attrval[:-1] |
- self.tagAttributes[self.attrname] = self.attrval |
- self.gotTagStart(self.tagName, self.tagAttributes) |
- if endTag: |
- self.gotTagEnd(self.tagName) |
- return 'bodydata' |
- return self.maybeBodyData() |
- else: |
- self.attrval += byte |
- |
- def end_messyattr(self): |
- if self.attrval: |
- self.tagAttributes[self.attrname] = self.attrval |
- |
- def begin_afterslash(self, byte): |
- self._after_slash_closed = 0 |
- |
- def do_afterslash(self, byte): |
- # this state is only after a self-terminating slash, e.g. <foo/> |
- if self._after_slash_closed: |
- self._parseError("Mal-formed")#XXX When does this happen?? |
- if byte != '>': |
- if self.beExtremelyLenient: |
- return |
- else: |
- self._parseError("No data allowed after '/'") |
- self._after_slash_closed = 1 |
- self.gotTagStart(self.tagName, self.tagAttributes) |
- self.gotTagEnd(self.tagName) |
- # don't need maybeBodyData here because there better not be |
- # any javascript code after a <script/>... we'll see :( |
- return 'bodydata' |
- |
- def begin_bodydata(self, byte): |
- if self._leadingBodyData: |
- self.bodydata = self._leadingBodyData |
- del self._leadingBodyData |
- else: |
- self.bodydata = '' |
- |
- def do_bodydata(self, byte): |
- if byte == '<': |
- return 'tagstart' |
- if byte == '&': |
- return 'entityref' |
- self.bodydata += byte |
- |
- def end_bodydata(self): |
- self.gotText(self.bodydata) |
- self.bodydata = '' |
- |
- def do_waitforendscript(self, byte): |
- if byte == '<': |
- return 'waitscriptendtag' |
- self.bodydata += byte |
- |
- def begin_waitscriptendtag(self, byte): |
- self.temptagdata = '' |
- self.tagName = '' |
- self.endtag = 0 |
- |
- def do_waitscriptendtag(self, byte): |
- # 1 enforce / as first byte read |
- # 2 enforce following bytes to be subset of "script" until |
- # tagName == "script" |
- # 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName) |
- # 3 spaces can happen anywhere, they're ignored |
- # e.g. < / script > |
- # 4 anything else causes all data I've read to be moved to the |
- # bodydata, and switch back to waitforendscript state |
- |
- # If it turns out this _isn't_ a </script>, we need to |
- # remember all the data we've been through so we can append it |
- # to bodydata |
- self.temptagdata += byte |
- |
- # 1 |
- if byte == '/': |
- self.endtag = True |
- elif not self.endtag: |
- self.bodydata += "<" + self.temptagdata |
- return 'waitforendscript' |
- # 2 |
- elif byte.isalnum() or byte in identChars: |
- self.tagName += byte |
- if not 'script'.startswith(self.tagName): |
- self.bodydata += "<" + self.temptagdata |
- return 'waitforendscript' |
- elif self.tagName == 'script': |
- self.gotText(self.bodydata) |
- self.gotTagEnd(self.tagName) |
- return 'waitforgt' |
- # 3 |
- elif byte.isspace(): |
- return 'waitscriptendtag' |
- # 4 |
- else: |
- self.bodydata += "<" + self.temptagdata |
- return 'waitforendscript' |
- |
- |
- def begin_entityref(self, byte): |
- self.erefbuf = '' |
- self.erefextra = '' # extra bit for lenient mode |
- |
- def do_entityref(self, byte): |
- if byte.isspace() or byte == "<": |
- if self.beExtremelyLenient: |
- # '&foo' probably was '&foo' |
- if self.erefbuf and self.erefbuf != "amp": |
- self.erefextra = self.erefbuf |
- self.erefbuf = "amp" |
- if byte == "<": |
- return "tagstart" |
- else: |
- self.erefextra += byte |
- return 'spacebodydata' |
- self._parseError("Bad entity reference") |
- elif byte != ';': |
- self.erefbuf += byte |
- else: |
- return 'bodydata' |
- |
- def end_entityref(self): |
- self.gotEntityReference(self.erefbuf) |
- |
- # hacky support for space after & in entityref in beExtremelyLenient |
- # state should only happen in that case |
- def begin_spacebodydata(self, byte): |
- self.bodydata = self.erefextra |
- self.erefextra = None |
- do_spacebodydata = do_bodydata |
- end_spacebodydata = end_bodydata |
- |
- # Sorta SAX-ish API |
- |
- def gotTagStart(self, name, attributes): |
- '''Encountered an opening tag. |
- |
- Default behaviour is to print.''' |
- print 'begin', name, attributes |
- |
- def gotText(self, data): |
- '''Encountered text |
- |
- Default behaviour is to print.''' |
- print 'text:', repr(data) |
- |
- def gotEntityReference(self, entityRef): |
- '''Encountered mnemonic entity reference |
- |
- Default behaviour is to print.''' |
- print 'entityRef: &%s;' % entityRef |
- |
- def gotComment(self, comment): |
- '''Encountered comment. |
- |
- Default behaviour is to ignore.''' |
- pass |
- |
- def gotCData(self, cdata): |
- '''Encountered CDATA |
- |
- Default behaviour is to call the gotText method''' |
- self.gotText(cdata) |
- |
- def gotDoctype(self, doctype): |
- """Encountered DOCTYPE |
- |
- This is really grotty: it basically just gives you everything between |
- '<!DOCTYPE' and '>' as an argument. |
- """ |
- print '!DOCTYPE', repr(doctype) |
- |
- def gotTagEnd(self, name): |
- '''Encountered closing tag |
- |
- Default behaviour is to print.''' |
- print 'end', name |
- |
-if __name__ == '__main__': |
- from cStringIO import StringIO |
- testDocument = ''' |
- |
- <!DOCTYPE ignore all this shit, hah its malformed!!!!@$> |
- <?xml version="suck it"?> |
- <foo> |
- A |
- <bar /> |
- <baz boz="buz">boz &zop;</baz> |
- <![CDATA[ foo bar baz ]]> |
- </foo> |
- ''' |
- x = XMLParser() |
- x.makeConnection(FileWrapper(StringIO())) |
- # fn = "/home/glyph/Projects/Twisted/doc/howto/ipc10paper.html" |
- fn = "/home/glyph/gruesome.xml" |
- # testDocument = open(fn).read() |
- x.dataReceived(testDocument) |