| Index: third_party/twisted_8_1/twisted/web/sux.py
|
| diff --git a/third_party/twisted_8_1/twisted/web/sux.py b/third_party/twisted_8_1/twisted/web/sux.py
|
| deleted file mode 100644
|
| index 6f8fea1dc3b1f80e6afa31b46694810e8b279ecc..0000000000000000000000000000000000000000
|
| --- a/third_party/twisted_8_1/twisted/web/sux.py
|
| +++ /dev/null
|
| @@ -1,657 +0,0 @@
|
| -# -*- test-case-name: twisted.web.test.test_xml -*-
|
| -#
|
| -# Copyright (c) 2001-2004 Twisted Matrix Laboratories.
|
| -# See LICENSE for details.
|
| -
|
| -
|
| -"""
|
| -*S*mall, *U*ncomplicated *X*ML.
|
| -
|
| -This is a very simple implementation of XML/HTML as a network
|
| -protocol. It is not at all clever. Its main features are that it
|
| -does not:
|
| -
|
| - - support namespaces
|
| - - mung mnemonic entity references
|
| - - validate
|
| - - perform *any* external actions (such as fetching URLs or writing files)
|
| - under *any* circumstances
|
| - - has lots and lots of horrible hacks for supporting broken HTML (as an
|
| - option, they're not on by default).
|
| -"""
|
| -
|
| -from twisted.internet.protocol import Protocol, FileWrapper
|
| -from twisted.python.reflect import prefixedMethodNames
|
| -
|
| -
|
| -
|
| -# Elements of the three-tuples in the state table.
|
| -BEGIN_HANDLER = 0
|
| -DO_HANDLER = 1
|
| -END_HANDLER = 2
|
| -
|
| -identChars = '.-_:'
|
| -lenientIdentChars = identChars + ';+#/%~'
|
| -
|
| -def nop(*args, **kw):
|
| - "Do nothing."
|
| -
|
| -
|
| -def unionlist(*args):
|
| - l = []
|
| - for x in args:
|
| - l.extend(x)
|
| - d = dict([(x, 1) for x in l])
|
| - return d.keys()
|
| -
|
| -
|
| -def zipfndict(*args, **kw):
|
| - default = kw.get('default', nop)
|
| - d = {}
|
| - for key in unionlist(*[fndict.keys() for fndict in args]):
|
| - d[key] = tuple([x.get(key, default) for x in args])
|
| - return d
|
| -
|
| -
|
| -def prefixedMethodClassDict(clazz, prefix):
|
| - return dict([(name, getattr(clazz, prefix + name)) for name in prefixedMethodNames(clazz, prefix)])
|
| -
|
| -
|
| -def prefixedMethodObjDict(obj, prefix):
|
| - return dict([(name, getattr(obj, prefix + name)) for name in prefixedMethodNames(obj.__class__, prefix)])
|
| -
|
| -
|
| -class ParseError(Exception):
|
| -
|
| - def __init__(self, filename, line, col, message):
|
| - self.filename = filename
|
| - self.line = line
|
| - self.col = col
|
| - self.message = message
|
| -
|
| - def __str__(self):
|
| - return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
|
| - self.message)
|
| -
|
| -class XMLParser(Protocol):
|
| -
|
| - state = None
|
| - encodings = None
|
| - filename = "<xml />"
|
| - beExtremelyLenient = 0
|
| - _prepend = None
|
| -
|
| - # _leadingBodyData will sometimes be set before switching to the
|
| - # 'bodydata' state, when we "accidentally" read a byte of bodydata
|
| - # in a different state.
|
| - _leadingBodyData = None
|
| -
|
| - def connectionMade(self):
|
| - self.lineno = 1
|
| - self.colno = 0
|
| - self.encodings = []
|
| -
|
| - def saveMark(self):
|
| - '''Get the line number and column of the last character parsed'''
|
| - # This gets replaced during dataReceived, restored afterwards
|
| - return (self.lineno, self.colno)
|
| -
|
| - def _parseError(self, message):
|
| - raise ParseError(*((self.filename,)+self.saveMark()+(message,)))
|
| -
|
| - def _buildStateTable(self):
|
| - '''Return a dictionary of begin, do, end state function tuples'''
|
| - # _buildStateTable leaves something to be desired but it does what it
|
| - # does.. probably slowly, so I'm doing some evil caching so it doesn't
|
| - # get called more than once per class.
|
| - stateTable = getattr(self.__class__, '__stateTable', None)
|
| - if stateTable is None:
|
| - stateTable = self.__class__.__stateTable = zipfndict(
|
| - *[prefixedMethodObjDict(self, prefix)
|
| - for prefix in ('begin_', 'do_', 'end_')])
|
| - return stateTable
|
| -
|
| - def _decode(self, data):
|
| - if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
|
| - assert not len(data) & 1, 'UTF-16 must come in pairs for now'
|
| - if self._prepend:
|
| - data = self._prepend + data
|
| - for encoding in self.encodings:
|
| - data = unicode(data, encoding)
|
| - return data
|
| -
|
| - def maybeBodyData(self):
|
| - if self.endtag:
|
| - return 'bodydata'
|
| -
|
| - # Get ready for fun! We're going to allow
|
| - # <script>if (foo < bar)</script> to work!
|
| - # We do this by making everything between <script> and
|
| - # </script> a Text
|
| - # BUT <script src="foo"> will be special-cased to do regular,
|
| - # lenient behavior, because those may not have </script>
|
| - # -radix
|
| -
|
| - if (self.tagName == 'script'
|
| - and not self.tagAttributes.has_key('src')):
|
| - # we do this ourselves rather than having begin_waitforendscript
|
| - # becuase that can get called multiple times and we don't want
|
| - # bodydata to get reset other than the first time.
|
| - self.begin_bodydata(None)
|
| - return 'waitforendscript'
|
| - return 'bodydata'
|
| -
|
| -
|
| -
|
| - def dataReceived(self, data):
|
| - stateTable = self._buildStateTable()
|
| - if not self.state:
|
| - # all UTF-16 starts with this string
|
| - if data.startswith('\xff\xfe'):
|
| - self._prepend = '\xff\xfe'
|
| - self.encodings.append('UTF-16')
|
| - data = data[2:]
|
| - elif data.startswith('\xfe\xff'):
|
| - self._prepend = '\xfe\xff'
|
| - self.encodings.append('UTF-16')
|
| - data = data[2:]
|
| - self.state = 'begin'
|
| - if self.encodings:
|
| - data = self._decode(data)
|
| - # bring state, lineno, colno into local scope
|
| - lineno, colno = self.lineno, self.colno
|
| - curState = self.state
|
| - # replace saveMark with a nested scope function
|
| - _saveMark = self.saveMark
|
| - def saveMark():
|
| - return (lineno, colno)
|
| - self.saveMark = saveMark
|
| - # fetch functions from the stateTable
|
| - beginFn, doFn, endFn = stateTable[curState]
|
| - try:
|
| - for byte in data:
|
| - # do newline stuff
|
| - if byte == '\n':
|
| - lineno += 1
|
| - colno = 0
|
| - else:
|
| - colno += 1
|
| - newState = doFn(byte)
|
| - if newState is not None and newState != curState:
|
| - # this is the endFn from the previous state
|
| - endFn()
|
| - curState = newState
|
| - beginFn, doFn, endFn = stateTable[curState]
|
| - beginFn(byte)
|
| - finally:
|
| - self.saveMark = _saveMark
|
| - self.lineno, self.colno = lineno, colno
|
| - # state doesn't make sense if there's an exception..
|
| - self.state = curState
|
| -
|
| -
|
| - def connectionLost(self, reason):
|
| - """
|
| - End the last state we were in.
|
| - """
|
| - stateTable = self._buildStateTable()
|
| - stateTable[self.state][END_HANDLER]()
|
| -
|
| -
|
| - # state methods
|
| -
|
| - def do_begin(self, byte):
|
| - if byte.isspace():
|
| - return
|
| - if byte != '<':
|
| - if self.beExtremelyLenient:
|
| - self._leadingBodyData = byte
|
| - return 'bodydata'
|
| - self._parseError("First char of document [%r] wasn't <" % (byte,))
|
| - return 'tagstart'
|
| -
|
| - def begin_comment(self, byte):
|
| - self.commentbuf = ''
|
| -
|
| - def do_comment(self, byte):
|
| - self.commentbuf += byte
|
| - if self.commentbuf.endswith('-->'):
|
| - self.gotComment(self.commentbuf[:-3])
|
| - return 'bodydata'
|
| -
|
| - def begin_tagstart(self, byte):
|
| - self.tagName = '' # name of the tag
|
| - self.tagAttributes = {} # attributes of the tag
|
| - self.termtag = 0 # is the tag self-terminating
|
| - self.endtag = 0
|
| -
|
| - def do_tagstart(self, byte):
|
| - if byte.isalnum() or byte in identChars:
|
| - self.tagName += byte
|
| - if self.tagName == '!--':
|
| - return 'comment'
|
| - elif byte.isspace():
|
| - if self.tagName:
|
| - if self.endtag:
|
| - # properly strict thing to do here is probably to only
|
| - # accept whitespace
|
| - return 'waitforgt'
|
| - return 'attrs'
|
| - else:
|
| - self._parseError("Whitespace before tag-name")
|
| - elif byte == '>':
|
| - if self.endtag:
|
| - self.gotTagEnd(self.tagName)
|
| - return 'bodydata'
|
| - else:
|
| - self.gotTagStart(self.tagName, {})
|
| - return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
|
| - elif byte == '/':
|
| - if self.tagName:
|
| - return 'afterslash'
|
| - else:
|
| - self.endtag = 1
|
| - elif byte in '!?':
|
| - if self.tagName:
|
| - if not self.beExtremelyLenient:
|
| - self._parseError("Invalid character in tag-name")
|
| - else:
|
| - self.tagName += byte
|
| - self.termtag = 1
|
| - elif byte == '[':
|
| - if self.tagName == '!':
|
| - return 'expectcdata'
|
| - else:
|
| - self._parseError("Invalid '[' in tag-name")
|
| - else:
|
| - if self.beExtremelyLenient:
|
| - self.bodydata = '<'
|
| - return 'unentity'
|
| - self._parseError('Invalid tag character: %r'% byte)
|
| -
|
| - def begin_unentity(self, byte):
|
| - self.bodydata += byte
|
| -
|
| - def do_unentity(self, byte):
|
| - self.bodydata += byte
|
| - return 'bodydata'
|
| -
|
| - def end_unentity(self):
|
| - self.gotText(self.bodydata)
|
| -
|
| - def begin_expectcdata(self, byte):
|
| - self.cdatabuf = byte
|
| -
|
| - def do_expectcdata(self, byte):
|
| - self.cdatabuf += byte
|
| - cdb = self.cdatabuf
|
| - cd = '[CDATA['
|
| - if len(cd) > len(cdb):
|
| - if cd.startswith(cdb):
|
| - return
|
| - elif self.beExtremelyLenient:
|
| - ## WHAT THE CRAP!? MSWord9 generates HTML that includes these
|
| - ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
|
| - ## 'em as best I can. this should really be a separate parse
|
| - ## state but I don't even have any idea what these _are_.
|
| - return 'waitforgt'
|
| - else:
|
| - self._parseError("Mal-formed CDATA header")
|
| - if cd == cdb:
|
| - self.cdatabuf = ''
|
| - return 'cdata'
|
| - self._parseError("Mal-formed CDATA header")
|
| -
|
| - def do_cdata(self, byte):
|
| - self.cdatabuf += byte
|
| - if self.cdatabuf.endswith("]]>"):
|
| - self.cdatabuf = self.cdatabuf[:-3]
|
| - return 'bodydata'
|
| -
|
| - def end_cdata(self):
|
| - self.gotCData(self.cdatabuf)
|
| - self.cdatabuf = ''
|
| -
|
| - def do_attrs(self, byte):
|
| - if byte.isalnum() or byte in identChars:
|
| - # XXX FIXME really handle !DOCTYPE at some point
|
| - if self.tagName == '!DOCTYPE':
|
| - return 'doctype'
|
| - if self.tagName[0] in '!?':
|
| - return 'waitforgt'
|
| - return 'attrname'
|
| - elif byte.isspace():
|
| - return
|
| - elif byte == '>':
|
| - self.gotTagStart(self.tagName, self.tagAttributes)
|
| - return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
|
| - elif byte == '/':
|
| - return 'afterslash'
|
| - elif self.beExtremelyLenient:
|
| - # discard and move on? Only case I've seen of this so far was:
|
| - # <foo bar="baz"">
|
| - return
|
| - self._parseError("Unexpected character: %r" % byte)
|
| -
|
| - def begin_doctype(self, byte):
|
| - self.doctype = byte
|
| -
|
| - def do_doctype(self, byte):
|
| - if byte == '>':
|
| - return 'bodydata'
|
| - self.doctype += byte
|
| -
|
| - def end_doctype(self):
|
| - self.gotDoctype(self.doctype)
|
| - self.doctype = None
|
| -
|
| - def do_waitforgt(self, byte):
|
| - if byte == '>':
|
| - if self.endtag or not self.beExtremelyLenient:
|
| - return 'bodydata'
|
| - return self.maybeBodyData()
|
| -
|
| - def begin_attrname(self, byte):
|
| - self.attrname = byte
|
| - self._attrname_termtag = 0
|
| -
|
| - def do_attrname(self, byte):
|
| - if byte.isalnum() or byte in identChars:
|
| - self.attrname += byte
|
| - return
|
| - elif byte == '=':
|
| - return 'beforeattrval'
|
| - elif byte.isspace():
|
| - return 'beforeeq'
|
| - elif self.beExtremelyLenient:
|
| - if byte in '"\'':
|
| - return 'attrval'
|
| - if byte in lenientIdentChars or byte.isalnum():
|
| - self.attrname += byte
|
| - return
|
| - if byte == '/':
|
| - self._attrname_termtag = 1
|
| - return
|
| - if byte == '>':
|
| - self.attrval = 'True'
|
| - self.tagAttributes[self.attrname] = self.attrval
|
| - self.gotTagStart(self.tagName, self.tagAttributes)
|
| - if self._attrname_termtag:
|
| - self.gotTagEnd(self.tagName)
|
| - return 'bodydata'
|
| - return self.maybeBodyData()
|
| - # something is really broken. let's leave this attribute where it
|
| - # is and move on to the next thing
|
| - return
|
| - self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))
|
| -
|
| - def do_beforeattrval(self, byte):
|
| - if byte in '"\'':
|
| - return 'attrval'
|
| - elif byte.isspace():
|
| - return
|
| - elif self.beExtremelyLenient:
|
| - if byte in lenientIdentChars or byte.isalnum():
|
| - return 'messyattr'
|
| - if byte == '>':
|
| - self.attrval = 'True'
|
| - self.tagAttributes[self.attrname] = self.attrval
|
| - self.gotTagStart(self.tagName, self.tagAttributes)
|
| - return self.maybeBodyData()
|
| - if byte == '\\':
|
| - # I saw this in actual HTML once:
|
| - # <font size=\"3\"><sup>SM</sup></font>
|
| - return
|
| - self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)
|
| -
|
| - attrname = ''
|
| - attrval = ''
|
| -
|
| - def begin_beforeeq(self,byte):
|
| - self._beforeeq_termtag = 0
|
| -
|
| - def do_beforeeq(self, byte):
|
| - if byte == '=':
|
| - return 'beforeattrval'
|
| - elif byte.isspace():
|
| - return
|
| - elif self.beExtremelyLenient:
|
| - if byte.isalnum() or byte in identChars:
|
| - self.attrval = 'True'
|
| - self.tagAttributes[self.attrname] = self.attrval
|
| - return 'attrname'
|
| - elif byte == '>':
|
| - self.attrval = 'True'
|
| - self.tagAttributes[self.attrname] = self.attrval
|
| - self.gotTagStart(self.tagName, self.tagAttributes)
|
| - if self._beforeeq_termtag:
|
| - self.gotTagEnd(self.tagName)
|
| - return 'bodydata'
|
| - return self.maybeBodyData()
|
| - elif byte == '/':
|
| - self._beforeeq_termtag = 1
|
| - return
|
| - self._parseError("Invalid attribute")
|
| -
|
| - def begin_attrval(self, byte):
|
| - self.quotetype = byte
|
| - self.attrval = ''
|
| -
|
| - def do_attrval(self, byte):
|
| - if byte == self.quotetype:
|
| - return 'attrs'
|
| - self.attrval += byte
|
| -
|
| - def end_attrval(self):
|
| - self.tagAttributes[self.attrname] = self.attrval
|
| - self.attrname = self.attrval = ''
|
| -
|
| - def begin_messyattr(self, byte):
|
| - self.attrval = byte
|
| -
|
| - def do_messyattr(self, byte):
|
| - if byte.isspace():
|
| - return 'attrs'
|
| - elif byte == '>':
|
| - endTag = 0
|
| - if self.attrval.endswith('/'):
|
| - endTag = 1
|
| - self.attrval = self.attrval[:-1]
|
| - self.tagAttributes[self.attrname] = self.attrval
|
| - self.gotTagStart(self.tagName, self.tagAttributes)
|
| - if endTag:
|
| - self.gotTagEnd(self.tagName)
|
| - return 'bodydata'
|
| - return self.maybeBodyData()
|
| - else:
|
| - self.attrval += byte
|
| -
|
| - def end_messyattr(self):
|
| - if self.attrval:
|
| - self.tagAttributes[self.attrname] = self.attrval
|
| -
|
| - def begin_afterslash(self, byte):
|
| - self._after_slash_closed = 0
|
| -
|
| - def do_afterslash(self, byte):
|
| - # this state is only after a self-terminating slash, e.g. <foo/>
|
| - if self._after_slash_closed:
|
| - self._parseError("Mal-formed")#XXX When does this happen??
|
| - if byte != '>':
|
| - if self.beExtremelyLenient:
|
| - return
|
| - else:
|
| - self._parseError("No data allowed after '/'")
|
| - self._after_slash_closed = 1
|
| - self.gotTagStart(self.tagName, self.tagAttributes)
|
| - self.gotTagEnd(self.tagName)
|
| - # don't need maybeBodyData here because there better not be
|
| - # any javascript code after a <script/>... we'll see :(
|
| - return 'bodydata'
|
| -
|
| - def begin_bodydata(self, byte):
|
| - if self._leadingBodyData:
|
| - self.bodydata = self._leadingBodyData
|
| - del self._leadingBodyData
|
| - else:
|
| - self.bodydata = ''
|
| -
|
| - def do_bodydata(self, byte):
|
| - if byte == '<':
|
| - return 'tagstart'
|
| - if byte == '&':
|
| - return 'entityref'
|
| - self.bodydata += byte
|
| -
|
| - def end_bodydata(self):
|
| - self.gotText(self.bodydata)
|
| - self.bodydata = ''
|
| -
|
| - def do_waitforendscript(self, byte):
|
| - if byte == '<':
|
| - return 'waitscriptendtag'
|
| - self.bodydata += byte
|
| -
|
| - def begin_waitscriptendtag(self, byte):
|
| - self.temptagdata = ''
|
| - self.tagName = ''
|
| - self.endtag = 0
|
| -
|
| - def do_waitscriptendtag(self, byte):
|
| - # 1 enforce / as first byte read
|
| - # 2 enforce following bytes to be subset of "script" until
|
| - # tagName == "script"
|
| - # 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
|
| - # 3 spaces can happen anywhere, they're ignored
|
| - # e.g. < / script >
|
| - # 4 anything else causes all data I've read to be moved to the
|
| - # bodydata, and switch back to waitforendscript state
|
| -
|
| - # If it turns out this _isn't_ a </script>, we need to
|
| - # remember all the data we've been through so we can append it
|
| - # to bodydata
|
| - self.temptagdata += byte
|
| -
|
| - # 1
|
| - if byte == '/':
|
| - self.endtag = True
|
| - elif not self.endtag:
|
| - self.bodydata += "<" + self.temptagdata
|
| - return 'waitforendscript'
|
| - # 2
|
| - elif byte.isalnum() or byte in identChars:
|
| - self.tagName += byte
|
| - if not 'script'.startswith(self.tagName):
|
| - self.bodydata += "<" + self.temptagdata
|
| - return 'waitforendscript'
|
| - elif self.tagName == 'script':
|
| - self.gotText(self.bodydata)
|
| - self.gotTagEnd(self.tagName)
|
| - return 'waitforgt'
|
| - # 3
|
| - elif byte.isspace():
|
| - return 'waitscriptendtag'
|
| - # 4
|
| - else:
|
| - self.bodydata += "<" + self.temptagdata
|
| - return 'waitforendscript'
|
| -
|
| -
|
| - def begin_entityref(self, byte):
|
| - self.erefbuf = ''
|
| - self.erefextra = '' # extra bit for lenient mode
|
| -
|
| - def do_entityref(self, byte):
|
| - if byte.isspace() or byte == "<":
|
| - if self.beExtremelyLenient:
|
| - # '&foo' probably was '&foo'
|
| - if self.erefbuf and self.erefbuf != "amp":
|
| - self.erefextra = self.erefbuf
|
| - self.erefbuf = "amp"
|
| - if byte == "<":
|
| - return "tagstart"
|
| - else:
|
| - self.erefextra += byte
|
| - return 'spacebodydata'
|
| - self._parseError("Bad entity reference")
|
| - elif byte != ';':
|
| - self.erefbuf += byte
|
| - else:
|
| - return 'bodydata'
|
| -
|
| - def end_entityref(self):
|
| - self.gotEntityReference(self.erefbuf)
|
| -
|
| - # hacky support for space after & in entityref in beExtremelyLenient
|
| - # state should only happen in that case
|
| - def begin_spacebodydata(self, byte):
|
| - self.bodydata = self.erefextra
|
| - self.erefextra = None
|
| - do_spacebodydata = do_bodydata
|
| - end_spacebodydata = end_bodydata
|
| -
|
| - # Sorta SAX-ish API
|
| -
|
| - def gotTagStart(self, name, attributes):
|
| - '''Encountered an opening tag.
|
| -
|
| - Default behaviour is to print.'''
|
| - print 'begin', name, attributes
|
| -
|
| - def gotText(self, data):
|
| - '''Encountered text
|
| -
|
| - Default behaviour is to print.'''
|
| - print 'text:', repr(data)
|
| -
|
| - def gotEntityReference(self, entityRef):
|
| - '''Encountered mnemonic entity reference
|
| -
|
| - Default behaviour is to print.'''
|
| - print 'entityRef: &%s;' % entityRef
|
| -
|
| - def gotComment(self, comment):
|
| - '''Encountered comment.
|
| -
|
| - Default behaviour is to ignore.'''
|
| - pass
|
| -
|
| - def gotCData(self, cdata):
|
| - '''Encountered CDATA
|
| -
|
| - Default behaviour is to call the gotText method'''
|
| - self.gotText(cdata)
|
| -
|
| - def gotDoctype(self, doctype):
|
| - """Encountered DOCTYPE
|
| -
|
| - This is really grotty: it basically just gives you everything between
|
| - '<!DOCTYPE' and '>' as an argument.
|
| - """
|
| - print '!DOCTYPE', repr(doctype)
|
| -
|
| - def gotTagEnd(self, name):
|
| - '''Encountered closing tag
|
| -
|
| - Default behaviour is to print.'''
|
| - print 'end', name
|
| -
|
| -if __name__ == '__main__':
|
| - from cStringIO import StringIO
|
| - testDocument = '''
|
| -
|
| - <!DOCTYPE ignore all this shit, hah its malformed!!!!@$>
|
| - <?xml version="suck it"?>
|
| - <foo>
|
| - A
|
| - <bar />
|
| - <baz boz="buz">boz &zop;</baz>
|
| - <![CDATA[ foo bar baz ]]>
|
| - </foo>
|
| - '''
|
| - x = XMLParser()
|
| - x.makeConnection(FileWrapper(StringIO()))
|
| - # fn = "/home/glyph/Projects/Twisted/doc/howto/ipc10paper.html"
|
| - fn = "/home/glyph/gruesome.xml"
|
| - # testDocument = open(fn).read()
|
| - x.dataReceived(testDocument)
|
|
|