| Index: third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/tokenizer.py
|
| diff --git a/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/tokenizer.py b/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/tokenizer.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..797745787a49fefe7e64667b4b0b0355275bf8fa
|
| --- /dev/null
|
| +++ b/third_party/WebKit/Tools/Scripts/webkitpy/thirdparty/wpt/wpt/tools/html5lib/html5lib/tokenizer.py
|
| @@ -0,0 +1,1731 @@
|
| +from __future__ import absolute_import, division, unicode_literals
|
| +
|
# Python 2 compatibility: rebind chr to unichr so that chr() always
# produces a full Unicode code point. On Python 3, unichr does not
# exist (NameError) and the builtin chr already behaves correctly.
try:
    chr = unichr  # flake8: noqa
except NameError:
    pass
|
| +
|
| +from collections import deque
|
| +
|
| +from .constants import spaceCharacters
|
| +from .constants import entities
|
| +from .constants import asciiLetters, asciiUpper2Lower
|
| +from .constants import digits, hexDigits, EOF
|
| +from .constants import tokenTypes, tagTokenTypes
|
| +from .constants import replacementCharacters
|
| +
|
| +from .inputstream import HTMLInputStream
|
| +
|
| +from .trie import Trie
|
| +
|
# Trie over all named character references; used by consumeEntity for
# longest-prefix matching (e.g. resolving "&noti" as "&not" + "i").
entitiesTrie = Trie(entities)
|
| +
|
| +
|
| +class HTMLTokenizer(object):
|
| + """ This class takes care of tokenizing HTML.
|
| +
|
| + * self.currentToken
|
| + Holds the token that is currently being processed.
|
| +
|
| + * self.state
|
| + Holds a reference to the method to be invoked... XXX
|
| +
|
| + * self.stream
|
| + Points to HTMLInputStream object.
|
| + """
|
| +
|
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, parser=None):
        """Create a tokenizer over *stream*.

        stream -- the HTML input; wrapped in an HTMLInputStream.
        encoding -- optional character encoding override for the stream.
        parseMeta, useChardet -- forwarded to HTMLInputStream to control
            encoding detection.
        lowercaseElementName, lowercaseAttrName -- whether emitted tag and
            attribute names are lowercased.
        parser -- optional owning parser object; stored for later use.
        """
        self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
        self.parser = parser

        # Perform case conversions?
        self.lowercaseElementName = lowercaseElementName
        self.lowercaseAttrName = lowercaseAttrName

        # Setup the initial tokenizer state
        self.escapeFlag = False
        self.lastFourChars = []
        # self.state is always a bound state method; each call consumes
        # some input and returns True, or False at end of input.
        self.state = self.dataState
        self.escape = False

        # The current token being created
        self.currentToken = None
        super(HTMLTokenizer, self).__init__()
|
| +
|
    def __iter__(self):
        """ This is where the magic happens.

        We do our usual processing through the states and when we have a token
        to return we yield the token which pauses processing until the next token
        is requested.
        """
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            # Surface stream-level errors (e.g. invalid code points seen by
            # the input stream) as ParseError tokens first.
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            # Then drain every token the state function queued this step.
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
|
| +
|
    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others
        # (e.g. Windows-1252 remappings defined by the HTML5 spec).
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            # Surrogates and out-of-range code points become U+FFFD.
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Other disallowed code points (controls, noncharacters) are
            # reported as parse errors but still emitted as-is.
            # Should speed up this check somehow (e.g. move the set to a constant)
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                # Build a surrogate pair by hand for narrow builds.
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
|
| +
|
| + def consumeEntity(self, allowedChar=None, fromAttribute=False):
|
| + # Initialise to the default output for when no entity is matched
|
| + output = "&"
|
| +
|
| + charStack = [self.stream.char()]
|
| + if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
|
| + or (allowedChar is not None and allowedChar == charStack[0])):
|
| + self.stream.unget(charStack[0])
|
| +
|
| + elif charStack[0] == "#":
|
| + # Read the next character to see if it's hex or decimal
|
| + hex = False
|
| + charStack.append(self.stream.char())
|
| + if charStack[-1] in ("x", "X"):
|
| + hex = True
|
| + charStack.append(self.stream.char())
|
| +
|
| + # charStack[-1] should be the first digit
|
| + if (hex and charStack[-1] in hexDigits) \
|
| + or (not hex and charStack[-1] in digits):
|
| + # At least one digit found, so consume the whole number
|
| + self.stream.unget(charStack[-1])
|
| + output = self.consumeNumberEntity(hex)
|
| + else:
|
| + # No digits found
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "expected-numeric-entity"})
|
| + self.stream.unget(charStack.pop())
|
| + output = "&" + "".join(charStack)
|
| +
|
| + else:
|
| + # At this point in the process might have named entity. Entities
|
| + # are stored in the global variable "entities".
|
| + #
|
| + # Consume characters and compare to these to a substring of the
|
| + # entity names in the list until the substring no longer matches.
|
| + while (charStack[-1] is not EOF):
|
| + if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
|
| + break
|
| + charStack.append(self.stream.char())
|
| +
|
| + # At this point we have a string that starts with some characters
|
| + # that may match an entity
|
| + # Try to find the longest entity the string will match to take care
|
| + # of ¬i for instance.
|
| + try:
|
| + entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
|
| + entityLength = len(entityName)
|
| + except KeyError:
|
| + entityName = None
|
| +
|
| + if entityName is not None:
|
| + if entityName[-1] != ";":
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "named-entity-without-semicolon"})
|
| + if (entityName[-1] != ";" and fromAttribute and
|
| + (charStack[entityLength] in asciiLetters or
|
| + charStack[entityLength] in digits or
|
| + charStack[entityLength] == "=")):
|
| + self.stream.unget(charStack.pop())
|
| + output = "&" + "".join(charStack)
|
| + else:
|
| + output = entities[entityName]
|
| + self.stream.unget(charStack.pop())
|
| + output += "".join(charStack[entityLength:])
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "expected-named-entity"})
|
| + self.stream.unget(charStack.pop())
|
| + output = "&" + "".join(charStack)
|
| +
|
| + if fromAttribute:
|
| + self.currentToken["data"][-1][1] += output
|
| + else:
|
| + if output in spaceCharacters:
|
| + tokenType = "SpaceCharacters"
|
| + else:
|
| + tokenType = "Characters"
|
| + self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
|
| +
|
| + def processEntityInAttribute(self, allowedChar):
|
| + """This method replaces the need for "entityInAttributeValueState".
|
| + """
|
| + self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
|
| +
|
| + def emitCurrentToken(self):
|
| + """This method is a generic handler for emitting the tags. It also sets
|
| + the state to "data" because that's what's needed after a token has been
|
| + emitted.
|
| + """
|
| + token = self.currentToken
|
| + # Add token to the queue to be yielded
|
| + if (token["type"] in tagTokenTypes):
|
| + if self.lowercaseElementName:
|
| + token["name"] = token["name"].translate(asciiUpper2Lower)
|
| + if token["type"] == tokenTypes["EndTag"]:
|
| + if token["data"]:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "attributes-in-end-tag"})
|
| + if token["selfClosing"]:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "self-closing-flag-on-end-tag"})
|
| + self.tokenQueue.append(token)
|
| + self.state = self.dataState
|
| +
|
    # Below are the various tokenizer states worked out.
    def dataState(self):
        """Default "data" state: character data between tags."""
        data = self.stream.char()
        if data == "&":
            self.state = self.entityDataState
        elif data == "<":
            self.state = self.tagOpenState
        elif data == "\u0000":
            # U+0000 is a parse error but is passed through *unchanged* in
            # the data state (the RCDATA/RAWTEXT states substitute U+FFFD
            # instead).
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\u0000"})
        elif data is EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            # Bulk-consume ordinary characters for speed.
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
|
| +
|
| + def entityDataState(self):
|
| + self.consumeEntity()
|
| + self.state = self.dataState
|
| + return True
|
| +
|
    def rcdataState(self):
        """RCDATA state (e.g. <title>, <textarea> content): entities are
        resolved, but "<" only matters as a potential end tag."""
        data = self.stream.char()
        if data == "&":
            self.state = self.characterReferenceInRcdata
        elif data == "<":
            self.state = self.rcdataLessThanSignState
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            # Unlike the data state, RCDATA substitutes U+FFFD for U+0000.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            # Bulk-consume ordinary characters for speed.
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
|
| +
|
| + def characterReferenceInRcdata(self):
|
| + self.consumeEntity()
|
| + self.state = self.rcdataState
|
| + return True
|
| +
|
| + def rawtextState(self):
|
| + data = self.stream.char()
|
| + if data == "<":
|
| + self.state = self.rawtextLessThanSignState
|
| + elif data == "\u0000":
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "invalid-codepoint"})
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"],
|
| + "data": "\uFFFD"})
|
| + elif data == EOF:
|
| + # Tokenization ends.
|
| + return False
|
| + else:
|
| + chars = self.stream.charsUntil(("<", "\u0000"))
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
|
| + data + chars})
|
| + return True
|
| +
|
| + def scriptDataState(self):
|
| + data = self.stream.char()
|
| + if data == "<":
|
| + self.state = self.scriptDataLessThanSignState
|
| + elif data == "\u0000":
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "invalid-codepoint"})
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"],
|
| + "data": "\uFFFD"})
|
| + elif data == EOF:
|
| + # Tokenization ends.
|
| + return False
|
| + else:
|
| + chars = self.stream.charsUntil(("<", "\u0000"))
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
|
| + data + chars})
|
| + return True
|
| +
|
| + def plaintextState(self):
|
| + data = self.stream.char()
|
| + if data == EOF:
|
| + # Tokenization ends.
|
| + return False
|
| + elif data == "\u0000":
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "invalid-codepoint"})
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"],
|
| + "data": "\uFFFD"})
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
|
| + data + self.stream.charsUntil("\u0000")})
|
| + return True
|
| +
|
    def tagOpenState(self):
        """A "<" was seen in the data state: decide whether it opens a tag,
        a markup declaration, or is just literal text."""
        data = self.stream.char()
        if data == "!":
            # <!-- comments, <!DOCTYPE, <![CDATA[ ...
            self.state = self.markupDeclarationOpenState
        elif data == "/":
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            # Start of a start tag; begin a new StartTag token.
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data, "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
        elif data == ">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-right-bracket"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
            self.state = self.dataState
        elif data == "?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            # "<?" starts a bogus comment (e.g. XML processing instructions).
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-question-mark"})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        else:
            # XXX
            # Anything else: the "<" was literal text; reprocess data.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.dataState
        return True
|
| +
|
    def closeTagOpenState(self):
        """A "</" was seen in the data state: start an end tag if a letter
        follows, otherwise report an error."""
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                                 "data": [], "selfClosing": False}
            self.state = self.tagNameState
        elif data == ">":
            # "</>" is dropped entirely (no Characters token).
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-right-bracket"})
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-eof"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.state = self.dataState
        else:
            # XXX data can be _'_...
            # Any other character begins a bogus comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-char",
                                    "datavars": {"data": data}})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        return True
|
| +
|
    def tagNameState(self):
        """Accumulate the name of the tag token under construction."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data is EOF:
            # The partial tag is discarded; no token is emitted.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-tag-name"})
            self.state = self.dataState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
        else:
            self.currentToken["name"] += data
            # (Don't use charsUntil here, because tag names are
            # very short and it's faster to not do anything fancy)
        return True
|
| +
|
| + def rcdataLessThanSignState(self):
|
| + data = self.stream.char()
|
| + if data == "/":
|
| + self.temporaryBuffer = ""
|
| + self.state = self.rcdataEndTagOpenState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
|
| + self.stream.unget(data)
|
| + self.state = self.rcdataState
|
| + return True
|
| +
|
| + def rcdataEndTagOpenState(self):
|
| + data = self.stream.char()
|
| + if data in asciiLetters:
|
| + self.temporaryBuffer += data
|
| + self.state = self.rcdataEndTagNameState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
|
| + self.stream.unget(data)
|
| + self.state = self.rcdataState
|
| + return True
|
| +
|
| + def rcdataEndTagNameState(self):
|
| + appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
|
| + data = self.stream.char()
|
| + if data in spaceCharacters and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.state = self.beforeAttributeNameState
|
| + elif data == "/" and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.state = self.selfClosingStartTagState
|
| + elif data == ">" and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.emitCurrentToken()
|
| + self.state = self.dataState
|
| + elif data in asciiLetters:
|
| + self.temporaryBuffer += data
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"],
|
| + "data": "</" + self.temporaryBuffer})
|
| + self.stream.unget(data)
|
| + self.state = self.rcdataState
|
| + return True
|
| +
|
| + def rawtextLessThanSignState(self):
|
| + data = self.stream.char()
|
| + if data == "/":
|
| + self.temporaryBuffer = ""
|
| + self.state = self.rawtextEndTagOpenState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
|
| + self.stream.unget(data)
|
| + self.state = self.rawtextState
|
| + return True
|
| +
|
| + def rawtextEndTagOpenState(self):
|
| + data = self.stream.char()
|
| + if data in asciiLetters:
|
| + self.temporaryBuffer += data
|
| + self.state = self.rawtextEndTagNameState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
|
| + self.stream.unget(data)
|
| + self.state = self.rawtextState
|
| + return True
|
| +
|
| + def rawtextEndTagNameState(self):
|
| + appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
|
| + data = self.stream.char()
|
| + if data in spaceCharacters and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.state = self.beforeAttributeNameState
|
| + elif data == "/" and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.state = self.selfClosingStartTagState
|
| + elif data == ">" and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.emitCurrentToken()
|
| + self.state = self.dataState
|
| + elif data in asciiLetters:
|
| + self.temporaryBuffer += data
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"],
|
| + "data": "</" + self.temporaryBuffer})
|
| + self.stream.unget(data)
|
| + self.state = self.rawtextState
|
| + return True
|
| +
|
| + def scriptDataLessThanSignState(self):
|
| + data = self.stream.char()
|
| + if data == "/":
|
| + self.temporaryBuffer = ""
|
| + self.state = self.scriptDataEndTagOpenState
|
| + elif data == "!":
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
|
| + self.state = self.scriptDataEscapeStartState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataState
|
| + return True
|
| +
|
| + def scriptDataEndTagOpenState(self):
|
| + data = self.stream.char()
|
| + if data in asciiLetters:
|
| + self.temporaryBuffer += data
|
| + self.state = self.scriptDataEndTagNameState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataState
|
| + return True
|
| +
|
| + def scriptDataEndTagNameState(self):
|
| + appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
|
| + data = self.stream.char()
|
| + if data in spaceCharacters and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.state = self.beforeAttributeNameState
|
| + elif data == "/" and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.state = self.selfClosingStartTagState
|
| + elif data == ">" and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.emitCurrentToken()
|
| + self.state = self.dataState
|
| + elif data in asciiLetters:
|
| + self.temporaryBuffer += data
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"],
|
| + "data": "</" + self.temporaryBuffer})
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataState
|
| + return True
|
| +
|
| + def scriptDataEscapeStartState(self):
|
| + data = self.stream.char()
|
| + if data == "-":
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
|
| + self.state = self.scriptDataEscapeStartDashState
|
| + else:
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataState
|
| + return True
|
| +
|
| + def scriptDataEscapeStartDashState(self):
|
| + data = self.stream.char()
|
| + if data == "-":
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
|
| + self.state = self.scriptDataEscapedDashDashState
|
| + else:
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataState
|
| + return True
|
| +
|
    def scriptDataEscapedState(self):
        """Script data escaped state: inside "<!-- ... -->" in a script."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # EOF here ends tokenization on the next dataState call; note
            # no parse error is queued in this state.
            self.state = self.dataState
        else:
            # Bulk-consume the run of ordinary characters.
            chars = self.stream.charsUntil(("<", "-", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
|
| +
|
    def scriptDataEscapedDashState(self):
        """One "-" seen in the escaped state; a second "-" may close the
        escape ("-->" handling continues in the dash-dash state)."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            # Any other character resets the dash count.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True
|
| +
|
    def scriptDataEscapedDashDashState(self):
        """"--" seen in the escaped state; ">" now closes the escape and
        returns to plain script data."""
        data = self.stream.char()
        if data == "-":
            # Additional dashes keep us in this state.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == ">":
            # "-->" complete: escape ends.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            # Any other character resets the dash count.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True
|
| +
|
| + def scriptDataEscapedLessThanSignState(self):
|
| + data = self.stream.char()
|
| + if data == "/":
|
| + self.temporaryBuffer = ""
|
| + self.state = self.scriptDataEscapedEndTagOpenState
|
| + elif data in asciiLetters:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
|
| + self.temporaryBuffer = data
|
| + self.state = self.scriptDataDoubleEscapeStartState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataEscapedState
|
| + return True
|
| +
|
| + def scriptDataEscapedEndTagOpenState(self):
|
| + data = self.stream.char()
|
| + if data in asciiLetters:
|
| + self.temporaryBuffer = data
|
| + self.state = self.scriptDataEscapedEndTagNameState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataEscapedState
|
| + return True
|
| +
|
| + def scriptDataEscapedEndTagNameState(self):
|
| + appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
|
| + data = self.stream.char()
|
| + if data in spaceCharacters and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.state = self.beforeAttributeNameState
|
| + elif data == "/" and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.state = self.selfClosingStartTagState
|
| + elif data == ">" and appropriate:
|
| + self.currentToken = {"type": tokenTypes["EndTag"],
|
| + "name": self.temporaryBuffer,
|
| + "data": [], "selfClosing": False}
|
| + self.emitCurrentToken()
|
| + self.state = self.dataState
|
| + elif data in asciiLetters:
|
| + self.temporaryBuffer += data
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"],
|
| + "data": "</" + self.temporaryBuffer})
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataEscapedState
|
| + return True
|
| +
|
| + def scriptDataDoubleEscapeStartState(self):
|
| + data = self.stream.char()
|
| + if data in (spaceCharacters | frozenset(("/", ">"))):
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
|
| + if self.temporaryBuffer.lower() == "script":
|
| + self.state = self.scriptDataDoubleEscapedState
|
| + else:
|
| + self.state = self.scriptDataEscapedState
|
| + elif data in asciiLetters:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
|
| + self.temporaryBuffer += data
|
| + else:
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataEscapedState
|
| + return True
|
| +
|
    def scriptDataDoubleEscapedState(self):
        """Double-escaped state: a "<script>" nested inside an escaped
        ("<!-- ... -->") region of script data; all output is text."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashState
        elif data == "<":
            # "<" is emitted as text here, unlike the singly-escaped state.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        return True
|
| +
|
    def scriptDataDoubleEscapedDashState(self):
        """One "-" seen in the double-escaped state."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            # Any other character resets the dash count.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True
|
| +
|
    def scriptDataDoubleEscapedDashDashState(self):
        """"--" seen in the double-escaped state; ">" completes "-->" and
        returns to plain script data."""
        data = self.stream.char()
        if data == "-":
            # Additional dashes keep us in this state.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            # Any other character resets the dash count.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True
|
| +
|
| + def scriptDataDoubleEscapedLessThanSignState(self):
|
| + data = self.stream.char()
|
| + if data == "/":
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
|
| + self.temporaryBuffer = ""
|
| + self.state = self.scriptDataDoubleEscapeEndState
|
| + else:
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataDoubleEscapedState
|
| + return True
|
| +
|
| + def scriptDataDoubleEscapeEndState(self):
|
| + data = self.stream.char()
|
| + if data in (spaceCharacters | frozenset(("/", ">"))):
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
|
| + if self.temporaryBuffer.lower() == "script":
|
| + self.state = self.scriptDataEscapedState
|
| + else:
|
| + self.state = self.scriptDataDoubleEscapedState
|
| + elif data in asciiLetters:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
|
| + self.temporaryBuffer += data
|
| + else:
|
| + self.stream.unget(data)
|
| + self.state = self.scriptDataDoubleEscapedState
|
| + return True
|
| +
|
    def beforeAttributeNameState(self):
        """Before attribute name state: dispatch on the char after tag-name
        whitespace.  Letters (and most other chars) open a new attribute,
        stored as a [name, value] pair in ``self.currentToken["data"]``.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            # Skip the whole whitespace run in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            # Parse error, but the character still starts an attribute name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            # NUL becomes U+FFFD as the first name character.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
|
| +
|
    def attributeNameState(self):
        """Attribute name state: extend the current attribute's name.

        On leaving this state the name is lower-cased (if configured) and
        checked against earlier attributes for duplicates; ">" additionally
        emits the tag token.
        """
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            # Consume the whole run of letters at once for speed.
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            # Parse error, but the character is still part of the name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            if self.lowercaseAttrName:
                self.currentToken["data"][-1][0] = (
                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            # Report (but keep) duplicate attribute names.
            for name, value in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True
|
| +
|
    def afterAttributeNameState(self):
        """After attribute name state: decide between a value ("="), the end
        of the tag, or the start of another attribute.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            # Collapse the remaining whitespace run.
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            # A new attribute starts without an "=" for the previous one.
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            # Parse error, but the char still opens a new attribute name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
|
| +
|
    def beforeAttributeValueState(self):
        """Before attribute value state: pick quoted vs. unquoted value
        parsing based on the first character after "=".
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            # "&" belongs to the (unquoted) value itself; push it back so
            # the unquoted state can run entity processing on it.
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            # Parse error, but the character still starts the unquoted value.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True
|
| +
|
| + def attributeValueDoubleQuotedState(self):
|
| + data = self.stream.char()
|
| + if data == "\"":
|
| + self.state = self.afterAttributeValueState
|
| + elif data == "&":
|
| + self.processEntityInAttribute('"')
|
| + elif data == "\u0000":
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "invalid-codepoint"})
|
| + self.currentToken["data"][-1][1] += "\uFFFD"
|
| + elif data is EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "eof-in-attribute-value-double-quote"})
|
| + self.state = self.dataState
|
| + else:
|
| + self.currentToken["data"][-1][1] += data +\
|
| + self.stream.charsUntil(("\"", "&", "\u0000"))
|
| + return True
|
| +
|
| + def attributeValueSingleQuotedState(self):
|
| + data = self.stream.char()
|
| + if data == "'":
|
| + self.state = self.afterAttributeValueState
|
| + elif data == "&":
|
| + self.processEntityInAttribute("'")
|
| + elif data == "\u0000":
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "invalid-codepoint"})
|
| + self.currentToken["data"][-1][1] += "\uFFFD"
|
| + elif data is EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "eof-in-attribute-value-single-quote"})
|
| + self.state = self.dataState
|
| + else:
|
| + self.currentToken["data"][-1][1] += data +\
|
| + self.stream.charsUntil(("'", "&", "\u0000"))
|
| + return True
|
| +
|
    def attributeValueUnQuotedState(self):
        """Attribute value (unquoted) state: value ends at whitespace or ">".
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "&":
            # ">" is the additional allowed character for entity processing
            # in an unquoted value.
            self.processEntityInAttribute(">")
        elif data == ">":
            self.emitCurrentToken()
        elif data in ('"', "'", "=", "<", "`"):
            # Parse error, but the character still joins the value.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-no-quotes"})
            self.state = self.dataState
        else:
            # Bulk-append up to the next character that needs special
            # handling.
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
        return True
|
| +
|
| + def afterAttributeValueState(self):
|
| + data = self.stream.char()
|
| + if data in spaceCharacters:
|
| + self.state = self.beforeAttributeNameState
|
| + elif data == ">":
|
| + self.emitCurrentToken()
|
| + elif data == "/":
|
| + self.state = self.selfClosingStartTagState
|
| + elif data is EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "unexpected-EOF-after-attribute-value"})
|
| + self.stream.unget(data)
|
| + self.state = self.dataState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "unexpected-character-after-attribute-value"})
|
| + self.stream.unget(data)
|
| + self.state = self.beforeAttributeNameState
|
| + return True
|
| +
|
| + def selfClosingStartTagState(self):
|
| + data = self.stream.char()
|
| + if data == ">":
|
| + self.currentToken["selfClosing"] = True
|
| + self.emitCurrentToken()
|
| + elif data is EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data":
|
| + "unexpected-EOF-after-solidus-in-tag"})
|
| + self.stream.unget(data)
|
| + self.state = self.dataState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "unexpected-character-after-solidus-in-tag"})
|
| + self.stream.unget(data)
|
| + self.state = self.beforeAttributeNameState
|
| + return True
|
| +
|
| + def bogusCommentState(self):
|
| + # Make a new comment token and give it as value all the characters
|
| + # until the first > or EOF (charsUntil checks for EOF automatically)
|
| + # and emit it.
|
| + data = self.stream.charsUntil(">")
|
| + data = data.replace("\u0000", "\uFFFD")
|
| + self.tokenQueue.append(
|
| + {"type": tokenTypes["Comment"], "data": data})
|
| +
|
| + # Eat the character directly after the bogus comment which is either a
|
| + # ">" or an EOF.
|
| + self.stream.char()
|
| + self.state = self.dataState
|
| + return True
|
| +
|
    def markupDeclarationOpenState(self):
        """Markup declaration open state: after "<!", look ahead for "--"
        (comment), "DOCTYPE" (case-insensitive), or "[CDATA[" (only in
        foreign content).  On failure, every consumed character is pushed
        back and the input is reparsed as a bogus comment.
        """
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            # Try to match the rest of "DOCTYPE", either case.
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            # "[CDATA[" is only recognised inside foreign (SVG/MathML)
            # content, hence the tree checks above.
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        # Push back everything we consumed, in reverse, before reparsing.
        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
|
| +
|
    def commentStartState(self):
        """Comment start state: first character after the opening "<!--"."""
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            # "<!-->" is an (empty) incorrect comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True
|
| +
|
    def commentStartDashState(self):
        """Comment start dash state: "<!--" followed by a single "-"."""
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            # The pending "-" joins the comment along with U+FFFD.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            # "<!--->" is an incorrect comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Flush the pending "-" together with this character.
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
|
| +
|
    def commentState(self):
        """Comment state: accumulate comment text until a "-" might start
        the closing "-->".
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Bulk-append up to the next "-" or NUL.
            self.currentToken["data"] += data + \
                self.stream.charsUntil(("-", "\u0000"))
        return True
|
| +
|
    def commentEndDashState(self):
        """Comment end dash state: one "-" seen inside a comment."""
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            # The pending "-" joins the comment along with U+FFFD.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Not a closing sequence: flush the "-" and keep going.
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
|
| +
|
    def commentEndState(self):
        """Comment end state: "--" seen; ">" closes the comment, anything
        else re-enters the comment body with the dashes flushed.
        """
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            # The pending "--" joins the comment along with U+FFFD.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            # A third "-": report it and keep one dash pending.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True
|
| +
|
    def commentEndBangState(self):
        """Comment end bang state: "--!" seen inside a comment."""
        data = self.stream.char()
        if data == ">":
            # "--!>" still closes the comment (after the parse error
            # reported by the previous state).
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            # Flush "--!" and treat this "-" as a new potential end dash.
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True
|
| +
|
| + def doctypeState(self):
|
| + data = self.stream.char()
|
| + if data in spaceCharacters:
|
| + self.state = self.beforeDoctypeNameState
|
| + elif data is EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "expected-doctype-name-but-got-eof"})
|
| + self.currentToken["correct"] = False
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "need-space-after-doctype"})
|
| + self.stream.unget(data)
|
| + self.state = self.beforeDoctypeNameState
|
| + return True
|
| +
|
    def beforeDoctypeNameState(self):
        """Before DOCTYPE name state: skip whitespace, then start the name."""
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            # "<!DOCTYPE>" has no name and is marked incorrect (quirks).
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True
|
| +
|
    def doctypeNameState(self):
        """DOCTYPE name state: accumulate the name; it is ASCII-lowercased
        whenever this state is left.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            # Lowercase even on EOF so the emitted token is normalised.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True
|
| +
|
    def afterDoctypeNameState(self):
        """After DOCTYPE name state: look for the PUBLIC or SYSTEM keyword
        (case-insensitive) via direct lookahead; anything else makes the
        doctype bogus.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                # Try to match the rest of "PUBLIC", either case.
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                # Try to match the rest of "SYSTEM", either case.
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True
|
| +
|
| + def afterDoctypePublicKeywordState(self):
|
| + data = self.stream.char()
|
| + if data in spaceCharacters:
|
| + self.state = self.beforeDoctypePublicIdentifierState
|
| + elif data in ("'", '"'):
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "unexpected-char-in-doctype"})
|
| + self.stream.unget(data)
|
| + self.state = self.beforeDoctypePublicIdentifierState
|
| + elif data is EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "eof-in-doctype"})
|
| + self.currentToken["correct"] = False
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + else:
|
| + self.stream.unget(data)
|
| + self.state = self.beforeDoctypePublicIdentifierState
|
| + return True
|
| +
|
    def beforeDoctypePublicIdentifierState(self):
        """Before DOCTYPE public identifier state: a quote begins the
        identifier; anything else makes the doctype bogus.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
|
| +
|
    def doctypePublicIdentifierDoubleQuotedState(self):
        """DOCTYPE public identifier (double-quoted) state."""
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            # Unterminated identifier: ">" still closes the doctype.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True
|
| +
|
    def doctypePublicIdentifierSingleQuotedState(self):
        """DOCTYPE public identifier (single-quoted) state."""
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypePublicIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["publicId"] += "\uFFFD"
        elif data == ">":
            # Unterminated identifier: ">" still closes the doctype.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["publicId"] += data
        return True
|
| +
|
    def afterDoctypePublicIdentifierState(self):
        """After DOCTYPE public identifier state: a quote (with or without
        separating whitespace) starts the system identifier.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            # Missing whitespace before the system identifier: error but
            # still collect it.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
|
| +
|
| + def betweenDoctypePublicAndSystemIdentifiersState(self):
|
| + data = self.stream.char()
|
| + if data in spaceCharacters:
|
| + pass
|
| + elif data == ">":
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + elif data == '"':
|
| + self.currentToken["systemId"] = ""
|
| + self.state = self.doctypeSystemIdentifierDoubleQuotedState
|
| + elif data == "'":
|
| + self.currentToken["systemId"] = ""
|
| + self.state = self.doctypeSystemIdentifierSingleQuotedState
|
| + elif data == EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "eof-in-doctype"})
|
| + self.currentToken["correct"] = False
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "unexpected-char-in-doctype"})
|
| + self.currentToken["correct"] = False
|
| + self.state = self.bogusDoctypeState
|
| + return True
|
| +
|
| + def afterDoctypeSystemKeywordState(self):
|
| + data = self.stream.char()
|
| + if data in spaceCharacters:
|
| + self.state = self.beforeDoctypeSystemIdentifierState
|
| + elif data in ("'", '"'):
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "unexpected-char-in-doctype"})
|
| + self.stream.unget(data)
|
| + self.state = self.beforeDoctypeSystemIdentifierState
|
| + elif data is EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "eof-in-doctype"})
|
| + self.currentToken["correct"] = False
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + else:
|
| + self.stream.unget(data)
|
| + self.state = self.beforeDoctypeSystemIdentifierState
|
| + return True
|
| +
|
    def beforeDoctypeSystemIdentifierState(self):
        """Before DOCTYPE system identifier state: a quote begins the
        identifier; anything else makes the doctype bogus.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
|
| +
|
    def doctypeSystemIdentifierDoubleQuotedState(self):
        """DOCTYPE system identifier (double-quoted) state."""
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterDoctypeSystemIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["systemId"] += "\uFFFD"
        elif data == ">":
            # Unterminated identifier: ">" still closes the doctype.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["systemId"] += data
        return True
|
| +
|
    def doctypeSystemIdentifierSingleQuotedState(self):
        """DOCTYPE system identifier (single-quoted) state."""
        data = self.stream.char()
        if data == "'":
            self.state = self.afterDoctypeSystemIdentifierState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["systemId"] += "\uFFFD"
        elif data == ">":
            # Unterminated identifier: ">" still closes the doctype.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["systemId"] += data
        return True
|
| +
|
| + def afterDoctypeSystemIdentifierState(self):
|
| + data = self.stream.char()
|
| + if data in spaceCharacters:
|
| + pass
|
| + elif data == ">":
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + elif data is EOF:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "eof-in-doctype"})
|
| + self.currentToken["correct"] = False
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + else:
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
| + "unexpected-char-in-doctype"})
|
| + self.state = self.bogusDoctypeState
|
| + return True
|
| +
|
| + def bogusDoctypeState(self):
|
| + data = self.stream.char()
|
| + if data == ">":
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + elif data is EOF:
|
| + # XXX EMIT
|
| + self.stream.unget(data)
|
| + self.tokenQueue.append(self.currentToken)
|
| + self.state = self.dataState
|
| + else:
|
| + pass
|
| + return True
|
| +
|
| + def cdataSectionState(self):
|
| + data = []
|
| + while True:
|
| + data.append(self.stream.charsUntil("]"))
|
| + data.append(self.stream.charsUntil(">"))
|
| + char = self.stream.char()
|
| + if char == EOF:
|
| + break
|
| + else:
|
| + assert char == ">"
|
| + if data[-1][-2:] == "]]":
|
| + data[-1] = data[-1][:-2]
|
| + break
|
| + else:
|
| + data.append(char)
|
| +
|
| + data = "".join(data)
|
| + # Deal with null here rather than in the parser
|
| + nullCount = data.count("\u0000")
|
| + if nullCount > 0:
|
| + for i in range(nullCount):
|
| + self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
| + "data": "invalid-codepoint"})
|
| + data = data.replace("\u0000", "\uFFFD")
|
| + if data:
|
| + self.tokenQueue.append({"type": tokenTypes["Characters"],
|
| + "data": data})
|
| + self.state = self.dataState
|
| + return True
|
|
|