Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Unified Diff: pkg/third_party/html5lib/lib/src/tokenizer.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: change location of html5lib to pkg/third_party/html5lib Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: pkg/third_party/html5lib/lib/src/tokenizer.dart
diff --git a/pkg/third_party/html5lib/lib/src/tokenizer.dart b/pkg/third_party/html5lib/lib/src/tokenizer.dart
new file mode 100644
index 0000000000000000000000000000000000000000..ee867b7cc7cc549e94439ec7f68133b706e44154
--- /dev/null
+++ b/pkg/third_party/html5lib/lib/src/tokenizer.dart
@@ -0,0 +1,1900 @@
+library tokenizer;
+
+import 'dart:collection';
+import 'dart:math';
+import 'package:html5lib/parser.dart' show HtmlParser;
+import 'package:source_maps/span.dart' show Span, FileSpan;
+import 'constants.dart';
+import 'inputstream.dart';
+import 'token.dart';
+import 'utils.dart';
+
+// Group entities by their first character, for faster lookups
+
+// TODO(jmesserly): we could use a better data structure here like a trie, if
+// we had it implemented in Dart.
+/**
+ * Names of all known [entities], bucketed by their first character so that a
+ * lookup only has to scan the names sharing that character.
+ */
+Map<String, List<String>> entitiesByFirstChar = (() {
+  var buckets = {};
+  for (var name in entities.keys) {
+    var bucket = buckets.putIfAbsent(name[0], () => []);
+    bucket.add(name);
+  }
+  return buckets;
+})();
+
+// TODO(jmesserly): lots of ways to make this faster:
+// - use char codes everywhere instead of 1-char strings
+// - use switch instead of contains, indexOf
+// - use switch instead of the sequential if tests
+// - avoid string concat
+
+/**
+ * This class takes care of tokenizing HTML.
+ */
+class HtmlTokenizer implements Iterator<Token> {
+ // TODO(jmesserly): a lot of these could be made private
+
+ /** The input stream this tokenizer reads characters from. */
+ final HtmlInputStream stream;
+
+ /**
+ * True to pass tag names through [asciiUpper2Lower] when a tag token is
+ * emitted.
+ */
+ final bool lowercaseElementName;
+
+ /**
+ * True to pass attribute names through [asciiUpper2Lower] once the name is
+ * complete.
+ */
+ final bool lowercaseAttrName;
+
+ /** True to generate spans for [Token.span]. */
+ final bool generateSpans;
+
+ /** True to generate spans for attributes. */
+ final bool attributeSpans;
+
+ /**
+ * This reference to the parser is used for correct CDATA handling.
+ * The [HtmlParser] will set this at construction time.
+ */
+ HtmlParser parser;
+
+ /** Tokens produced by the states that [moveNext] has not returned yet. */
+ final Queue<Token> tokenQueue;
+
+ /** Holds the token that is currently being processed. */
+ Token currentToken;
+
+ /**
+ * Holds a reference to the method to be invoked for the next parser state.
+ */
+ Predicate state;
+
+ /**
+ * Scratch buffer in which the RCDATA, RAWTEXT and script data states
+ * accumulate a candidate tag name.
+ */
+ String temporaryBuffer;
+
+ /**
+ * Stream offset at which the span of the next token starts; advanced by
+ * [_addToken] for every token that is not a [ParseErrorToken].
+ */
+ int _lastOffset;
+
+ // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add
+ // an item until it's ready. But the code doesn't have a clear notion of when
+ // it's "done" with the attribute.
+ /** Attributes collected for the tag being tokenized; null when there are none. */
+ List<TagAttribute> _attributes;
+ /** Attribute names seen on the current tag, used to report duplicates. */
+ Set<String> _attributeNames;
+
+ /**
+ * Creates a tokenizer over [doc].
+ *
+ * [doc], [encoding], [parseMeta], [generateSpans] and [sourceUrl] are
+ * forwarded unchanged to the [HtmlInputStream] that backs [stream]. The
+ * remaining named arguments initialize the fields of the same name. The
+ * tokenizer starts out in [dataState] (see [reset]).
+ */
+ HtmlTokenizer(doc, {String encoding, bool parseMeta: true,
+ this.lowercaseElementName: true, this.lowercaseAttrName: true,
+ bool generateSpans: false, String sourceUrl, this.attributeSpans: false})
+ : stream = new HtmlInputStream(
+ doc, encoding, parseMeta, generateSpans, sourceUrl),
+ tokenQueue = new Queue(),
+ generateSpans = generateSpans {
+ reset();
+ }
+
+ /** [currentToken] typed as a [TagToken]. */
+ TagToken get currentTagToken {
+   return currentToken;
+ }
+
+ /** [currentToken] typed as a [DoctypeToken]. */
+ DoctypeToken get currentDoctypeToken {
+   return currentToken;
+ }
+
+ /** [currentToken] typed as a [StringToken]. */
+ StringToken get currentStringToken {
+   return currentToken;
+ }
+
+ /** The token most recently produced by [moveNext], or null at the end. */
+ Token _current;
+ Token get current {
+   return _current;
+ }
+
+ /** Name of the attribute currently being built (the last in [_attributes]). */
+ String get _attributeName {
+   return _attributes.last.name;
+ }
+ set _attributeName(String value) {
+   var attr = _attributes.last;
+   attr.name = value;
+ }
+
+ /** Value of the attribute currently being built (the last in [_attributes]). */
+ String get _attributeValue {
+   return _attributes.last.value;
+ }
+ set _attributeValue(String value) {
+   var attr = _attributes.last;
+   attr.value = value;
+ }
+
+ /**
+  * Records where the current attribute ends, [offset] characters from the
+  * stream position. Does nothing unless [attributeSpans] is set.
+  */
+ void _markAttributeEnd(int offset) {
+   if (!attributeSpans) return;
+   _attributes.last.end = stream.position + offset;
+ }
+
+ /**
+  * Records where the current attribute's value starts, [offset] characters
+  * from the stream position. Does nothing unless [attributeSpans] is set.
+  */
+ void _markAttributeValueStart(int offset) {
+   if (!attributeSpans) return;
+   _attributes.last.startValue = stream.position + offset;
+ }
+
+ /**
+  * Records where the current attribute's value ends, which is also where the
+  * attribute itself ends. Does nothing unless [attributeSpans] is set.
+  */
+ void _markAttributeValueEnd(int offset) {
+   if (!attributeSpans) return;
+   _attributes.last.endValue = stream.position + offset;
+   _markAttributeEnd(offset);
+ }
+
+ // Note: we could track the name span here, if we need it.
+ /** Marks the end of the attribute name; currently the attribute end. */
+ void _markAttributeNameEnd(int offset) {
+   _markAttributeEnd(offset);
+ }
+
+ /**
+  * Starts a new attribute called [name] on the current tag, creating the
+  * attribute list on first use. When [attributeSpans] is set the start
+  * offset is the stream position minus the length of [name].
+  */
+ void _addAttribute(String name) {
+   var attr = new TagAttribute(name);
+   if (_attributes == null) {
+     _attributes = [];
+   }
+   _attributes.add(attr);
+   if (!attributeSpans) return;
+   attr.start = stream.position - name.length;
+ }
+
+ /**
+  * Advances to the next token.
+  *
+  * Runs the current [state] repeatedly until either the stream has reported
+  * an error or a token has been queued. Stream errors take priority and are
+  * surfaced as [ParseErrorToken]s. Returns false, with [current] set to
+  * null, once a state reports the end of input.
+  */
+ bool moveNext() {
+   // A state returns false at EOF, which ends the iteration.
+   while (stream.errors.isEmpty && tokenQueue.isEmpty) {
+     if (state()) continue;
+     _current = null;
+     return false;
+   }
+   if (stream.errors.isEmpty) {
+     assert (tokenQueue.length > 0);
+     _current = tokenQueue.removeFirst();
+   } else {
+     _current = new ParseErrorToken(stream.errors.removeFirst());
+   }
+   return true;
+ }
+
+ /**
+  * Puts the tokenizer back into its initial state: empty queue, no current
+  * token or pending attributes, and [dataState] as the next state. The
+  * [stream] and the [parser] are left untouched.
+  */
+ void reset() {
+   state = dataState;
+   tokenQueue.clear();
+   currentToken = null;
+   temporaryBuffer = null;
+   _attributeNames = null;
+   _attributes = null;
+   _lastOffset = 0;
+ }
+
+ /**
+  * Queues [token]. When spans are enabled and the token has none yet, it
+  * gets the span from [_lastOffset] to the current stream position; tokens
+  * other than [ParseErrorToken] then move [_lastOffset] forward.
+  */
+ void _addToken(Token token) {
+   var needsSpan = generateSpans && token.span == null;
+   if (needsSpan) {
+     var end = stream.position;
+     token.span = new FileSpan(stream.fileInfo, _lastOffset, end);
+     if (token is! ParseErrorToken) _lastOffset = end;
+   }
+   tokenQueue.add(token);
+ }
+
+ /**
+  * Consumes the digits of a numeric character reference and returns the
+  * character it stands for.
+  *
+  * [isHex] selects hexadecimal rather than decimal digits. Code points
+  * listed in [replacementCharacters] are mapped to their replacement;
+  * surrogates and values above U+10FFFF become U+FFFD. A [ParseErrorToken]
+  * is queued for those cases and for control characters and noncharacters.
+  * A trailing ";" is discarded; if it is missing, a [ParseErrorToken] is
+  * queued and the character read instead is pushed back onto the stream.
+  */
+ String consumeNumberEntity(bool isHex) {
+   var allowed = isHex ? isHexDigit : isDigit;
+   var radix = isHex ? 16 : 10;
+
+   // Collect every digit. EOF is tested first so the digit predicate is
+   // never called with it.
+   var charStack = [];
+   var c = stream.char();
+   while (c != EOF && allowed(c)) {
+     charStack.add(c);
+     c = stream.char();
+   }
+
+   // Convert the set of characters consumed to an int.
+   var charAsInt = parseIntRadix(charStack.join(), radix);
+
+   // Certain characters get replaced with others.
+   var char = replacementCharacters[charAsInt];
+   var illegal = true;
+   if (char == null) {
+     if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) ||
+         (charAsInt > 0x10FFFF)) {
+       char = "\uFFFD";
+     } else {
+       // Here charAsInt is at most 0x10FFFF, so the noncharacters
+       // U+xxFFFE and U+xxFFFF of every plane are exactly the values whose
+       // low 16 bits are 0xFFFE or 0xFFFF. One mask replaces the table scan.
+       illegal = (0x0001 <= charAsInt && charAsInt <= 0x0008) ||
+           (0x000E <= charAsInt && charAsInt <= 0x001F) ||
+           (0x007F <= charAsInt && charAsInt <= 0x009F) ||
+           (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) ||
+           charAsInt == 0x000B ||
+           (charAsInt & 0xFFFE) == 0xFFFE;
+       char = new String.fromCharCodes([charAsInt]);
+     }
+   }
+   if (illegal) {
+     _addToken(new ParseErrorToken(
+         "illegal-codepoint-for-numeric-entity",
+         messageParams: {"charAsInt": charAsInt}));
+   }
+
+   // Discard the ; if present. Otherwise, put the character back and report
+   // the missing terminator.
+   if (c != ";") {
+     _addToken(new ParseErrorToken(
+         "numeric-entity-without-semicolon"));
+     stream.unget(c);
+   }
+   return char;
+ }
+
+ /**
+ * Consumes a character reference; the caller has already read the "&".
+ *
+ * Nothing is consumed when the next character is whitespace, "<", "&", EOF
+ * or [allowedChar]; the output is then a bare "&". "#" introduces a numeric
+ * reference, handled by [consumeNumberEntity]. Anything else is matched
+ * against the names in [entities], preferring the longest match. When no
+ * reference is recognized a [ParseErrorToken] is queued and the output is
+ * "&" followed by the characters that were consumed.
+ *
+ * If [fromAttribute] is true the output is appended to the current
+ * attribute value; otherwise it is queued as a [SpaceCharactersToken] or a
+ * [CharactersToken].
+ */
+ void consumeEntity({String allowedChar, bool fromAttribute: false}) {
+ // Initialise to the default output for when no entity is matched
+ var output = "&";
+
+ var charStack = [stream.char()];
+ if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&'
+ || charStack[0] == EOF || allowedChar == charStack[0]) {
+ // Not a character reference: push the character back and emit "&".
+ stream.unget(charStack[0]);
+ } else if (charStack[0] == "#") {
+ // Read the next character to see if it's hex or decimal
+ bool hex = false;
+ charStack.add(stream.char());
+ if (charStack.last == 'x' || charStack.last == 'X') {
+ hex = true;
+ charStack.add(stream.char());
+ }
+
+ // charStack.last should be the first digit
+ if (hex && isHexDigit(charStack.last) ||
+ (!hex && isDigit(charStack.last))) {
+ // At least one digit found, so consume the whole number
+ stream.unget(charStack.last);
+ output = consumeNumberEntity(hex);
+ } else {
+ // No digits found
+ _addToken(new ParseErrorToken("expected-numeric-entity"));
+ stream.unget(charStack.removeLast());
+ output = "&${charStack.join()}";
+ }
+ } else {
+ // At this point in the process might have named entity. Entities
+ // are stored in the global variable "entities".
+ //
+ // Consume characters and compare to these to a substring of the
+ // entity names in the list until the substring no longer matches.
+ var filteredEntityList = entitiesByFirstChar[charStack[0]];
+ if (filteredEntityList == null) filteredEntityList = const [];
+
+ // Narrow the candidates one character at a time; stop at EOF or as soon
+ // as no known name starts with what has been read.
+ while (charStack.last != EOF) {
+ var name = charStack.join();
+ filteredEntityList = filteredEntityList.where(
+ (e) => e.startsWith(name)).toList();
+
+ if (filteredEntityList.length == 0) {
+ break;
+ }
+ charStack.add(stream.char());
+ }
+
+ // At this point we have a string that starts with some characters
+ // that may match an entity
+ String entityName = null;
+
+ // Try to find the longest entity the string will match to take care
+ // of &noti for instance.
+
+ // The last element of charStack is the character that ended the scan, so
+ // candidate names stop one short of it.
+ int entityLen;
+ for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) {
+ var possibleEntityName = charStack.sublist(0, entityLen).join();
+ if (entities.containsKey(possibleEntityName)) {
+ entityName = possibleEntityName;
+ break;
+ }
+ }
+
+ if (entityName != null) {
+ var lastChar = entityName[entityName.length - 1];
+ if (lastChar != ";") {
+ _addToken(new ParseErrorToken(
+ "named-entity-without-semicolon"));
+ }
+ // In an attribute, an unterminated name followed by a letter, digit or
+ // "=" is kept as literal text instead of being expanded.
+ if (lastChar != ";" && fromAttribute &&
+ (isLetterOrDigit(charStack[entityLen]) ||
+ charStack[entityLen] == '=')) {
+ stream.unget(charStack.removeLast());
+ output = "&${charStack.join()}";
+ } else {
+ // Expand the entity and keep any characters read past its name.
+ output = entities[entityName];
+ stream.unget(charStack.removeLast());
+ output = '${output}${slice(charStack, entityLen).join()}';
+ }
+ } else {
+ _addToken(new ParseErrorToken("expected-named-entity"));
+ stream.unget(charStack.removeLast());
+ output = "&${charStack.join()}";
+ }
+ }
+ if (fromAttribute) {
+ _attributeValue = '$_attributeValue$output';
+ } else {
+ var token;
+ if (isWhitespace(output)) {
+ token = new SpaceCharactersToken(output);
+ } else {
+ token = new CharactersToken(output);
+ }
+ _addToken(token);
+ }
+ }
+
+ /**
+  * Consumes a character reference inside an attribute value, appending the
+  * result to the current attribute. Stands in for a separate
+  * "entityInAttributeValueState".
+  */
+ void processEntityInAttribute(String allowedChar) =>
+     consumeEntity(fromAttribute: true, allowedChar: allowedChar);
+
+ /**
+  * Queues [currentToken] and switches back to [dataState].
+  *
+  * For tag tokens the name is first lowercased if [lowercaseElementName] is
+  * set. End tags with attributes or a self-closing flag produce parse
+  * errors; start tags get their attribute list converted into a map in
+  * which the first occurrence of a name wins. Pending attribute state is
+  * cleared afterwards.
+  */
+ void emitCurrentToken() {
+   var token = currentToken;
+   if (token is TagToken) {
+     if (lowercaseElementName) {
+       token.name = asciiUpper2Lower(token.name);
+     }
+     var hasAttributes = _attributes != null;
+     if (token is EndTagToken) {
+       if (hasAttributes) {
+         _addToken(new ParseErrorToken("attributes-in-end-tag"));
+       }
+       if (token.selfClosing) {
+         _addToken(new ParseErrorToken("this-closing-flag-on-end-tag"));
+       }
+     } else if (token is StartTagToken) {
+       // HTML5 normalization: list -> map, keeping the first value per name.
+       var byName = new LinkedHashMap<Object, String>();
+       token.data = byName;
+       if (hasAttributes) {
+         for (var attr in _attributes) {
+           byName.putIfAbsent(attr.name, () => attr.value);
+         }
+         if (attributeSpans) token.attributeSpans = _attributes;
+       }
+     }
+     _attributes = null;
+     _attributeNames = null;
+   }
+   // Hand the token to the queue so moveNext can yield it.
+   _addToken(token);
+   state = dataState;
+ }
+
+ // Below are the various tokenizer states worked out.
+
+ /**
+  * Data state: "&" and "<" switch state, NUL is reported and emitted as is,
+  * and runs of whitespace or of other text are emitted as a single token.
+  * Returns false at EOF.
+  */
+ bool dataState() {
+   var ch = stream.char();
+   if (ch == "&") {
+     state = entityDataState;
+     return true;
+   }
+   if (ch == "<") {
+     state = tagOpenState;
+     return true;
+   }
+   if (ch == "\u0000") {
+     _addToken(new ParseErrorToken("invalid-codepoint"));
+     _addToken(new CharactersToken("\u0000"));
+     return true;
+   }
+   if (ch == EOF) {
+     // Tokenization ends.
+     return false;
+   }
+   if (isWhitespace(ch)) {
+     // The data state is re-entered right after every emitted token, and
+     // whitespace matters there, so it gets a token of its own.
+     var spaces = stream.charsUntil(spaceCharacters, true);
+     _addToken(new SpaceCharactersToken('$ch$spaces'));
+     return true;
+   }
+   var text = stream.charsUntil("&<\u0000");
+   _addToken(new CharactersToken('$ch$text'));
+   return true;
+ }
+
+ /** Consumes a character reference found in data, then resumes [dataState]. */
+ bool entityDataState() {
+   state = dataState;
+   consumeEntity();
+   return true;
+ }
+
+ /**
+  * RCDATA state: like [dataState], except that NUL is emitted as U+FFFD and
+  * "&" / "<" lead to the RCDATA-specific states. Returns false at EOF.
+  */
+ bool rcdataState() {
+   var ch = stream.char();
+   if (ch == "&") {
+     state = characterReferenceInRcdata;
+     return true;
+   }
+   if (ch == "<") {
+     state = rcdataLessThanSignState;
+     return true;
+   }
+   if (ch == EOF) {
+     // Tokenization ends.
+     return false;
+   }
+   if (ch == "\u0000") {
+     _addToken(new ParseErrorToken("invalid-codepoint"));
+     _addToken(new CharactersToken("\uFFFD"));
+     return true;
+   }
+   if (isWhitespace(ch)) {
+     // Whitespace is significant here, so it gets a token of its own.
+     var spaces = stream.charsUntil(spaceCharacters, true);
+     _addToken(new SpaceCharactersToken('$ch$spaces'));
+     return true;
+   }
+   var text = stream.charsUntil("&<");
+   _addToken(new CharactersToken('$ch$text'));
+   return true;
+ }
+
+ /** Consumes a character reference in RCDATA, then resumes [rcdataState]. */
+ bool characterReferenceInRcdata() {
+   state = rcdataState;
+   consumeEntity();
+   return true;
+ }
+
+ /**
+  * RAWTEXT state: only "<" is special; NUL is emitted as U+FFFD and all
+  * other text is passed through. Returns false at EOF.
+  */
+ bool rawtextState() {
+   var ch = stream.char();
+   if (ch == "<") {
+     state = rawtextLessThanSignState;
+     return true;
+   }
+   if (ch == "\u0000") {
+     _addToken(new ParseErrorToken("invalid-codepoint"));
+     _addToken(new CharactersToken("\uFFFD"));
+     return true;
+   }
+   if (ch == EOF) {
+     // Tokenization ends.
+     return false;
+   }
+   var text = stream.charsUntil("<\u0000");
+   _addToken(new CharactersToken("$ch$text"));
+   return true;
+ }
+
+ /**
+  * Script data state: only "<" is special; NUL is emitted as U+FFFD and
+  * all other text is passed through. Returns false at EOF.
+  */
+ bool scriptDataState() {
+   var ch = stream.char();
+   if (ch == "<") {
+     state = scriptDataLessThanSignState;
+     return true;
+   }
+   if (ch == "\u0000") {
+     _addToken(new ParseErrorToken("invalid-codepoint"));
+     _addToken(new CharactersToken("\uFFFD"));
+     return true;
+   }
+   if (ch == EOF) {
+     // Tokenization ends.
+     return false;
+   }
+   var text = stream.charsUntil("<\u0000");
+   _addToken(new CharactersToken("$ch$text"));
+   return true;
+ }
+
+ /**
+  * PLAINTEXT state: everything up to EOF is character data, with NUL
+  * emitted as U+FFFD. Returns false at EOF.
+  */
+ bool plaintextState() {
+   var ch = stream.char();
+   if (ch == EOF) {
+     // Tokenization ends.
+     return false;
+   }
+   if (ch == "\u0000") {
+     _addToken(new ParseErrorToken("invalid-codepoint"));
+     _addToken(new CharactersToken("\uFFFD"));
+     return true;
+   }
+   var text = stream.charsUntil("\u0000");
+   _addToken(new CharactersToken('$ch$text'));
+   return true;
+ }
+
+ /**
+  * Tag open state, entered after "<" in data. Decides between a markup
+  * declaration, an end tag, a start tag, a bogus comment, or literal text
+  * when what follows cannot begin a tag.
+  */
+ bool tagOpenState() {
+   var ch = stream.char();
+   if (ch == "!") {
+     state = markupDeclarationOpenState;
+   } else if (ch == "/") {
+     state = closeTagOpenState;
+   } else if (isLetter(ch)) {
+     state = tagNameState;
+     currentToken = new StartTagToken(ch);
+   } else if (ch == ">") {
+     // XXX In theory it could be something besides a tag name. But
+     // do we really care?
+     _addToken(new ParseErrorToken(
+         "expected-tag-name-but-got-right-bracket"));
+     _addToken(new CharactersToken("<>"));
+     state = dataState;
+   } else if (ch == "?") {
+     // XXX In theory it could be something besides a tag name. But
+     // do we really care?
+     _addToken(new ParseErrorToken(
+         "expected-tag-name-but-got-question-mark"));
+     stream.unget(ch);
+     state = bogusCommentState;
+   } else {
+     // XXX
+     // Not a tag after all: emit the "<" as text and reprocess in data.
+     _addToken(new ParseErrorToken("expected-tag-name"));
+     _addToken(new CharactersToken("<"));
+     stream.unget(ch);
+     state = dataState;
+   }
+   return true;
+ }
+
+ /**
+  * Close tag open state, entered after "</". A letter starts an end tag;
+  * ">" and EOF are parse errors that return to data; anything else is
+  * treated as a bogus comment.
+  */
+ bool closeTagOpenState() {
+   var ch = stream.char();
+   if (isLetter(ch)) {
+     state = tagNameState;
+     currentToken = new EndTagToken(ch);
+     return true;
+   }
+   if (ch == ">") {
+     _addToken(new ParseErrorToken(
+         "expected-closing-tag-but-got-right-bracket"));
+     state = dataState;
+     return true;
+   }
+   if (ch == EOF) {
+     _addToken(new ParseErrorToken(
+         "expected-closing-tag-but-got-eof"));
+     _addToken(new CharactersToken("</"));
+     state = dataState;
+     return true;
+   }
+   // XXX data can be _'_...
+   _addToken(new ParseErrorToken(
+       "expected-closing-tag-but-got-char", messageParams: {"data": ch}));
+   stream.unget(ch);
+   state = bogusCommentState;
+   return true;
+ }
+
+ /**
+  * Tag name state: appends characters to the current tag's name until
+  * whitespace, "/", ">" or EOF ends it. NUL is replaced by U+FFFD.
+  */
+ bool tagNameState() {
+   var ch = stream.char();
+   if (isWhitespace(ch)) {
+     state = beforeAttributeNameState;
+   } else if (ch == ">") {
+     emitCurrentToken();
+   } else if (ch == EOF) {
+     _addToken(new ParseErrorToken("eof-in-tag-name"));
+     state = dataState;
+   } else if (ch == "/") {
+     state = selfClosingStartTagState;
+   } else {
+     // Tag names are short, so they are built one character at a time
+     // rather than with charsUntil.
+     var piece = ch;
+     if (ch == "\u0000") {
+       _addToken(new ParseErrorToken("invalid-codepoint"));
+       piece = "\uFFFD";
+     }
+     currentTagToken.name = '${currentTagToken.name}$piece';
+   }
+   return true;
+ }
+
+ /**
+  * After "<" in RCDATA: "/" may start an end tag; anything else means the
+  * "<" was plain text.
+  */
+ bool rcdataLessThanSignState() {
+   var ch = stream.char();
+   if (ch != "/") {
+     _addToken(new CharactersToken("<"));
+     stream.unget(ch);
+     state = rcdataState;
+     return true;
+   }
+   temporaryBuffer = "";
+   state = rcdataEndTagOpenState;
+   return true;
+ }
+
+ /**
+  * After "</" in RCDATA: a letter starts a candidate end tag name;
+  * anything else means the "</" was plain text.
+  */
+ bool rcdataEndTagOpenState() {
+   var ch = stream.char();
+   if (!isLetter(ch)) {
+     _addToken(new CharactersToken("</"));
+     stream.unget(ch);
+     state = rcdataState;
+     return true;
+   }
+   temporaryBuffer = '$temporaryBuffer$ch';
+   state = rcdataEndTagNameState;
+   return true;
+ }
+
+ /**
+  * Whether [temporaryBuffer] names the same tag as [currentToken], compared
+  * case-insensitively. False when the current token is not a tag.
+  */
+ bool _tokenIsAppropriate() {
+   if (currentToken is! TagToken) return false;
+   var openName = currentTagToken.name.toLowerCase();
+   return openName == temporaryBuffer.toLowerCase();
+ }
+
+ /**
+  * Collects a candidate end tag name in RCDATA. If the name matches the
+  * current tag and is followed by whitespace, "/" or ">", a real end tag
+  * is produced; otherwise the text read so far is emitted as characters.
+  */
+ bool rcdataEndTagNameState() {
+   var matchesOpenTag = _tokenIsAppropriate();
+   var ch = stream.char();
+   if (matchesOpenTag && (isWhitespace(ch) || ch == "/" || ch == ">")) {
+     currentToken = new EndTagToken(temporaryBuffer);
+     if (ch == ">") {
+       emitCurrentToken();
+       state = dataState;
+     } else if (ch == "/") {
+       state = selfClosingStartTagState;
+     } else {
+       state = beforeAttributeNameState;
+     }
+     return true;
+   }
+   if (isLetter(ch)) {
+     temporaryBuffer = '$temporaryBuffer$ch';
+     return true;
+   }
+   _addToken(new CharactersToken("</$temporaryBuffer"));
+   stream.unget(ch);
+   state = rcdataState;
+   return true;
+ }
+
+ /**
+  * After "<" in RAWTEXT: "/" may start an end tag; anything else means the
+  * "<" was plain text.
+  */
+ bool rawtextLessThanSignState() {
+   var ch = stream.char();
+   if (ch != "/") {
+     _addToken(new CharactersToken("<"));
+     stream.unget(ch);
+     state = rawtextState;
+     return true;
+   }
+   temporaryBuffer = "";
+   state = rawtextEndTagOpenState;
+   return true;
+ }
+
+ /**
+  * After "</" in RAWTEXT: a letter starts a candidate end tag name;
+  * anything else means the "</" was plain text.
+  */
+ bool rawtextEndTagOpenState() {
+   var ch = stream.char();
+   if (!isLetter(ch)) {
+     _addToken(new CharactersToken("</"));
+     stream.unget(ch);
+     state = rawtextState;
+     return true;
+   }
+   temporaryBuffer = '$temporaryBuffer$ch';
+   state = rawtextEndTagNameState;
+   return true;
+ }
+
+ /**
+  * Collects a candidate end tag name in RAWTEXT. If the name matches the
+  * current tag and is followed by whitespace, "/" or ">", a real end tag
+  * is produced; otherwise the text read so far is emitted as characters.
+  */
+ bool rawtextEndTagNameState() {
+   var matchesOpenTag = _tokenIsAppropriate();
+   var ch = stream.char();
+   if (matchesOpenTag && (isWhitespace(ch) || ch == "/" || ch == ">")) {
+     currentToken = new EndTagToken(temporaryBuffer);
+     if (ch == ">") {
+       emitCurrentToken();
+       state = dataState;
+     } else if (ch == "/") {
+       state = selfClosingStartTagState;
+     } else {
+       state = beforeAttributeNameState;
+     }
+     return true;
+   }
+   if (isLetter(ch)) {
+     temporaryBuffer = '$temporaryBuffer$ch';
+     return true;
+   }
+   _addToken(new CharactersToken("</$temporaryBuffer"));
+   stream.unget(ch);
+   state = rawtextState;
+   return true;
+ }
+
+ /**
+  * After "<" in script data: "/" may start an end tag, "!" may start an
+  * escaped section (the "<!" is emitted as text), and anything else means
+  * the "<" was plain text.
+  */
+ bool scriptDataLessThanSignState() {
+   var ch = stream.char();
+   if (ch == "/") {
+     temporaryBuffer = "";
+     state = scriptDataEndTagOpenState;
+     return true;
+   }
+   if (ch == "!") {
+     _addToken(new CharactersToken("<!"));
+     state = scriptDataEscapeStartState;
+     return true;
+   }
+   _addToken(new CharactersToken("<"));
+   stream.unget(ch);
+   state = scriptDataState;
+   return true;
+ }
+
+ /**
+  * After "</" in script data: a letter starts a candidate end tag name;
+  * anything else means the "</" was plain text.
+  */
+ bool scriptDataEndTagOpenState() {
+   var ch = stream.char();
+   if (!isLetter(ch)) {
+     _addToken(new CharactersToken("</"));
+     stream.unget(ch);
+     state = scriptDataState;
+     return true;
+   }
+   temporaryBuffer = '$temporaryBuffer$ch';
+   state = scriptDataEndTagNameState;
+   return true;
+ }
+
+ /**
+  * Collects a candidate end tag name in script data. If the name matches
+  * the current tag and is followed by whitespace, "/" or ">", a real end
+  * tag is produced; otherwise the text read so far is emitted as characters.
+  */
+ bool scriptDataEndTagNameState() {
+   var matchesOpenTag = _tokenIsAppropriate();
+   var ch = stream.char();
+   if (matchesOpenTag && (isWhitespace(ch) || ch == "/" || ch == ">")) {
+     currentToken = new EndTagToken(temporaryBuffer);
+     if (ch == ">") {
+       emitCurrentToken();
+       state = dataState;
+     } else if (ch == "/") {
+       state = selfClosingStartTagState;
+     } else {
+       state = beforeAttributeNameState;
+     }
+     return true;
+   }
+   if (isLetter(ch)) {
+     temporaryBuffer = '$temporaryBuffer$ch';
+     return true;
+   }
+   _addToken(new CharactersToken("</$temporaryBuffer"));
+   stream.unget(ch);
+   state = scriptDataState;
+   return true;
+ }
+
+ /**
+  * After "<!" in script data: a "-" is emitted and continues towards an
+  * escaped section; anything else is reprocessed as script data.
+  */
+ bool scriptDataEscapeStartState() {
+   var ch = stream.char();
+   if (ch != "-") {
+     stream.unget(ch);
+     state = scriptDataState;
+     return true;
+   }
+   _addToken(new CharactersToken("-"));
+   state = scriptDataEscapeStartDashState;
+   return true;
+ }
+
+ /**
+  * After "<!-" in script data: a second "-" is emitted and enters the
+  * escaped section; anything else is reprocessed as script data.
+  */
+ bool scriptDataEscapeStartDashState() {
+   var ch = stream.char();
+   if (ch != "-") {
+     stream.unget(ch);
+     state = scriptDataState;
+     return true;
+   }
+   _addToken(new CharactersToken("-"));
+   state = scriptDataEscapedDashDashState;
+   return true;
+ }
+
+ /**
+  * Escaped script data: "-" and "<" switch state, NUL is emitted as
+  * U+FFFD, EOF returns to [dataState], and other text is passed through.
+  */
+ bool scriptDataEscapedState() {
+   var ch = stream.char();
+   if (ch == "-") {
+     _addToken(new CharactersToken("-"));
+     state = scriptDataEscapedDashState;
+     return true;
+   }
+   if (ch == "<") {
+     state = scriptDataEscapedLessThanSignState;
+     return true;
+   }
+   if (ch == "\u0000") {
+     _addToken(new ParseErrorToken("invalid-codepoint"));
+     _addToken(new CharactersToken("\uFFFD"));
+     return true;
+   }
+   if (ch == EOF) {
+     state = dataState;
+     return true;
+   }
+   var text = stream.charsUntil("<-\u0000");
+   _addToken(new CharactersToken("$ch$text"));
+   return true;
+ }
+
+ /**
+  * Escaped script data after a single "-". A second "-" moves to the
+  * dash-dash state, "<" may start a tag, EOF returns to [dataState], and
+  * any other character (NUL as U+FFFD) goes back to the escaped state.
+  */
+ bool scriptDataEscapedDashState() {
+   var ch = stream.char();
+   if (ch == "-") {
+     _addToken(new CharactersToken("-"));
+     state = scriptDataEscapedDashDashState;
+   } else if (ch == "<") {
+     state = scriptDataEscapedLessThanSignState;
+   } else if (ch == EOF) {
+     state = dataState;
+   } else {
+     if (ch == "\u0000") {
+       _addToken(new ParseErrorToken("invalid-codepoint"));
+       _addToken(new CharactersToken("\uFFFD"));
+     } else {
+       _addToken(new CharactersToken(ch));
+     }
+     state = scriptDataEscapedState;
+   }
+   return true;
+ }
+
+ /**
+  * Escaped script data after "--". Further dashes are emitted in place,
+  * ">" ends the escaped section, "<" may start a tag, EOF returns to
+  * [dataState], and any other character (NUL as U+FFFD) goes back to the
+  * escaped state.
+  */
+ bool scriptDataEscapedDashDashState() {
+   var ch = stream.char();
+   if (ch == "-") {
+     _addToken(new CharactersToken("-"));
+   } else if (ch == "<") {
+     state = scriptDataEscapedLessThanSignState;
+   } else if (ch == ">") {
+     _addToken(new CharactersToken(">"));
+     state = scriptDataState;
+   } else if (ch == EOF) {
+     state = dataState;
+   } else {
+     if (ch == "\u0000") {
+       _addToken(new ParseErrorToken("invalid-codepoint"));
+       _addToken(new CharactersToken("\uFFFD"));
+     } else {
+       _addToken(new CharactersToken(ch));
+     }
+     state = scriptDataEscapedState;
+   }
+   return true;
+ }
+
+ /**
+  * After "<" in escaped script data: "/" may start an end tag, a letter
+  * may start a nested script tag (double escaping), and anything else means
+  * the "<" was plain text.
+  */
+ bool scriptDataEscapedLessThanSignState() {
+   var ch = stream.char();
+   if (ch == "/") {
+     temporaryBuffer = "";
+     state = scriptDataEscapedEndTagOpenState;
+     return true;
+   }
+   if (isLetter(ch)) {
+     _addToken(new CharactersToken("<$ch"));
+     temporaryBuffer = ch;
+     state = scriptDataDoubleEscapeStartState;
+     return true;
+   }
+   _addToken(new CharactersToken("<"));
+   stream.unget(ch);
+   state = scriptDataEscapedState;
+   return true;
+ }
+
+ /**
+  * After "</" in escaped script data: a letter starts a candidate end tag
+  * name; anything else means the "</" was plain text.
+  */
+ bool scriptDataEscapedEndTagOpenState() {
+   var ch = stream.char();
+   if (!isLetter(ch)) {
+     _addToken(new CharactersToken("</"));
+     stream.unget(ch);
+     state = scriptDataEscapedState;
+     return true;
+   }
+   temporaryBuffer = ch;
+   state = scriptDataEscapedEndTagNameState;
+   return true;
+ }
+
+ /**
+  * Collects a candidate end tag name in escaped script data. If the name
+  * matches the current tag and is followed by whitespace, "/" or ">", a
+  * real end tag is produced; otherwise the text read so far is emitted as
+  * characters.
+  */
+ bool scriptDataEscapedEndTagNameState() {
+   var matchesOpenTag = _tokenIsAppropriate();
+   var ch = stream.char();
+   if (matchesOpenTag && (isWhitespace(ch) || ch == "/" || ch == ">")) {
+     currentToken = new EndTagToken(temporaryBuffer);
+     if (ch == ">") {
+       emitCurrentToken();
+       state = dataState;
+     } else if (ch == "/") {
+       state = selfClosingStartTagState;
+     } else {
+       state = beforeAttributeNameState;
+     }
+     return true;
+   }
+   if (isLetter(ch)) {
+     temporaryBuffer = '$temporaryBuffer$ch';
+     return true;
+   }
+   _addToken(new CharactersToken("</$temporaryBuffer"));
+   stream.unget(ch);
+   state = scriptDataEscapedState;
+   return true;
+ }
+
+ /**
+  * Collects the name of a tag opened inside escaped script data. When the
+  * name ends (whitespace, "/" or ">"), "script" enters the double-escaped
+  * state and any other name stays in the escaped state.
+  */
+ bool scriptDataDoubleEscapeStartState() {
+   var ch = stream.char();
+   if (isWhitespace(ch) || ch == "/" || ch == ">") {
+     _addToken(new CharactersToken(ch));
+     var isScript = temporaryBuffer.toLowerCase() == "script";
+     state = isScript ? scriptDataDoubleEscapedState : scriptDataEscapedState;
+     return true;
+   }
+   if (isLetter(ch)) {
+     _addToken(new CharactersToken(ch));
+     temporaryBuffer = '$temporaryBuffer$ch';
+     return true;
+   }
+   stream.unget(ch);
+   state = scriptDataEscapedState;
+   return true;
+ }
+
+ /**
+  * Double-escaped script data: "-" and "<" are emitted and switch state,
+  * NUL is emitted as U+FFFD, EOF is a parse error that returns to
+  * [dataState], and every other character is emitted as is.
+  */
+ bool scriptDataDoubleEscapedState() {
+   var ch = stream.char();
+   if (ch == "-") {
+     _addToken(new CharactersToken("-"));
+     state = scriptDataDoubleEscapedDashState;
+     return true;
+   }
+   if (ch == "<") {
+     _addToken(new CharactersToken("<"));
+     state = scriptDataDoubleEscapedLessThanSignState;
+     return true;
+   }
+   if (ch == "\u0000") {
+     _addToken(new ParseErrorToken("invalid-codepoint"));
+     _addToken(new CharactersToken("\uFFFD"));
+     return true;
+   }
+   if (ch == EOF) {
+     _addToken(new ParseErrorToken("eof-in-script-in-script"));
+     state = dataState;
+     return true;
+   }
+   _addToken(new CharactersToken(ch));
+   return true;
+ }
+
+ /**
+  * Double-escaped script data after a single "-". A second "-" moves to
+  * the dash-dash state, "<" may start an end tag, EOF is a parse error, and
+  * any other character (NUL as U+FFFD) goes back to the double-escaped state.
+  */
+ bool scriptDataDoubleEscapedDashState() {
+   var ch = stream.char();
+   if (ch == "-") {
+     _addToken(new CharactersToken("-"));
+     state = scriptDataDoubleEscapedDashDashState;
+   } else if (ch == "<") {
+     _addToken(new CharactersToken("<"));
+     state = scriptDataDoubleEscapedLessThanSignState;
+   } else if (ch == EOF) {
+     _addToken(new ParseErrorToken("eof-in-script-in-script"));
+     state = dataState;
+   } else {
+     if (ch == "\u0000") {
+       _addToken(new ParseErrorToken("invalid-codepoint"));
+       _addToken(new CharactersToken("\uFFFD"));
+     } else {
+       _addToken(new CharactersToken(ch));
+     }
+     state = scriptDataDoubleEscapedState;
+   }
+   return true;
+ }
+
+ // TODO(jmesserly): report bug in original code
+ // (was "Dash" instead of "DashDash")
+ /**
+  * Double-escaped script data after "--". Further dashes are emitted in
+  * place, ">" returns to plain script data, "<" may start an end tag, EOF
+  * is a parse error, and any other character (NUL as U+FFFD) goes back to
+  * the double-escaped state.
+  */
+ bool scriptDataDoubleEscapedDashDashState() {
+   var ch = stream.char();
+   if (ch == "-") {
+     _addToken(new CharactersToken("-"));
+   } else if (ch == "<") {
+     _addToken(new CharactersToken("<"));
+     state = scriptDataDoubleEscapedLessThanSignState;
+   } else if (ch == ">") {
+     _addToken(new CharactersToken(">"));
+     state = scriptDataState;
+   } else if (ch == EOF) {
+     _addToken(new ParseErrorToken("eof-in-script-in-script"));
+     state = dataState;
+   } else {
+     if (ch == "\u0000") {
+       _addToken(new ParseErrorToken("invalid-codepoint"));
+       _addToken(new CharactersToken("\uFFFD"));
+     } else {
+       _addToken(new CharactersToken(ch));
+     }
+     state = scriptDataDoubleEscapedState;
+   }
+   return true;
+ }
+
+ /**
+  * After "<" in double-escaped script data: "/" is emitted and may start
+  * the closing tag; anything else is reprocessed in the double-escaped
+  * state.
+  */
+ bool scriptDataDoubleEscapedLessThanSignState() {
+   var ch = stream.char();
+   if (ch != "/") {
+     stream.unget(ch);
+     state = scriptDataDoubleEscapedState;
+     return true;
+   }
+   _addToken(new CharactersToken("/"));
+   temporaryBuffer = "";
+   state = scriptDataDoubleEscapeEndState;
+   return true;
+ }
+
+ /**
+  * Collects the name of a closing tag inside double-escaped script data.
+  * When the name ends (whitespace, "/" or ">"), "script" drops back to the
+  * escaped state and any other name stays double-escaped.
+  */
+ bool scriptDataDoubleEscapeEndState() {
+   var ch = stream.char();
+   if (isWhitespace(ch) || ch == "/" || ch == ">") {
+     _addToken(new CharactersToken(ch));
+     var isScript = temporaryBuffer.toLowerCase() == "script";
+     state = isScript ? scriptDataEscapedState : scriptDataDoubleEscapedState;
+     return true;
+   }
+   if (isLetter(ch)) {
+     _addToken(new CharactersToken(ch));
+     temporaryBuffer = '$temporaryBuffer$ch';
+     return true;
+   }
+   stream.unget(ch);
+   state = scriptDataDoubleEscapedState;
+   return true;
+ }
+
+ /**
+  * Before an attribute name: skips whitespace, handles ">", "/" and EOF,
+  * and otherwise starts a new attribute with the character just read.
+  * Quotes, "=" and "<" are parse errors but still start a name; NUL is a
+  * parse error and starts the name with U+FFFD.
+  */
+ bool beforeAttributeNameState() {
+   var ch = stream.char();
+   if (isWhitespace(ch)) {
+     stream.charsUntil(spaceCharacters, true);
+     return true;
+   }
+   if (ch == ">") {
+     emitCurrentToken();
+     return true;
+   }
+   if (ch == "/") {
+     state = selfClosingStartTagState;
+     return true;
+   }
+   if (ch == EOF) {
+     _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));
+     state = dataState;
+     return true;
+   }
+
+   // Every remaining character begins an attribute name.
+   var first = ch;
+   if (!isLetter(ch)) {
+     if ("'\"=<".contains(ch)) {
+       _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
+     } else if (ch == "\u0000") {
+       _addToken(new ParseErrorToken("invalid-codepoint"));
+       first = "\uFFFD";
+     }
+   }
+   _addAttribute(first);
+   state = attributeNameState;
+   return true;
+ }
+
+ /**
+  * Attribute name state: extends the current attribute's name until "=",
+  * whitespace, "/", ">" or EOF ends it. Once the name is complete it is
+  * optionally lowercased and checked against the names already seen on
+  * this tag, reporting duplicates.
+  */
+ bool attributeNameState() {
+   var ch = stream.char();
+   var nameComplete = true;
+   var emitAfterwards = false;
+   if (ch == "=") {
+     state = beforeAttributeValueState;
+   } else if (isLetter(ch)) {
+     var letters = stream.charsUntil(asciiLetters, true);
+     _attributeName = '$_attributeName$ch$letters';
+     nameComplete = false;
+   } else if (ch == ">") {
+     // XXX Emitting right here would convert the attributes before the
+     // duplicate check below has run, so the emit is deferred.
+     emitAfterwards = true;
+   } else if (isWhitespace(ch)) {
+     state = afterAttributeNameState;
+   } else if (ch == "/") {
+     state = selfClosingStartTagState;
+   } else if (ch == "\u0000") {
+     _addToken(new ParseErrorToken("invalid-codepoint"));
+     _attributeName = '$_attributeName\uFFFD';
+     nameComplete = false;
+   } else if (ch == EOF) {
+     _addToken(new ParseErrorToken("eof-in-attribute-name"));
+     state = dataState;
+   } else {
+     if ("'\"<".contains(ch)) {
+       _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
+     }
+     _attributeName = '$_attributeName$ch';
+     nameComplete = false;
+   }
+
+   if (!nameComplete) return true;
+
+   _markAttributeNameEnd(-1);
+
+   // Attributes are not dropped at this stage. That happens when the
+   // start tag token is emitted so values can still be safely appended
+   // to attributes, but we do want to report the parse error in time.
+   if (lowercaseAttrName) {
+     _attributeName = asciiUpper2Lower(_attributeName);
+   }
+   if (_attributeNames == null) _attributeNames = new Set();
+   if (_attributeNames.contains(_attributeName)) {
+     _addToken(new ParseErrorToken("duplicate-attribute"));
+   }
+   _attributeNames.add(_attributeName);
+
+   // XXX Fix for above XXX
+   if (emitAfterwards) emitCurrentToken();
+   return true;
+ }
+
+  /**
+   * Handles the character that follows an attribute name and the whitespace
+   * after it: "=" leads to the value, ">" emits the tag, "/" goes to the
+   * self-closing state, EOF is an error, and anything else starts a new
+   * attribute. Always returns true.
+   */
+  bool afterAttributeNameState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) {
+      stream.charsUntil(spaceCharacters, true);
+      return true;
+    }
+    if (ch == "=") {
+      state = beforeAttributeValueState;
+      return true;
+    }
+    if (ch == ">") {
+      emitCurrentToken();
+      return true;
+    }
+    if (isLetter(ch)) {
+      _addAttribute(ch);
+      state = attributeNameState;
+      return true;
+    }
+    if (ch == "/") {
+      state = selfClosingStartTagState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      _addAttribute("\uFFFD");
+      state = attributeNameState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof"));
+      state = dataState;
+      return true;
+    }
+
+    // Any remaining character opens a new attribute; quotes and "<" are
+    // reported as a parse error first.
+    if ("'\"<".contains(ch)) {
+      _addToken(new ParseErrorToken("invalid-character-after-attribute-name"));
+    }
+    _addAttribute(ch);
+    state = attributeNameState;
+    return true;
+  }
+
+  /**
+   * Handles the first character after the "=" of an attribute: picks the
+   * double-quoted, single-quoted or unquoted value state and marks where
+   * the value starts. Always returns true.
+   */
+  bool beforeAttributeValueState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) {
+      stream.charsUntil(spaceCharacters, true);
+      return true;
+    }
+    if (ch == "\"") {
+      _markAttributeValueStart(0);
+      state = attributeValueDoubleQuotedState;
+      return true;
+    }
+    if (ch == "&") {
+      // Push the "&" back so the unquoted state handles it.
+      state = attributeValueUnQuotedState;
+      stream.unget(ch);
+      _markAttributeValueStart(0);
+      return true;
+    }
+    if (ch == "'") {
+      _markAttributeValueStart(0);
+      state = attributeValueSingleQuotedState;
+      return true;
+    }
+    if (ch == ">") {
+      _addToken(new ParseErrorToken(
+          "expected-attribute-value-but-got-right-bracket"));
+      emitCurrentToken();
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      _markAttributeValueStart(-1);
+      _attributeValue = '${_attributeValue}\uFFFD';
+      state = attributeValueUnQuotedState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof"));
+      state = dataState;
+      return true;
+    }
+
+    // Everything else is the first character of an unquoted value; "=", "<"
+    // and "`" are reported as a parse error first.
+    if ("=<`".contains(ch)) {
+      _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value"));
+    }
+    _markAttributeValueStart(-1);
+    _attributeValue = '$_attributeValue$ch';
+    state = attributeValueUnQuotedState;
+    return true;
+  }
+
+  /**
+   * Consumes characters of a double-quoted attribute value.
+   *
+   * The closing quote marks the end of the value and of the attribute, "&"
+   * is handed to [processEntityInAttribute], NUL is reported and replaced
+   * by U+FFFD, and EOF is a parse error that returns to the data state.
+   * Other characters are appended to [_attributeValue]. Always returns true.
+   */
+  bool attributeValueDoubleQuotedState() {
+    var data = stream.char();
+    if (data == "\"") {
+      _markAttributeValueEnd(-1);
+      _markAttributeEnd(0);
+      state = afterAttributeValueState;
+    } else if (data == "&") {
+      processEntityInAttribute('"');
+    } else if (data == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      _attributeValue = '${_attributeValue}\uFFFD';
+    } else if (data == EOF) {
+      _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote"));
+      _markAttributeValueEnd(-1);
+      state = dataState;
+    } else {
+      // The bulk read must also stop at NUL. Otherwise a NUL that follows an
+      // ordinary character is copied into the value verbatim and never
+      // reaches the "\u0000" branch above.
+      _attributeValue =
+          '$_attributeValue$data${stream.charsUntil("\"&\u0000")}';
+    }
+    return true;
+  }
+
+  /**
+   * Consumes characters of a single-quoted attribute value.
+   *
+   * The closing quote marks the end of the value and of the attribute, "&"
+   * is handed to [processEntityInAttribute], NUL is reported and replaced
+   * by U+FFFD, and EOF is a parse error that returns to the data state.
+   * Other characters are appended to [_attributeValue]. Always returns true.
+   */
+  bool attributeValueSingleQuotedState() {
+    var data = stream.char();
+    if (data == "'") {
+      _markAttributeValueEnd(-1);
+      _markAttributeEnd(0);
+      state = afterAttributeValueState;
+    } else if (data == "&") {
+      processEntityInAttribute("'");
+    } else if (data == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      _attributeValue = '${_attributeValue}\uFFFD';
+    } else if (data == EOF) {
+      _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote"));
+      _markAttributeValueEnd(-1);
+      state = dataState;
+    } else {
+      // The bulk read must also stop at NUL. Otherwise a NUL that follows an
+      // ordinary character is copied into the value verbatim and never
+      // reaches the "\u0000" branch above.
+      _attributeValue =
+          '$_attributeValue$data${stream.charsUntil("\'&\u0000")}';
+    }
+    return true;
+  }
+
+  /**
+   * Consumes characters of an unquoted attribute value.
+   *
+   * Whitespace and ">" end the value (">" also emits the tag), "&" is handed
+   * to [processEntityInAttribute], EOF is a parse error that returns to the
+   * data state, quotes, "=", "<" and "`" are appended after a parse error,
+   * and NUL is reported and replaced by U+FFFD. Always returns true.
+   */
+  bool attributeValueUnQuotedState() {
+    var data = stream.char();
+    if (isWhitespace(data)) {
+      _markAttributeValueEnd(-1);
+      state = beforeAttributeNameState;
+    } else if (data == "&") {
+      processEntityInAttribute(">");
+    } else if (data == ">") {
+      _markAttributeValueEnd(-1);
+      emitCurrentToken();
+    } else if (data == EOF) {
+      _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes"));
+      _markAttributeValueEnd(-1);
+      state = dataState;
+    } else if ('"\'=<`'.contains(data)) {
+      _addToken(new ParseErrorToken(
+          "unexpected-character-in-unquoted-attribute-value"));
+      _attributeValue = '$_attributeValue$data';
+    } else if (data == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      _attributeValue = '${_attributeValue}\uFFFD';
+    } else {
+      // The bulk read must also stop at NUL. Otherwise a NUL that follows an
+      // ordinary character is copied into the value verbatim and never
+      // reaches the "\u0000" branch above.
+      _attributeValue = '$_attributeValue$data'
+          '${stream.charsUntil("&>\"\'=<`\u0000$spaceCharacters")}';
+    }
+    return true;
+  }
+
+  /**
+   * Handles the character right after a quoted attribute value. Anything
+   * other than whitespace, ">" or "/" is a parse error and is pushed back
+   * onto the stream for the next state. Always returns true.
+   */
+  bool afterAttributeValueState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) {
+      state = beforeAttributeNameState;
+      return true;
+    }
+    if (ch == ">") {
+      emitCurrentToken();
+      return true;
+    }
+    if (ch == "/") {
+      state = selfClosingStartTagState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("unexpected-EOF-after-attribute-value"));
+      stream.unget(ch);
+      state = dataState;
+      return true;
+    }
+    _addToken(new ParseErrorToken(
+        "unexpected-character-after-attribute-value"));
+    stream.unget(ch);
+    state = beforeAttributeNameState;
+    return true;
+  }
+
+  /**
+   * Handles the character after a "/" inside a tag. Only ">" completes a
+   * self-closing tag; anything else is a parse error and is pushed back onto
+   * the stream. Always returns true.
+   */
+  bool selfClosingStartTagState() {
+    var ch = stream.char();
+    if (ch == ">") {
+      currentTagToken.selfClosing = true;
+      emitCurrentToken();
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("unexpected-EOF-after-solidus-in-tag"));
+      stream.unget(ch);
+      state = dataState;
+      return true;
+    }
+    _addToken(new ParseErrorToken(
+        "unexpected-character-after-soldius-in-tag"));
+    stream.unget(ch);
+    state = beforeAttributeNameState;
+    return true;
+  }
+
+  /**
+   * Emits a comment token holding everything up to the next ">" (or EOF),
+   * with NUL characters replaced by U+FFFD, then returns to the data state.
+   * Always returns true.
+   */
+  bool bogusCommentState() {
+    // charsUntil stops at EOF on its own, so no explicit EOF test is needed.
+    var text = stream.charsUntil(">").replaceAll("\u0000", "\uFFFD");
+    _addToken(new CommentToken(text));
+
+    // Discard the terminator that follows the comment: ">" or EOF.
+    stream.char();
+    state = dataState;
+    return true;
+  }
+
+  /**
+   * Decides what follows "<!": a comment ("--"), a doctype (the word
+   * "doctype" in any letter case) or, when the innermost open element is not
+   * in the default namespace, a CDATA section ("[CDATA[").
+   *
+   * If none matches, a parse error is reported, every character read here is
+   * pushed back onto the stream and the tokenizer switches to the bogus
+   * comment state. Always returns true.
+   */
+  bool markupDeclarationOpenState() {
+    var consumed = [stream.char()];
+    var first = consumed.last;
+    if (first == "-") {
+      consumed.add(stream.char());
+      if (consumed.last == "-") {
+        currentToken = new CommentToken("");
+        state = commentStartState;
+        return true;
+      }
+    } else if (first == 'd' || first == 'D') {
+      var isDoctype = true;
+      for (var choices in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) {
+        var next = stream.char();
+        consumed.add(next);
+        if (next == EOF || !choices.contains(next)) {
+          isDoctype = false;
+          break;
+        }
+      }
+      if (isDoctype) {
+        currentToken = new DoctypeToken(correct: true);
+        state = doctypeState;
+        return true;
+      }
+    } else if (first == "[" &&
+        parser != null && parser.tree.openElements.length > 0 &&
+        parser.tree.openElements.last.namespace
+            != parser.tree.defaultNamespace) {
+      var isCdata = true;
+      for (var wanted in const ["C", "D", "A", "T", "A", "["]) {
+        var next = stream.char();
+        consumed.add(next);
+        if (next != wanted) {
+          isCdata = false;
+          break;
+        }
+      }
+      if (isCdata) {
+        state = cdataSectionState;
+        return true;
+      }
+    }
+
+    _addToken(new ParseErrorToken("expected-dashes-or-doctype"));
+
+    // Push everything back, most recently read character first.
+    for (var i = consumed.length - 1; i >= 0; i--) {
+      stream.unget(consumed[i]);
+    }
+    state = bogusCommentState;
+    return true;
+  }
+
+  /**
+   * Handles the first character after "<!--".
+   *
+   * "-" moves to the comment-start-dash state, ">" and EOF emit the (empty)
+   * comment with a parse error, NUL is reported and stored as U+FFFD, and
+   * any other character becomes the first character of the comment data.
+   * Always returns true.
+   */
+  bool commentStartState() {
+    var data = stream.char();
+    if (data == "-") {
+      state = commentStartDashState;
+    } else if (data == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentStringToken.data = '${currentStringToken.data}\uFFFD';
+      // The replacement character is comment data like any other, so the
+      // tokenizer has to leave the start state. Staying here would let a
+      // following "->" or ">" close the comment early.
+      state = commentState;
+    } else if (data == ">") {
+      _addToken(new ParseErrorToken("incorrect-comment"));
+      _addToken(currentToken);
+      state = dataState;
+    } else if (data == EOF) {
+      _addToken(new ParseErrorToken("eof-in-comment"));
+      _addToken(currentToken);
+      state = dataState;
+    } else {
+      currentStringToken.data = '${currentStringToken.data}$data';
+      state = commentState;
+    }
+    return true;
+  }
+
+  /**
+   * Handles the character after "<!---" (a comment that starts with a dash).
+   *
+   * A second "-" moves to the comment-end state, ">" and EOF emit the
+   * comment with a parse error, and anything else is appended to the comment
+   * data after the pending "-" (NUL as U+FFFD, with a parse error).
+   * Always returns true.
+   */
+  bool commentStartDashState() {
+    var data = stream.char();
+    if (data == "-") {
+      state = commentEndState;
+    } else if (data == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentStringToken.data = '${currentStringToken.data}-\uFFFD';
+      // The pending dash has been written to the comment data, so the
+      // tokenizer must continue in the plain comment state. Staying here
+      // would treat the next "-" or ">" as if the dash were still pending.
+      state = commentState;
+    } else if (data == ">") {
+      _addToken(new ParseErrorToken("incorrect-comment"));
+      _addToken(currentToken);
+      state = dataState;
+    } else if (data == EOF) {
+      _addToken(new ParseErrorToken("eof-in-comment"));
+      _addToken(currentToken);
+      state = dataState;
+    } else {
+      currentStringToken.data = '${currentStringToken.data}-${data}';
+      state = commentState;
+    }
+    return true;
+  }
+
+  /**
+   * Consumes comment data. "-" may start the end of the comment, NUL is
+   * reported and stored as U+FFFD, EOF emits the comment with a parse error,
+   * and other text is appended in bulk up to the next "-" or NUL.
+   * Always returns true.
+   */
+  bool commentState() {
+    var ch = stream.char();
+    if (ch == "-") {
+      state = commentEndDashState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentStringToken.data = '${currentStringToken.data}\uFFFD';
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-comment"));
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentStringToken.data = '${currentStringToken.data}$ch'
+        '${stream.charsUntil("-\u0000")}';
+    return true;
+  }
+
+  /**
+   * Handles the character after a single "-" inside a comment. A second "-"
+   * moves to the comment-end state; otherwise the pending dash and the
+   * character (NUL as U+FFFD) are appended and the comment continues.
+   * Always returns true.
+   */
+  bool commentEndDashState() {
+    var ch = stream.char();
+    if (ch == "-") {
+      state = commentEndState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentStringToken.data = "${currentStringToken.data}-\uFFFD";
+      state = commentState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-comment-end-dash"));
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentStringToken.data = "${currentStringToken.data}-${ch}";
+    state = commentState;
+    return true;
+  }
+
+  /**
+   * Handles the character after "--" inside a comment. ">" emits the
+   * comment; "!" and a further "-" are parse errors with their own handling;
+   * any other character is a parse error and is appended after the two
+   * pending dashes. Always returns true.
+   */
+  bool commentEndState() {
+    var ch = stream.char();
+    if (ch == ">") {
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentStringToken.data = '${currentStringToken.data}--\uFFFD';
+      state = commentState;
+      return true;
+    }
+    if (ch == "!") {
+      _addToken(new ParseErrorToken(
+          "unexpected-bang-after-double-dash-in-comment"));
+      state = commentEndBangState;
+      return true;
+    }
+    if (ch == "-") {
+      // Stay in this state: the extra dash is data and "--" is still pending.
+      _addToken(new ParseErrorToken(
+          "unexpected-dash-after-double-dash-in-comment"));
+      currentStringToken.data = '${currentStringToken.data}$ch';
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-comment-double-dash"));
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    // XXX
+    _addToken(new ParseErrorToken("unexpected-char-in-comment"));
+    currentStringToken.data = "${currentStringToken.data}--${ch}";
+    state = commentState;
+    return true;
+  }
+
+  /**
+   * Handles the character after "--!" inside a comment. ">" emits the
+   * comment; otherwise "--!" is written to the comment data and the comment
+   * continues. Always returns true.
+   */
+  bool commentEndBangState() {
+    var ch = stream.char();
+    if (ch == ">") {
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == "-") {
+      currentStringToken.data = '${currentStringToken.data}--!';
+      state = commentEndDashState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentStringToken.data = '${currentStringToken.data}--!\uFFFD';
+      state = commentState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-comment-end-bang-state"));
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentStringToken.data = "${currentStringToken.data}--!${ch}";
+    state = commentState;
+    return true;
+  }
+
+  /**
+   * Handles the character right after the "doctype" keyword. Whitespace is
+   * expected; EOF emits an incorrect doctype, and any other character is a
+   * parse error and is pushed back for the next state. Always returns true.
+   */
+  bool doctypeState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) {
+      state = beforeDoctypeNameState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken(
+          "expected-doctype-name-but-got-eof"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    _addToken(new ParseErrorToken("need-space-after-doctype"));
+    stream.unget(ch);
+    state = beforeDoctypeNameState;
+    return true;
+  }
+
+  /**
+   * Skips whitespace before the doctype name and stores the first character
+   * of the name (NUL as U+FFFD). ">" or EOF at this point emits an incorrect
+   * doctype. Always returns true.
+   */
+  bool beforeDoctypeNameState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) return true;
+
+    if (ch == ">") {
+      _addToken(new ParseErrorToken(
+          "expected-doctype-name-but-got-right-bracket"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentDoctypeToken.name = "\uFFFD";
+      state = doctypeNameState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken(
+          "expected-doctype-name-but-got-eof"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentDoctypeToken.name = ch;
+    state = doctypeNameState;
+    return true;
+  }
+
+  /**
+   * Accumulates the doctype name. The name is lowercased with
+   * [asciiUpper2Lower] once it ends (whitespace, ">" or EOF); ">" and EOF
+   * also emit the doctype token. Always returns true.
+   */
+  bool doctypeNameState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) {
+      currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
+      state = afterDoctypeNameState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentDoctypeToken.name = "${currentDoctypeToken.name}\uFFFD";
+      state = doctypeNameState;
+      return true;
+    }
+    if (ch == ">" || ch == EOF) {
+      // Both endings emit the token; EOF additionally flags it as incorrect.
+      if (ch == EOF) {
+        _addToken(new ParseErrorToken("eof-in-doctype-name"));
+        currentDoctypeToken.correct = false;
+      }
+      currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentDoctypeToken.name = '${currentDoctypeToken.name}$ch';
+    return true;
+  }
+
+  /**
+   * Handles what follows the doctype name: skips whitespace, emits on ">",
+   * and recognises the keywords "public" and "system" in any letter case.
+   * Anything else is a parse error that switches to the bogus doctype state.
+   * Always returns true.
+   */
+  bool afterDoctypeNameState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) return true;
+
+    if (ch == ">") {
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == EOF) {
+      currentDoctypeToken.correct = false;
+      stream.unget(ch);
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+
+    // Pick the remaining keyword letters and the state to enter on a match.
+    var remainingLetters = null;
+    var keywordState = null;
+    if (ch == "p" || ch == "P") {
+      remainingLetters = const ["uU", "bB", "lL", "iI", "cC"];
+      keywordState = afterDoctypePublicKeywordState;
+    } else if (ch == "s" || ch == "S") {
+      remainingLetters = const ["yY", "sS", "tT", "eE", "mM"];
+      keywordState = afterDoctypeSystemKeywordState;
+    }
+    if (remainingLetters != null) {
+      var matched = true;
+      for (var expected in remainingLetters) {
+        ch = stream.char();
+        if (ch == EOF || !expected.contains(ch)) {
+          matched = false;
+          break;
+        }
+      }
+      if (matched) {
+        state = keywordState;
+        return true;
+      }
+    }
+
+    // Every character read before the current one matched a keyword letter,
+    // so it is garbage in the bogus doctype and can be dropped. Only the
+    // latest character might be ">" or EOF and has to be pushed back.
+    stream.unget(ch);
+    _addToken(new ParseErrorToken(
+        "expected-space-or-right-bracket-in-doctype",
+        messageParams: {"data": ch}));
+    currentDoctypeToken.correct = false;
+    state = bogusDoctypeState;
+    return true;
+  }
+
+  /**
+   * Handles the character after the "public" keyword. Every input except
+   * EOF leads to the before-public-identifier state; non-whitespace is
+   * pushed back first, and a quote is additionally a parse error.
+   * Always returns true.
+   */
+  bool afterDoctypePublicKeywordState() {
+    var ch = stream.char();
+    if (!isWhitespace(ch)) {
+      if (ch == EOF) {
+        _addToken(new ParseErrorToken("eof-in-doctype"));
+        currentDoctypeToken.correct = false;
+        _addToken(currentToken);
+        state = dataState;
+        return true;
+      }
+      if (ch == "'" || ch == '"') {
+        _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+      }
+      stream.unget(ch);
+    }
+    state = beforeDoctypePublicIdentifierState;
+    return true;
+  }
+
+  /**
+   * Skips whitespace before the public identifier and opens it on a quote.
+   * ">" and EOF emit an incorrect doctype; any other character is a parse
+   * error that switches to the bogus doctype state. Always returns true.
+   */
+  bool beforeDoctypePublicIdentifierState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) return true;
+
+    if (ch == "\"") {
+      currentDoctypeToken.publicId = "";
+      state = doctypePublicIdentifierDoubleQuotedState;
+      return true;
+    }
+    if (ch == "'") {
+      currentDoctypeToken.publicId = "";
+      state = doctypePublicIdentifierSingleQuotedState;
+      return true;
+    }
+    if (ch == ">") {
+      _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+    currentDoctypeToken.correct = false;
+    state = bogusDoctypeState;
+    return true;
+  }
+
+  /**
+   * Accumulates a double-quoted public identifier until the closing quote.
+   * NUL is reported and stored as U+FFFD; ">" and EOF emit an incorrect
+   * doctype. Always returns true.
+   */
+  bool doctypePublicIdentifierDoubleQuotedState() {
+    var ch = stream.char();
+    if (ch == '"') {
+      state = afterDoctypePublicIdentifierState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
+      return true;
+    }
+    if (ch == ">") {
+      _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$ch';
+    return true;
+  }
+
+  /**
+   * Accumulates a single-quoted public identifier until the closing quote.
+   * NUL is reported and stored as U+FFFD; ">" and EOF emit an incorrect
+   * doctype. Always returns true.
+   */
+  bool doctypePublicIdentifierSingleQuotedState() {
+    var ch = stream.char();
+    if (ch == "'") {
+      state = afterDoctypePublicIdentifierState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
+      return true;
+    }
+    if (ch == ">") {
+      _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$ch';
+    return true;
+  }
+
+  /**
+   * Handles the character right after the public identifier. A quote here
+   * opens the system identifier but is a parse error, because no whitespace
+   * separated the two identifiers. Always returns true.
+   */
+  bool afterDoctypePublicIdentifierState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) {
+      state = betweenDoctypePublicAndSystemIdentifiersState;
+      return true;
+    }
+    if (ch == ">") {
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == '"') {
+      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+      currentDoctypeToken.systemId = "";
+      state = doctypeSystemIdentifierDoubleQuotedState;
+      return true;
+    }
+    if (ch == "'") {
+      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+      currentDoctypeToken.systemId = "";
+      state = doctypeSystemIdentifierSingleQuotedState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+    currentDoctypeToken.correct = false;
+    state = bogusDoctypeState;
+    return true;
+  }
+
+  /**
+   * Skips whitespace between the public and system identifiers, emits the
+   * doctype on ">" and opens the system identifier on a quote.
+   * Always returns true.
+   */
+  bool betweenDoctypePublicAndSystemIdentifiersState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) return true;
+
+    if (ch == ">") {
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == '"') {
+      currentDoctypeToken.systemId = "";
+      state = doctypeSystemIdentifierDoubleQuotedState;
+      return true;
+    }
+    if (ch == "'") {
+      currentDoctypeToken.systemId = "";
+      state = doctypeSystemIdentifierSingleQuotedState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+    currentDoctypeToken.correct = false;
+    state = bogusDoctypeState;
+    return true;
+  }
+
+  /**
+   * Handles the character after the "system" keyword. Every input except
+   * EOF leads to the before-system-identifier state; non-whitespace is
+   * pushed back first, and a quote is additionally a parse error.
+   * Always returns true.
+   */
+  bool afterDoctypeSystemKeywordState() {
+    var ch = stream.char();
+    if (!isWhitespace(ch)) {
+      if (ch == EOF) {
+        _addToken(new ParseErrorToken("eof-in-doctype"));
+        currentDoctypeToken.correct = false;
+        _addToken(currentToken);
+        state = dataState;
+        return true;
+      }
+      if (ch == "'" || ch == '"') {
+        _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+      }
+      stream.unget(ch);
+    }
+    state = beforeDoctypeSystemIdentifierState;
+    return true;
+  }
+
+  /**
+   * Skips whitespace before the system identifier and opens it on a quote.
+   * ">" and EOF emit an incorrect doctype; any other character is a parse
+   * error that switches to the bogus doctype state. Always returns true.
+   */
+  bool beforeDoctypeSystemIdentifierState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) return true;
+
+    if (ch == "\"") {
+      currentDoctypeToken.systemId = "";
+      state = doctypeSystemIdentifierDoubleQuotedState;
+      return true;
+    }
+    if (ch == "'") {
+      currentDoctypeToken.systemId = "";
+      state = doctypeSystemIdentifierSingleQuotedState;
+      return true;
+    }
+    if (ch == ">") {
+      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+    currentDoctypeToken.correct = false;
+    state = bogusDoctypeState;
+    return true;
+  }
+
+  /**
+   * Accumulates a double-quoted system identifier until the closing quote.
+   * NUL is reported and stored as U+FFFD; ">" and EOF emit an incorrect
+   * doctype. Always returns true.
+   */
+  bool doctypeSystemIdentifierDoubleQuotedState() {
+    var ch = stream.char();
+    if (ch == "\"") {
+      state = afterDoctypeSystemIdentifierState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
+      return true;
+    }
+    if (ch == ">") {
+      _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$ch';
+    return true;
+  }
+
+  /**
+   * Accumulates a single-quoted system identifier until the closing quote.
+   * NUL is reported and stored as U+FFFD; ">" and EOF emit an incorrect
+   * doctype. Always returns true.
+   */
+  bool doctypeSystemIdentifierSingleQuotedState() {
+    var ch = stream.char();
+    if (ch == "'") {
+      state = afterDoctypeSystemIdentifierState;
+      return true;
+    }
+    if (ch == "\u0000") {
+      _addToken(new ParseErrorToken("invalid-codepoint"));
+      currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
+      return true;
+    }
+    if (ch == ">") {
+      _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$ch';
+    return true;
+  }
+
+  /**
+   * Handles what follows the system identifier: skips whitespace and emits
+   * the doctype on ">". An unexpected character is a parse error that
+   * switches to the bogus doctype state without marking the doctype as
+   * incorrect. Always returns true.
+   */
+  bool afterDoctypeSystemIdentifierState() {
+    var ch = stream.char();
+    if (isWhitespace(ch)) return true;
+
+    if (ch == ">") {
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    if (ch == EOF) {
+      _addToken(new ParseErrorToken("eof-in-doctype"));
+      currentDoctypeToken.correct = false;
+      _addToken(currentToken);
+      state = dataState;
+      return true;
+    }
+    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
+    state = bogusDoctypeState;
+    return true;
+  }
+
+  /**
+   * Discards characters of a malformed doctype until ">" or EOF, then emits
+   * the doctype token and returns to the data state. EOF is pushed back onto
+   * the stream first. Always returns true.
+   */
+  bool bogusDoctypeState() {
+    var ch = stream.char();
+    var atEnd = ch == EOF;
+    if (ch == ">" || atEnd) {
+      // XXX EMIT
+      if (atEnd) stream.unget(ch);
+      _addToken(currentToken);
+      state = dataState;
+    }
+    return true;
+  }
+
+  /**
+   * Reads a CDATA section up to the terminating "]]>" (or EOF) and emits its
+   * contents as a single [CharactersToken], unless they are empty.
+   *
+   * NUL characters are reported as "invalid-codepoint" and replaced by
+   * U+FFFD. The terminator itself is not part of the emitted text.
+   * Always returns true.
+   */
+  bool cdataSectionState() {
+    var data = [];
+    // Number of consecutive "]" seen immediately before the current
+    // position, capped at 2.
+    int matchedEnd = 0;
+    while (true) {
+      var ch = stream.char();
+      if (ch == EOF) {
+        break;
+      }
+      // Deal with null here rather than in the parser
+      if (ch == "\u0000") {
+        _addToken(new ParseErrorToken("invalid-codepoint"));
+        ch = "\uFFFD";
+      }
+      data.add(ch);
+      // TODO(jmesserly): it'd be nice if we had an easier way to match the end,
+      // perhaps with a "peek" API.
+      if (ch == "]") {
+        // A run of three or more "]" still ends in "]]", so the count stays
+        // at 2 instead of resetting. Otherwise "]]]>" would never close the
+        // section.
+        if (matchedEnd < 2) matchedEnd++;
+      } else if (ch == ">" && matchedEnd == 2) {
+        // Remove "]]>" from the end.
+        data.removeLast();
+        data.removeLast();
+        data.removeLast();
+        break;
+      } else {
+        matchedEnd = 0;
+      }
+    }
+
+    if (data.length > 0) {
+      _addToken(new CharactersToken(data.join()));
+    }
+    state = dataState;
+    return true;
+  }
+}
+

Powered by Google App Engine
This is Rietveld 408576698