pkg/third_party/html5lib/lib/src/tokenizer.dart - Issue 22375011: move html5lib code into dart svn repo

Unified Diff: pkg/third_party/html5lib/lib/src/tokenizer.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: change location of html5lib to pkg/third_party/html5lib Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« pkg/third_party/html5lib/html5lib.status ('K') | « pkg/third_party/html5lib/lib/src/token.dart ('k') | pkg/third_party/html5lib/lib/src/treebuilder.dart » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: pkg/third_party/html5lib/lib/src/tokenizer.dart

diff --git a/pkg/third_party/html5lib/lib/src/tokenizer.dart b/pkg/third_party/html5lib/lib/src/tokenizer.dart

new file mode 100644

index 0000000000000000000000000000000000000000..ee867b7cc7cc549e94439ec7f68133b706e44154

--- /dev/null

+++ b/pkg/third_party/html5lib/lib/src/tokenizer.dart

@@ -0,0 +1,1900 @@

+library tokenizer;

+import 'dart:collection';

+import 'dart:math';

+import 'package:html5lib/parser.dart' show HtmlParser;

+import 'package:source_maps/span.dart' show Span, FileSpan;

+import 'constants.dart';

+import 'inputstream.dart';

+import 'token.dart';

+import 'utils.dart';

+// Group entities by their first character, for faster lookups

+// TODO(jmesserly): we could use a better data structure here like a trie, if

+// we had it implemented in Dart.

+Map<String, List<String>> entitiesByFirstChar = (() { // Immediately-invoked closure that builds the lookup table.

+ var result = {};

+ for (var k in entities.keys) {

+ result.putIfAbsent(k[0], () => []).add(k); // Bucket each entity name under its first character.

+ }

+ return result;

+})();

+// TODO(jmesserly): lots of ways to make this faster:

+// - use char codes everywhere instead of 1-char strings

+// - use switch instead of contains, indexOf

+// - use switch instead of the sequential if tests

+// - avoid string concat

+/**

+ * This class takes care of tokenizing HTML.

+ */

+class HtmlTokenizer implements Iterator<Token> {

+ // TODO(jmesserly): a lot of these could be made private

+ final HtmlInputStream stream;

+ final bool lowercaseElementName;

+ final bool lowercaseAttrName;

+ /** True to generate spans for [Token.span]. */

+ final bool generateSpans;

+ /** True to generate spans for attributes. */

+ final bool attributeSpans;

+ /**

+ * This reference to the parser is used for correct CDATA handling.

+ * The [HtmlParser] will set this at construction time.

+ */

+ HtmlParser parser;

+ final Queue<Token> tokenQueue;

+ /** Holds the token that is currently being processed. */

+ Token currentToken;

+ /**

+ * Holds a reference to the method to be invoked for the next parser state.

+ */

+ Predicate state;

+ String temporaryBuffer;

+ int _lastOffset;

+ // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add

+ // an item until it's ready. But the code doesn't have a clear notion of when

+ // it's "done" with the attribute.

+ List<TagAttribute> _attributes;

+ Set<String> _attributeNames;

+ HtmlTokenizer(doc, {String encoding, bool parseMeta: true, // doc, encoding, parseMeta, generateSpans and sourceUrl are forwarded to HtmlInputStream below.

+ this.lowercaseElementName: true, this.lowercaseAttrName: true,

+ bool generateSpans: false, String sourceUrl, this.attributeSpans: false})

+ : stream = new HtmlInputStream(

+ doc, encoding, parseMeta, generateSpans, sourceUrl),

+ tokenQueue = new Queue(),

+ generateSpans = generateSpans {

+ reset(); // Puts the tokenizer into its initial state (see reset()).

+ }

+ TagToken get currentTagToken => currentToken; // [currentToken] viewed as a TagToken.

+ DoctypeToken get currentDoctypeToken => currentToken; // [currentToken] viewed as a DoctypeToken.

+ StringToken get currentStringToken => currentToken; // [currentToken] viewed as a StringToken.

+ Token _current; // Backing field for [current]; assigned by moveNext().

+ Token get current => _current;

+ String get _attributeName => _attributes.last.name; // Name of the most recently added attribute.

+ set _attributeName(String value) {

+ _attributes.last.name = value;

+ }

+ String get _attributeValue => _attributes.last.value; // Value of the most recently added attribute.

+ set _attributeValue(String value) {

+ _attributes.last.value = value;

+ }

+ void _markAttributeEnd(int offset) { // Records where the last attribute ends; does nothing unless attributeSpans is set.

+ if (attributeSpans) _attributes.last.end = stream.position + offset; // offset is added to the current stream position.

+ }

+ void _markAttributeValueStart(int offset) { // Records where the last attribute's value starts; does nothing unless attributeSpans is set.

+ if (attributeSpans) _attributes.last.startValue = stream.position + offset;

+ }

+ void _markAttributeValueEnd(int offset) { // Records where the last attribute's value ends; does nothing unless attributeSpans is set.

+ if (attributeSpans) {

+ _attributes.last.endValue = stream.position + offset;

+ _markAttributeEnd(offset); // The attribute as a whole ends at the same position as its value.

+ }

+ // Note: we could track the name span here, if we need it.

+ void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset); // Currently just marks the attribute end; no separate name span is stored.

+ void _addAttribute(String name) { // Appends a new TagAttribute named [name] to the pending attribute list.

+ if (_attributes == null) _attributes = []; // The list is created lazily; reset() and emitCurrentToken() set it back to null.

+ var attr = new TagAttribute(name);

+ _attributes.add(attr);

+ if (attributeSpans) attr.start = stream.position - name.length; // Start is computed back from the current position by the length of [name].

+ }

+ /**

+ * This is where the magic happens.

+ *

+ * We do our usual processing through the states and when we have a token

+ * to return we yield the token which pauses processing until the next token

+ * is requested.

+ */

+ bool moveNext() {

+ // Start processing. When EOF is reached state will return false

+ // instead of true and the loop will terminate.

+ while (stream.errors.length == 0 && tokenQueue.length == 0) { // Run state functions until an error or a token is available.

+ if (!state()) {

+ _current = null; // No further tokens: clear [current] and report the end of iteration.

+ return false;

+ }

+ if (stream.errors.length > 0) { // Errors recorded on the stream are delivered before queued tokens.

+ _current = new ParseErrorToken(stream.errors.removeFirst());

+ } else {

+ assert (tokenQueue.length > 0);

+ _current = tokenQueue.removeFirst();

+ }

+ return true;

+ }

+ /**

+ * Resets the tokenizer state. Calling this does not reset the [stream] or

+ * the [parser].

+ */

+ void reset() {

+ _lastOffset = 0; // Spans for subsequently added tokens start from offset 0 again.

+ tokenQueue.clear();

+ currentToken = null;

+ temporaryBuffer = null;

+ _attributes = null;

+ _attributeNames = null;

+ state = dataState; // Tokenization always (re)starts in the data state.

+ }

+ /** Adds a token to the queue. Sets the span if needed. */

+ void _addToken(Token token) {

+ if (generateSpans && token.span == null) { // A span already present on the token is left untouched.

+ int offset = stream.position;

+ token.span = new FileSpan(stream.fileInfo, _lastOffset, offset); // Span runs from the end of the previous non-error token to the current position.

+ if (token is! ParseErrorToken) { // Parse errors do not advance _lastOffset.

+ _lastOffset = offset;

+ }

+ tokenQueue.add(token);

+ }

+ /**

+ * This function returns either U+FFFD or the character based on the

+ * decimal or hexadecimal representation. It also discards ";" if present.

+ * If not present it will add a [ParseErrorToken].

+ */

+ String consumeNumberEntity(bool isHex) {

+ var allowed = isDigit; // Digit predicate and radix default to decimal and are switched below for hex.

+ var radix = 10;

+ if (isHex) {

+ allowed = isHexDigit;

+ radix = 16;

+ }

+ var charStack = [];

+ // Consume all the characters that are in range while making sure we

+ // don't hit an EOF.

+ var c = stream.char();

+ while (allowed(c) && c != EOF) {

+ charStack.add(c);

+ c = stream.char();

+ }

+ // Convert the set of characters consumed to an int.

+ var charAsInt = parseIntRadix(charStack.join(), radix);

+ // Certain characters get replaced with others

+ var char = replacementCharacters[charAsInt]; // Stays null when the table has no entry for this code point.

+ if (char != null) {

+ _addToken(new ParseErrorToken(

+ "illegal-codepoint-for-numeric-entity",

+ messageParams: {"charAsInt": charAsInt}));

+ } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) // 0xD800-0xDFFF or above 0x10FFFF: replaced by U+FFFD.

+ || (charAsInt > 0x10FFFF)) {

+ char = "\uFFFD";

+ _addToken(new ParseErrorToken(

+ "illegal-codepoint-for-numeric-entity",

+ messageParams: {"charAsInt": charAsInt}));

+ } else {

+ // Should speed up this check somehow (e.g. move the set to a constant)

+ if ((0x0001 <= charAsInt && charAsInt <= 0x0008) || // These code points are reported as errors but still emitted as-is below.

+ (0x000E <= charAsInt && charAsInt <= 0x001F) ||

+ (0x007F <= charAsInt && charAsInt <= 0x009F) ||

+ (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) ||

+ const [0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,

+ 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,

+ 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,

+ 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,

+ 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,

+ 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,

+ 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,

+ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,

+ 0xFFFFF, 0x10FFFE, 0x10FFFF].contains(charAsInt)) {

+ _addToken(new ParseErrorToken(

+ "illegal-codepoint-for-numeric-entity",

+ messageParams: {"charAsInt": charAsInt}));

+ }

+ char = new String.fromCharCodes([charAsInt]);

+ }

+ // Discard the ; if present. Otherwise, put it back on the queue and

+ // invoke parseError on parser.

+ if (c != ";") {

+ _addToken(new ParseErrorToken(

+ "numeric-entity-without-semicolon"));

+ stream.unget(c); // The terminating character was not ";", so it is returned to the stream.

+ }

+ return char;

+ }

+ void consumeEntity({String allowedChar, bool fromAttribute: false}) { // Consumes what follows "&"; the result is appended to the current attribute value (fromAttribute) or queued as a token.

+ // Initialise to the default output for when no entity is matched

+ var output = "&";

+ var charStack = [stream.char()];

+ if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&'

+ || charStack[0] == EOF || allowedChar == charStack[0]) {

+ stream.unget(charStack[0]); // Not a character reference: put the character back and output a bare "&".

+ } else if (charStack[0] == "#") {

+ // Read the next character to see if it's hex or decimal

+ bool hex = false;

+ charStack.add(stream.char());

+ if (charStack.last == 'x' || charStack.last == 'X') {

+ hex = true;

+ charStack.add(stream.char());

+ }

+ // charStack.last should be the first digit

+ if (hex && isHexDigit(charStack.last) ||

+ (!hex && isDigit(charStack.last))) {

+ // At least one digit found, so consume the whole number

+ stream.unget(charStack.last);

+ output = consumeNumberEntity(hex);

+ } else {

+ // No digits found

+ _addToken(new ParseErrorToken("expected-numeric-entity"));

+ stream.unget(charStack.removeLast());

+ output = "&${charStack.join()}"; // Emit the consumed prefix ("#" or "#x"/"#X") literally after the "&".

+ }

+ } else {

+ // At this point in the process might have named entity. Entities

+ // are stored in the global variable "entities".

+ //

+ // Consume characters and compare these to a substring of the

+ // entity names in the list until the substring no longer matches.

+ var filteredEntityList = entitiesByFirstChar[charStack[0]];

+ if (filteredEntityList == null) filteredEntityList = const [];

+ while (charStack.last != EOF) {

+ var name = charStack.join();

+ filteredEntityList = filteredEntityList.where(

+ (e) => e.startsWith(name)).toList(); // Narrow the candidates to names that start with everything read so far.

+ if (filteredEntityList.length == 0) {

+ break;

+ }

+ charStack.add(stream.char());

+ }

+ // At this point we have a string that starts with some characters

+ // that may match an entity

+ String entityName = null;

+ // Try to find the longest entity the string will match to take care

+ // of &noti for instance.

+ int entityLen;

+ for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) { // Tries prefixes from longest to shortest, stopping at length 2.

+ var possibleEntityName = charStack.sublist(0, entityLen).join();

+ if (entities.containsKey(possibleEntityName)) {

+ entityName = possibleEntityName;

+ break;

+ }

+ if (entityName != null) {

+ var lastChar = entityName[entityName.length - 1];

+ if (lastChar != ";") {

+ _addToken(new ParseErrorToken(

+ "named-entity-without-semicolon"));

+ }

+ if (lastChar != ";" && fromAttribute && // In an attribute, an unterminated entity followed by a letter, digit or "=" is kept as literal text.

+ (isLetterOrDigit(charStack[entityLen]) ||

+ charStack[entityLen] == '=')) {

+ stream.unget(charStack.removeLast());

+ output = "&${charStack.join()}";

+ } else {

+ output = entities[entityName];

+ stream.unget(charStack.removeLast());

+ output = '${output}${slice(charStack, entityLen).join()}'; // Characters read beyond the matched name are appended after the replacement text.

+ }

+ } else {

+ _addToken(new ParseErrorToken("expected-named-entity"));

+ stream.unget(charStack.removeLast());

+ output = "&${charStack.join()}";

+ }

+ if (fromAttribute) {

+ _attributeValue = '$_attributeValue$output';

+ } else {

+ var token;

+ if (isWhitespace(output)) {

+ token = new SpaceCharactersToken(output);

+ } else {

+ token = new CharactersToken(output);

+ }

+ _addToken(token);

+ }

+ /** This method replaces the need for "entityInAttributeValueState". */

+ void processEntityInAttribute(String allowedChar) {

+ consumeEntity(allowedChar: allowedChar, fromAttribute: true); // The result is appended to the current attribute value rather than queued as a token.

+ }

+ /**

+ * This method is a generic handler for emitting the tags. It also sets

+ * the state to "data" because that's what's needed after a token has been

+ * emitted.

+ */

+ void emitCurrentToken() {

+ var token = currentToken;

+ // Add token to the queue to be yielded

+ if (token is TagToken) {

+ if (lowercaseElementName) {

+ token.name = asciiUpper2Lower(token.name);

+ }

+ if (token is EndTagToken) { // End tags: attributes and the self-closing flag are reported as parse errors.

+ if (_attributes != null) {

+ _addToken(new ParseErrorToken("attributes-in-end-tag"));

+ }

+ if (token.selfClosing) {

+ _addToken(new ParseErrorToken("this-closing-flag-on-end-tag"));

+ }

+ } else if (token is StartTagToken) {

+ // HTML5 specific normalizations to the token stream.

+ // Convert the list into a map where first key wins.

+ token.data = new LinkedHashMap<Object, String>();

+ if (_attributes != null) {

+ for (var attr in _attributes) {

+ token.data.putIfAbsent(attr.name, () => attr.value); // putIfAbsent keeps the first value seen for a repeated name.

+ }

+ if (attributeSpans) token.attributeSpans = _attributes;

+ }

+ _attributes = null; // Pending attribute state is cleared for the next tag.

+ _attributeNames = null;

+ }

+ _addToken(token);

+ state = dataState;

+ }

+ // Below are the various tokenizer states worked out.

+ bool dataState() { // Data state: dispatches on the next character; returns false only at EOF.

+ var data = stream.char();

+ if (data == "&") {

+ state = entityDataState;

+ } else if (data == "<") {

+ state = tagOpenState;

+ } else if (data == "\u0000") { // NUL is reported as a parse error but emitted unchanged in this state.

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\u0000"));

+ } else if (data == EOF) {

+ // Tokenization ends.

+ return false;

+ } else if (isWhitespace(data)) {

+ // Directly after emitting a token you switch back to the "data

+ // state". At that point spaceCharacters are important so they are

+ // emitted separately.

+ _addToken(new SpaceCharactersToken(

+ '${data}${stream.charsUntil(spaceCharacters, true)}'));

+ // No need to update lastFourChars here, since the first space will

+ // have already been appended to lastFourChars and will have broken

+ // any <!-- or --> sequences

+ } else {

+ var chars = stream.charsUntil("&<\u0000");

+ _addToken(new CharactersToken('${data}${chars}'));

+ }

+ return true;

+ }

+ bool entityDataState() { // Consumes one character reference, then returns to the data state.

+ consumeEntity();

+ state = dataState;

+ return true;

+ }

+ bool rcdataState() { // RCDATA state: "&" and "<" lead to RCDATA-specific states; returns false only at EOF.

+ var data = stream.char();

+ if (data == "&") {

+ state = characterReferenceInRcdata;

+ } else if (data == "<") {

+ state = rcdataLessThanSignState;

+ } else if (data == EOF) {

+ // Tokenization ends.

+ return false;

+ } else if (data == "\u0000") { // Unlike the data state, NUL is replaced with U+FFFD here.

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ } else if (isWhitespace(data)) {

+ // Directly after emitting a token you switch back to the "data

+ // state". At that point spaceCharacters are important so they are

+ // emitted separately.

+ _addToken(new SpaceCharactersToken(

+ '${data}${stream.charsUntil(spaceCharacters, true)}'));

+ } else {

+ var chars = stream.charsUntil("&<");

+ _addToken(new CharactersToken('${data}${chars}'));

+ }

+ return true;

+ }

+ bool characterReferenceInRcdata() { // Consumes one character reference, then returns to the RCDATA state.

+ consumeEntity();

+ state = rcdataState;

+ return true;

+ }

+ bool rawtextState() { // RAWTEXT state: only "<", NUL and EOF are special; everything else is emitted as characters.

+ var data = stream.char();

+ if (data == "<") {

+ state = rawtextLessThanSignState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ } else if (data == EOF) {

+ // Tokenization ends.

+ return false;

+ } else {

+ var chars = stream.charsUntil("<\u0000"); // Batch the run of ordinary characters into a single token.

+ _addToken(new CharactersToken("${data}${chars}"));

+ }

+ return true;

+ }

+ bool scriptDataState() { // Script data state: only "<", NUL and EOF are special; everything else is emitted as characters.

+ var data = stream.char();

+ if (data == "<") {

+ state = scriptDataLessThanSignState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ } else if (data == EOF) {

+ // Tokenization ends.

+ return false;

+ } else {

+ var chars = stream.charsUntil("<\u0000"); // Batch the run of ordinary characters into a single token.

+ _addToken(new CharactersToken("${data}${chars}"));

+ }

+ return true;

+ }

+ bool plaintextState() { // PLAINTEXT state: only NUL and EOF are special; no transition out of this state is made here.

+ var data = stream.char();

+ if (data == EOF) {

+ // Tokenization ends.

+ return false;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ } else {

+ _addToken(new CharactersToken(

+ '${data}${stream.charsUntil("\u0000")}'));

+ }

+ return true;

+ }

+ bool tagOpenState() { // Entered from the data state after "<"; decides what kind of markup follows.

+ var data = stream.char();

+ if (data == "!") {

+ state = markupDeclarationOpenState;

+ } else if (data == "/") {

+ state = closeTagOpenState;

+ } else if (isLetter(data)) {

+ currentToken = new StartTagToken(data); // The first letter seeds the tag name; tagNameState appends the rest.

+ state = tagNameState;

+ } else if (data == ">") {

+ // XXX In theory it could be something besides a tag name. But

+ // do we really care?

+ _addToken(new ParseErrorToken(

+ "expected-tag-name-but-got-right-bracket"));

+ _addToken(new CharactersToken("<>"));

+ state = dataState;

+ } else if (data == "?") {

+ // XXX In theory it could be something besides a tag name. But

+ // do we really care?

+ _addToken(new ParseErrorToken(

+ "expected-tag-name-but-got-question-mark"));

+ stream.unget(data);

+ state = bogusCommentState;

+ } else {

+ // XXX

+ _addToken(new ParseErrorToken("expected-tag-name"));

+ _addToken(new CharactersToken("<")); // The "<" is emitted as text and the character is reprocessed in the data state.

+ stream.unget(data);

+ state = dataState;

+ }

+ return true;

+ }

+ bool closeTagOpenState() { // Entered from tagOpenState after "/"; starts an end tag when a letter follows.

+ var data = stream.char();

+ if (isLetter(data)) {

+ currentToken = new EndTagToken(data);

+ state = tagNameState;

+ } else if (data == ">") { // "</>" produces only a parse error; no token is emitted for it.

+ _addToken(new ParseErrorToken(

+ "expected-closing-tag-but-got-right-bracket"));

+ state = dataState;

+ } else if (data == EOF) {

+ _addToken(new ParseErrorToken(

+ "expected-closing-tag-but-got-eof"));

+ _addToken(new CharactersToken("</"));

+ state = dataState;

+ } else {

+ // XXX data can be _'_...

+ _addToken(new ParseErrorToken(

+ "expected-closing-tag-but-got-char", messageParams: {"data": data}));

+ stream.unget(data);

+ state = bogusCommentState;

+ }

+ return true;

+ }

+ bool tagNameState() { // Appends to the current tag's name one character at a time until whitespace, "/", ">" or EOF.

+ var data = stream.char();

+ if (isWhitespace(data)) {

+ state = beforeAttributeNameState;

+ } else if (data == ">") {

+ emitCurrentToken();

+ } else if (data == EOF) { // EOF inside a tag name: the unfinished tag is not emitted.

+ _addToken(new ParseErrorToken("eof-in-tag-name"));

+ state = dataState;

+ } else if (data == "/") {

+ state = selfClosingStartTagState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ currentTagToken.name = '${currentTagToken.name}\uFFFD';

+ } else {

+ currentTagToken.name = '${currentTagToken.name}$data';

+ // (Don't use charsUntil here, because tag names are

+ // very short and it's faster to not do anything fancy)

+ }

+ return true;

+ }

+ bool rcdataLessThanSignState() { // After "<" in RCDATA: only "/" can begin an end tag; anything else is text.

+ var data = stream.char();

+ if (data == "/") {

+ temporaryBuffer = ""; // Start collecting a candidate end tag name.

+ state = rcdataEndTagOpenState;

+ } else {

+ _addToken(new CharactersToken("<"));

+ stream.unget(data);

+ state = rcdataState;

+ }

+ return true;

+ }

+ bool rcdataEndTagOpenState() { // After "</" in RCDATA: a letter starts buffering a candidate end tag name.

+ var data = stream.char();

+ if (isLetter(data)) {

+ temporaryBuffer = '${temporaryBuffer}$data';

+ state = rcdataEndTagNameState;

+ } else {

+ _addToken(new CharactersToken("</")); // Not an end tag: emit "</" as text and reprocess the character in RCDATA.

+ stream.unget(data);

+ state = rcdataState;

+ }

+ return true;

+ }

+ bool _tokenIsAppropriate() { // True when temporaryBuffer equals the current tag token's name, ignoring case.

+ return currentToken is TagToken &&

+ currentTagToken.name.toLowerCase() == temporaryBuffer.toLowerCase();

+ }

+ bool rcdataEndTagNameState() { // Buffers the candidate end tag name; it becomes an EndTagToken only if it matches the current tag token.

+ var appropriate = _tokenIsAppropriate(); // Evaluated before reading the next character, i.e. against the name buffered so far.

+ var data = stream.char();

+ if (isWhitespace(data) && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ state = beforeAttributeNameState;

+ } else if (data == "/" && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ state = selfClosingStartTagState;

+ } else if (data == ">" && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ emitCurrentToken();

+ state = dataState; // Redundant: emitCurrentToken() already sets state to dataState.

+ } else if (isLetter(data)) {

+ temporaryBuffer = '${temporaryBuffer}$data';

+ } else {

+ _addToken(new CharactersToken("</$temporaryBuffer")); // Not a matching end tag: emit everything consumed so far as text.

+ stream.unget(data);

+ state = rcdataState;

+ }

+ return true;

+ }

+ bool rawtextLessThanSignState() { // After "<" in RAWTEXT: only "/" can begin an end tag; anything else is text.

+ var data = stream.char();

+ if (data == "/") {

+ temporaryBuffer = ""; // Start collecting a candidate end tag name.

+ state = rawtextEndTagOpenState;

+ } else {

+ _addToken(new CharactersToken("<"));

+ stream.unget(data);

+ state = rawtextState;

+ }

+ return true;

+ }

+ bool rawtextEndTagOpenState() { // After "</" in RAWTEXT: a letter starts buffering a candidate end tag name.

+ var data = stream.char();

+ if (isLetter(data)) {

+ temporaryBuffer = '${temporaryBuffer}$data';

+ state = rawtextEndTagNameState;

+ } else {

+ _addToken(new CharactersToken("</")); // Not an end tag: emit "</" as text and reprocess the character in RAWTEXT.

+ stream.unget(data);

+ state = rawtextState;

+ }

+ return true;

+ }

+ bool rawtextEndTagNameState() { // Buffers the candidate end tag name; it becomes an EndTagToken only if it matches the current tag token.

+ var appropriate = _tokenIsAppropriate(); // Evaluated before reading the next character, i.e. against the name buffered so far.

+ var data = stream.char();

+ if (isWhitespace(data) && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ state = beforeAttributeNameState;

+ } else if (data == "/" && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ state = selfClosingStartTagState;

+ } else if (data == ">" && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ emitCurrentToken();

+ state = dataState; // Redundant: emitCurrentToken() already sets state to dataState.

+ } else if (isLetter(data)) {

+ temporaryBuffer = '${temporaryBuffer}$data';

+ } else {

+ _addToken(new CharactersToken("</$temporaryBuffer")); // Not a matching end tag: emit everything consumed so far as text.

+ stream.unget(data);

+ state = rawtextState;

+ }

+ return true;

+ }

+ bool scriptDataLessThanSignState() { // After "<" in script data: "/" may begin an end tag, "!" may begin an escape sequence.

+ var data = stream.char();

+ if (data == "/") {

+ temporaryBuffer = "";

+ state = scriptDataEndTagOpenState;

+ } else if (data == "!") {

+ _addToken(new CharactersToken("<!")); // "<!" is still emitted as script text.

+ state = scriptDataEscapeStartState;

+ } else {

+ _addToken(new CharactersToken("<"));

+ stream.unget(data);

+ state = scriptDataState;

+ }

+ return true;

+ }

+ bool scriptDataEndTagOpenState() { // After "</" in script data: a letter starts buffering a candidate end tag name.

+ var data = stream.char();

+ if (isLetter(data)) {

+ temporaryBuffer = '${temporaryBuffer}$data';

+ state = scriptDataEndTagNameState;

+ } else {

+ _addToken(new CharactersToken("</")); // Not an end tag: emit "</" as text and reprocess the character as script data.

+ stream.unget(data);

+ state = scriptDataState;

+ }

+ return true;

+ }

+ bool scriptDataEndTagNameState() { // Buffers the candidate end tag name; it becomes an EndTagToken only if it matches the current tag token.

+ var appropriate = _tokenIsAppropriate(); // Evaluated before reading the next character, i.e. against the name buffered so far.

+ var data = stream.char();

+ if (isWhitespace(data) && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ state = beforeAttributeNameState;

+ } else if (data == "/" && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ state = selfClosingStartTagState;

+ } else if (data == ">" && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ emitCurrentToken();

+ state = dataState; // Redundant: emitCurrentToken() already sets state to dataState.

+ } else if (isLetter(data)) {

+ temporaryBuffer = '${temporaryBuffer}$data';

+ } else {

+ _addToken(new CharactersToken("</$temporaryBuffer")); // Not a matching end tag: emit everything consumed so far as text.

+ stream.unget(data);

+ state = scriptDataState;

+ }

+ return true;

+ }

+ bool scriptDataEscapeStartState() { // Entered after "<!" in script data: a "-" continues towards the escaped states.

+ var data = stream.char();

+ if (data == "-") {

+ _addToken(new CharactersToken("-"));

+ state = scriptDataEscapeStartDashState;

+ } else {

+ stream.unget(data); // No escape sequence: reprocess the character as ordinary script data.

+ state = scriptDataState;

+ }

+ return true;

+ }

+ bool scriptDataEscapeStartDashState() { // Entered after "<!-": a second "-" enters the escaped dash-dash state.

+ var data = stream.char();

+ if (data == "-") {

+ _addToken(new CharactersToken("-"));

+ state = scriptDataEscapedDashDashState;

+ } else {

+ stream.unget(data); // No escape sequence: reprocess the character as ordinary script data.

+ state = scriptDataState;

+ }

+ return true;

+ }

+ bool scriptDataEscapedState() { // Script data escaped state: "-", "<", NUL and EOF are special; other characters are emitted in runs.

+ var data = stream.char();

+ if (data == "-") {

+ _addToken(new CharactersToken("-"));

+ state = scriptDataEscapedDashState;

+ } else if (data == "<") {

+ state = scriptDataEscapedLessThanSignState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ } else if (data == EOF) {

+ state = dataState; // Presumably the data state then reads EOF again and ends tokenization - TODO confirm stream.char() keeps returning EOF.

+ } else {

+ var chars = stream.charsUntil("<-\u0000");

+ _addToken(new CharactersToken("${data}${chars}"));

+ }

+ return true;

+ }

+ bool scriptDataEscapedDashState() { // One "-" seen in escaped script data; a second "-" moves to the dash-dash state.

+ var data = stream.char();

+ if (data == "-") {

+ _addToken(new CharactersToken("-"));

+ state = scriptDataEscapedDashDashState;

+ } else if (data == "<") {

+ state = scriptDataEscapedLessThanSignState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ state = scriptDataEscapedState;

+ } else if (data == EOF) {

+ state = dataState;

+ } else {

+ _addToken(new CharactersToken(data)); // Any other character ends the dash run and returns to the escaped state.

+ state = scriptDataEscapedState;

+ }

+ return true;

+ }

+ bool scriptDataEscapedDashDashState() { // "--" seen in escaped script data; ">" here ends the escape and returns to script data.

+ var data = stream.char();

+ if (data == "-") {

+ _addToken(new CharactersToken("-")); // Further dashes are emitted and the state is unchanged.

+ } else if (data == "<") {

+ state = scriptDataEscapedLessThanSignState;

+ } else if (data == ">") {

+ _addToken(new CharactersToken(">"));

+ state = scriptDataState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ state = scriptDataEscapedState;

+ } else if (data == EOF) {

+ state = dataState;

+ } else {

+ _addToken(new CharactersToken(data));

+ state = scriptDataEscapedState;

+ }

+ return true;

+ }

+ bool scriptDataEscapedLessThanSignState() { // After "<" in escaped script data: "/" may begin an end tag, a letter may begin a double escape.

+ var data = stream.char();

+ if (data == "/") {

+ temporaryBuffer = "";

+ state = scriptDataEscapedEndTagOpenState;

+ } else if (isLetter(data)) {

+ _addToken(new CharactersToken("<$data"));

+ temporaryBuffer = data; // The buffered name is later compared with "script" in scriptDataDoubleEscapeStartState.

+ state = scriptDataDoubleEscapeStartState;

+ } else {

+ _addToken(new CharactersToken("<"));

+ stream.unget(data);

+ state = scriptDataEscapedState;

+ }

+ return true;

+ }

+ bool scriptDataEscapedEndTagOpenState() { // After "</" in escaped script data: a letter starts a candidate end tag name.

+ var data = stream.char();

+ if (isLetter(data)) {

+ temporaryBuffer = data; // The buffer was just set to "" by the previous state, so assigning equals appending here.

+ state = scriptDataEscapedEndTagNameState;

+ } else {

+ _addToken(new CharactersToken("</"));

+ stream.unget(data);

+ state = scriptDataEscapedState;

+ }

+ return true;

+ }

+ bool scriptDataEscapedEndTagNameState() { // Buffers the candidate end tag name; it becomes an EndTagToken only if it matches the current tag token.

+ var appropriate = _tokenIsAppropriate(); // Evaluated before reading the next character, i.e. against the name buffered so far.

+ var data = stream.char();

+ if (isWhitespace(data) && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ state = beforeAttributeNameState;

+ } else if (data == "/" && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ state = selfClosingStartTagState;

+ } else if (data == ">" && appropriate) {

+ currentToken = new EndTagToken(temporaryBuffer);

+ emitCurrentToken();

+ state = dataState; // Redundant: emitCurrentToken() already sets state to dataState.

+ } else if (isLetter(data)) {

+ temporaryBuffer = '${temporaryBuffer}$data';

+ } else {

+ _addToken(new CharactersToken("</$temporaryBuffer")); // Not a matching end tag: emit everything consumed so far as text.

+ stream.unget(data);

+ state = scriptDataEscapedState;

+ }

+ return true;

+ }

+ bool scriptDataDoubleEscapeStartState() { // Buffers letters after "<"; enters the double-escaped state if they spell "script" (case-insensitive).

+ var data = stream.char();

+ if (isWhitespace(data) || data == "/" || data == ">") { // One of these characters terminates the buffered name.

+ _addToken(new CharactersToken(data));

+ if (temporaryBuffer.toLowerCase() == "script") {

+ state = scriptDataDoubleEscapedState;

+ } else {

+ state = scriptDataEscapedState;

+ }

+ } else if (isLetter(data)) {

+ _addToken(new CharactersToken(data)); // Letters are both emitted as text and appended to the buffer.

+ temporaryBuffer = '${temporaryBuffer}$data';

+ } else {

+ stream.unget(data);

+ state = scriptDataEscapedState;

+ }

+ return true;

+ }

+ bool scriptDataDoubleEscapedState() { // Double-escaped script data: every character, including "<" and "-", is emitted as text.

+ var data = stream.char();

+ if (data == "-") {

+ _addToken(new CharactersToken("-"));

+ state = scriptDataDoubleEscapedDashState;

+ } else if (data == "<") {

+ _addToken(new CharactersToken("<"));

+ state = scriptDataDoubleEscapedLessThanSignState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ } else if (data == EOF) { // Unlike the single-escaped states, EOF here is reported as a parse error.

+ _addToken(new ParseErrorToken("eof-in-script-in-script"));

+ state = dataState;

+ } else {

+ _addToken(new CharactersToken(data));

+ }

+ return true;

+ }

+ bool scriptDataDoubleEscapedDashState() { // One "-" seen in double-escaped script data; a second "-" moves to the dash-dash state.

+ var data = stream.char();

+ if (data == "-") {

+ _addToken(new CharactersToken("-"));

+ state = scriptDataDoubleEscapedDashDashState;

+ } else if (data == "<") {

+ _addToken(new CharactersToken("<"));

+ state = scriptDataDoubleEscapedLessThanSignState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ state = scriptDataDoubleEscapedState;

+ } else if (data == EOF) {

+ _addToken(new ParseErrorToken("eof-in-script-in-script"));

+ state = dataState;

+ } else {

+ _addToken(new CharactersToken(data)); // Any other character ends the dash run and returns to the double-escaped state.

+ state = scriptDataDoubleEscapedState;

+ }

+ return true;

+ }

+ // TODO(jmesserly): report bug in original code

+ // (was "Dash" instead of "DashDash")

+ bool scriptDataDoubleEscapedDashDashState() { // "--" seen in double-escaped script data; ">" here returns to plain script data.

+ var data = stream.char();

+ if (data == "-") {

+ _addToken(new CharactersToken("-")); // Further dashes are emitted and the state is unchanged.

+ } else if (data == "<") {

+ _addToken(new CharactersToken("<"));

+ state = scriptDataDoubleEscapedLessThanSignState;

+ } else if (data == ">") {

+ _addToken(new CharactersToken(">"));

+ state = scriptDataState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addToken(new CharactersToken("\uFFFD"));

+ state = scriptDataDoubleEscapedState;

+ } else if (data == EOF) {

+ _addToken(new ParseErrorToken("eof-in-script-in-script"));

+ state = dataState;

+ } else {

+ _addToken(new CharactersToken(data));

+ state = scriptDataDoubleEscapedState;

+ }

+ return true;

+ }

+ bool scriptDataDoubleEscapedLessThanSignState() { // After "<" in double-escaped script data: "/" may begin the end of the double escape.

+ var data = stream.char();

+ if (data == "/") {

+ _addToken(new CharactersToken("/"));

+ temporaryBuffer = ""; // Start collecting the name that follows "</".

+ state = scriptDataDoubleEscapeEndState;

+ } else {

+ stream.unget(data);

+ state = scriptDataDoubleEscapedState;

+ }

+ return true;

+ }

+ bool scriptDataDoubleEscapeEndState() { // Buffers letters after "</"; leaves the double-escaped state if they spell "script" (case-insensitive).

+ var data = stream.char();

+ if (isWhitespace(data) || data == "/" || data == ">") { // One of these characters terminates the buffered name.

+ _addToken(new CharactersToken(data));

+ if (temporaryBuffer.toLowerCase() == "script") {

+ state = scriptDataEscapedState;

+ } else {

+ state = scriptDataDoubleEscapedState;

+ }

+ } else if (isLetter(data)) {

+ _addToken(new CharactersToken(data)); // Letters are both emitted as text and appended to the buffer.

+ temporaryBuffer = '${temporaryBuffer}$data';

+ } else {

+ stream.unget(data);

+ state = scriptDataDoubleEscapedState;

+ }

+ return true;

+ }

+ bool beforeAttributeNameState() { // Skips whitespace inside a tag, then starts a new attribute or finishes the tag.

+ var data = stream.char();

+ if (isWhitespace(data)) {

+ stream.charsUntil(spaceCharacters, true); // The result is discarded: this only skips the whitespace run.

+ } else if (isLetter(data)) {

+ _addAttribute(data);

+ state = attributeNameState;

+ } else if (data == ">") {

+ emitCurrentToken();

+ } else if (data == "/") {

+ state = selfClosingStartTagState;

+ } else if (data == EOF) {

+ _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));

+ state = dataState;

+ } else if ("'\"=<".contains(data)) { // These characters are reported as errors but still start an attribute name.

+ _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));

+ _addAttribute(data);

+ state = attributeNameState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _addAttribute("\uFFFD");

+ state = attributeNameState;

+ } else {

+ _addAttribute(data);

+ state = attributeNameState;

+ }

+ return true;

+ }

+ bool attributeNameState() { // Extends the current attribute name; on leaving the state, normalizes the name and checks for duplicates.

+ var data = stream.char();

+ bool leavingThisState = true; // Cleared by every branch that only appends to the name.

+ bool emitToken = false; // Set when ">" is seen so the tag is emitted after the duplicate check below.

+ if (data == "=") {

+ state = beforeAttributeValueState;

+ } else if (isLetter(data)) {

+ _attributeName = '$_attributeName$data'

+ '${stream.charsUntil(asciiLetters, true)}';

+ leavingThisState = false;

+ } else if (data == ">") {

+ // XXX If we emit here the attributes are converted to a dict

+ // without being checked and when the code below runs we error

+ // because data is a dict not a list

+ emitToken = true;

+ } else if (isWhitespace(data)) {

+ state = afterAttributeNameState;

+ } else if (data == "/") {

+ state = selfClosingStartTagState;

+ } else if (data == "\u0000") {

+ _addToken(new ParseErrorToken("invalid-codepoint"));

+ _attributeName = '${_attributeName}\uFFFD';

+ leavingThisState = false;

+ } else if (data == EOF) {

+ _addToken(new ParseErrorToken("eof-in-attribute-name"));

+ state = dataState;

+ } else if ("'\"<".contains(data)) { // Reported as an error, but the character is still appended to the name.

+ _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));

+ _attributeName = '$_attributeName$data';

+ leavingThisState = false;

+ } else {

+ _attributeName = '$_attributeName$data';

+ leavingThisState = false;

+ }

+ if (leavingThisState) {

+ _markAttributeNameEnd(-1); // -1: the terminating character has already been consumed from the stream.

+ // Attributes are not dropped at this stage. That happens when the

+ // start tag token is emitted so values can still be safely appended

+ // to attributes, but we do want to report the parse error in time.

+ if (lowercaseAttrName) {

+ _attributeName = asciiUpper2Lower(_attributeName);

+ }

+ if (_attributeNames == null) _attributeNames = new Set();

+ if (_attributeNames.contains(_attributeName)) {

+ _addToken(new ParseErrorToken("duplicate-attribute"));

+ }

+ _attributeNames.add(_attributeName);

+ // XXX Fix for above XXX

+ if (emitToken) {

+ emitCurrentToken();

+ }

+ return true;

+ }

/// Reads one character after an attribute name has ended and decides whether
/// a value follows, the tag ends, or another attribute begins.
///
/// Always returns `true`.
bool afterAttributeNameState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    stream.charsUntil(spaceCharacters, true);
    return true;
  }
  if (ch == "=") {
    state = beforeAttributeValueState;
    return true;
  }
  if (ch == ">") {
    emitCurrentToken();
    return true;
  }
  if (isLetter(ch)) {
    _addAttribute(ch);
    state = attributeNameState;
    return true;
  }
  if (ch == "/") {
    state = selfClosingStartTagState;
    return true;
  }

  // What remains either ends the input or opens a new attribute.
  var firstChar = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    firstChar = "\uFFFD";
  } else if (ch == EOF) {
    _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof"));
    state = dataState;
    return true;
  } else if ("'\"<".contains(ch)) {
    _addToken(new ParseErrorToken("invalid-character-after-attribute-name"));
  }
  _addAttribute(firstChar);
  state = attributeNameState;
  return true;
}

/// Reads one character before an attribute value and selects the quoted or
/// unquoted value state.
///
/// Always returns `true`.
bool beforeAttributeValueState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    stream.charsUntil(spaceCharacters, true);
    return true;
  }
  if (ch == "\"") {
    _markAttributeValueStart(0);
    state = attributeValueDoubleQuotedState;
    return true;
  }
  if (ch == "&") {
    // The "&" is pushed back so the unquoted-value state sees it again.
    state = attributeValueUnQuotedState;
    stream.unget(ch);
    _markAttributeValueStart(0);
    return true;
  }
  if (ch == "'") {
    _markAttributeValueStart(0);
    state = attributeValueSingleQuotedState;
    return true;
  }
  if (ch == ">") {
    _addToken(new ParseErrorToken(
        "expected-attribute-value-but-got-right-bracket"));
    emitCurrentToken();
    return true;
  }

  // Remaining characters either end the input or become the first character
  // of an unquoted value.
  var firstChar = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    firstChar = "\uFFFD";
  } else if (ch == EOF) {
    _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof"));
    state = dataState;
    return true;
  } else if ("=<`".contains(ch)) {
    _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value"));
  }
  _markAttributeValueStart(-1);
  _attributeValue = '$_attributeValue$firstChar';
  state = attributeValueUnQuotedState;
  return true;
}

/// Reads one step of a double-quoted attribute value.
///
/// Always returns `true`.
bool attributeValueDoubleQuotedState() {
  final ch = stream.char();

  if (ch == "\"") {
    // Closing quote: the value and the attribute both end here.
    _markAttributeValueEnd(-1);
    _markAttributeEnd(0);
    state = afterAttributeValueState;
    return true;
  }
  if (ch == "&") {
    processEntityInAttribute('"');
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue = '${_attributeValue}\uFFFD';
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote"));
    _markAttributeValueEnd(-1);
    state = dataState;
    return true;
  }

  // Ordinary text: append this character and whatever the stream returns
  // for the following run.
  _attributeValue = '$_attributeValue$ch${stream.charsUntil("\"&")}';
  return true;
}

/// Reads one step of a single-quoted attribute value.
///
/// Always returns `true`.
bool attributeValueSingleQuotedState() {
  final ch = stream.char();

  if (ch == "'") {
    // Closing quote: the value and the attribute both end here.
    _markAttributeValueEnd(-1);
    _markAttributeEnd(0);
    state = afterAttributeValueState;
    return true;
  }
  if (ch == "&") {
    processEntityInAttribute("'");
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue = '${_attributeValue}\uFFFD';
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote"));
    _markAttributeValueEnd(-1);
    state = dataState;
    return true;
  }

  // Ordinary text: append this character and whatever the stream returns
  // for the following run.
  _attributeValue = '$_attributeValue$ch${stream.charsUntil("\'&")}';
  return true;
}

/// Reads one step of an unquoted attribute value.
///
/// Always returns `true`.
bool attributeValueUnQuotedState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    _markAttributeValueEnd(-1);
    state = beforeAttributeNameState;
    return true;
  }
  if (ch == "&") {
    processEntityInAttribute(">");
    return true;
  }
  if (ch == ">") {
    _markAttributeValueEnd(-1);
    emitCurrentToken();
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes"));
    _markAttributeValueEnd(-1);
    state = dataState;
    return true;
  }
  if ('"\'=<`'.contains(ch)) {
    // Reported, but the character is still kept in the value.
    _addToken(new ParseErrorToken(
        "unexpected-character-in-unquoted-attribute-value"));
    _attributeValue = '$_attributeValue$ch';
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue = '${_attributeValue}\uFFFD';
    return true;
  }

  // Ordinary text: append this character and whatever the stream returns
  // for the following run.
  _attributeValue = '$_attributeValue$ch'
      '${stream.charsUntil("&>\"\'=<`$spaceCharacters")}';
  return true;
}

/// Reads the character that follows a quoted attribute value.
///
/// Always returns `true`.
bool afterAttributeValueState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = beforeAttributeNameState;
    return true;
  }
  if (ch == ">") {
    emitCurrentToken();
    return true;
  }
  if (ch == "/") {
    state = selfClosingStartTagState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("unexpected-EOF-after-attribute-value"));
    stream.unget(ch);
    state = dataState;
    return true;
  }

  // Anything else is reported and pushed back for the next state.
  _addToken(new ParseErrorToken(
      "unexpected-character-after-attribute-value"));
  stream.unget(ch);
  state = beforeAttributeNameState;
  return true;
}

/// Reads the character that follows a "/" inside a tag.
///
/// Always returns `true`.
bool selfClosingStartTagState() {
  final ch = stream.char();

  if (ch == ">") {
    currentTagToken.selfClosing = true;
    emitCurrentToken();
    return true;
  }

  // Both remaining cases report an error and push the character back; only
  // the error code and the next state differ.
  if (ch == EOF) {
    _addToken(new ParseErrorToken("unexpected-EOF-after-solidus-in-tag"));
    stream.unget(ch);
    state = dataState;
    return true;
  }

  // NOTE(review): "soldius" looks like a typo for "solidus" (compare the EOF
  // code above) -- confirm against the error-message table before changing.
  _addToken(new ParseErrorToken(
      "unexpected-character-after-soldius-in-tag"));
  stream.unget(ch);
  state = beforeAttributeNameState;
  return true;
}

/// Emits a comment token holding everything up to the next ">" (or EOF) and
/// returns to the data state.
///
/// Always returns `true`.
bool bogusCommentState() {
  // charsUntil checks for EOF automatically, so this also stops at the end
  // of input. NUL characters are replaced by U+FFFD.
  final text = stream.charsUntil(">").replaceAll("\u0000", "\uFFFD");
  _addToken(new CommentToken(text));

  // Discard the character right after the bogus comment, which is either a
  // ">" or an EOF.
  stream.char();

  state = dataState;
  return true;
}

/// Decides which construct the characters being read here open.
///
/// Recognizes `--` (comment), a case-insensitive `doctype`, and `[CDATA[`
/// (only when [parser] is set and the innermost open element is outside the
/// default namespace). On a match the matching state is selected. Otherwise
/// every character read here is pushed back, a parse error is reported and
/// the bogus comment state takes over.
///
/// Always returns `true`.
bool markupDeclarationOpenState() {
  // Every character consumed here, so a failed match can be undone.
  var charStack = [stream.char()];

  if (charStack.last == "-") {
    charStack.add(stream.char());
    if (charStack.last == "-") {
      currentToken = new CommentToken("");
      state = commentStartState;
      return true;
    }
  } else if (charStack.last == 'd' || charStack.last == 'D') {
    var matched = true;
    // Each entry lists the accepted lower/upper-case form of the next letter.
    for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) {
      var char = stream.char();
      charStack.add(char);
      if (char == EOF || !expected.contains(char)) {
        matched = false;
        break;
      }
    }
    // Only checked once the whole keyword has been compared.
    if (matched) {
      currentToken = new DoctypeToken(correct: true);
      state = doctypeState;
      return true;
    }
  } else if (charStack.last == "[" &&
      parser != null && parser.tree.openElements.length > 0 &&
      parser.tree.openElements.last.namespace
          != parser.tree.defaultNamespace) {
    var matched = true;
    for (var expected in const ["C", "D", "A", "T", "A", "["]) {
      charStack.add(stream.char());
      if (charStack.last != expected) {
        matched = false;
        break;
      }
    }
    // Only checked once the whole keyword has been compared.
    if (matched) {
      state = cdataSectionState;
      return true;
    }
  }

  _addToken(new ParseErrorToken("expected-dashes-or-doctype"));

  // Push the characters back in reverse order of reading.
  while (charStack.length > 0) {
    stream.unget(charStack.removeLast());
  }
  state = bogusCommentState;
  return true;
}

/// Reads the first character of a comment.
///
/// Always returns `true`.
bool commentStartState() {
  final ch = stream.char();

  if (ch == "-") {
    state = commentStartDashState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = '${currentStringToken.data}\uFFFD';
    return true;
  }
  if (ch == ">" || ch == EOF) {
    // Either way the comment ends early: report it, emit it, go back to data.
    _addToken(new ParseErrorToken(
        ch == ">" ? "incorrect-comment" : "eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentStringToken.data = '${currentStringToken.data}$ch';
  state = commentState;
  return true;
}

/// Reads the character after a single "-" at the start of a comment.
///
/// Always returns `true`.
bool commentStartDashState() {
  final ch = stream.char();

  if (ch == "-") {
    state = commentEndState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    // The pending "-" is written out together with the replacement char.
    currentStringToken.data = '${currentStringToken.data}-\uFFFD';
    return true;
  }
  if (ch == ">" || ch == EOF) {
    // Either way the comment ends early: report it, emit it, go back to data.
    _addToken(new ParseErrorToken(
        ch == ">" ? "incorrect-comment" : "eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentStringToken.data = '${currentStringToken.data}-${ch}';
  state = commentState;
  return true;
}

/// Reads one step of a comment's body.
///
/// Always returns `true`.
bool commentState() {
  final ch = stream.char();

  if (ch == "-") {
    state = commentEndDashState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = '${currentStringToken.data}\uFFFD';
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Ordinary text: append this character and whatever the stream returns
  // for the following run.
  currentStringToken.data = '${currentStringToken.data}$ch'
      '${stream.charsUntil("-\u0000")}';
  return true;
}

/// Reads the character after a single "-" inside a comment body.
///
/// Always returns `true`.
bool commentEndDashState() {
  final ch = stream.char();

  if (ch == "-") {
    state = commentEndState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-end-dash"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // The dash was ordinary text after all: write it out followed by the new
  // character (NUL is reported and replaced by U+FFFD).
  var appended = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    appended = "\uFFFD";
  }
  currentStringToken.data = "${currentStringToken.data}-${appended}";
  state = commentState;
  return true;
}

/// Reads the character after "--" inside a comment.
///
/// Always returns `true`.
bool commentEndState() {
  final ch = stream.char();

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = '${currentStringToken.data}--\uFFFD';
    state = commentState;
    return true;
  }
  if (ch == "!") {
    _addToken(new ParseErrorToken(
        "unexpected-bang-after-double-dash-in-comment"));
    state = commentEndBangState;
    return true;
  }
  if (ch == "-") {
    // Stays in this state; the extra dash becomes part of the comment text.
    _addToken(new ParseErrorToken(
        "unexpected-dash-after-double-dash-in-comment"));
    currentStringToken.data = '${currentStringToken.data}$ch';
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-double-dash"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // XXX
  _addToken(new ParseErrorToken("unexpected-char-in-comment"));
  currentStringToken.data = "${currentStringToken.data}--${ch}";
  state = commentState;
  return true;
}

/// Reads the character after "--!" inside a comment.
///
/// Always returns `true`.
bool commentEndBangState() {
  final ch = stream.char();

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == "-") {
    currentStringToken.data = '${currentStringToken.data}--!';
    state = commentEndDashState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-end-bang-state"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // "--!" was ordinary text: write it out followed by the new character
  // (NUL is reported and replaced by U+FFFD).
  var appended = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    appended = "\uFFFD";
  }
  currentStringToken.data = "${currentStringToken.data}--!${appended}";
  state = commentState;
  return true;
}

/// Reads the character right after the doctype keyword.
///
/// Always returns `true`.
bool doctypeState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = beforeDoctypeNameState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken(
        "expected-doctype-name-but-got-eof"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // No separating space: report it and let the next state re-read the char.
  _addToken(new ParseErrorToken("need-space-after-doctype"));
  stream.unget(ch);
  state = beforeDoctypeNameState;
  return true;
}

/// Reads one character before the doctype name has started.
///
/// Always returns `true`.
bool beforeDoctypeNameState() {
  final ch = stream.char();

  // Whitespace is skipped without leaving this state.
  if (isWhitespace(ch)) return true;

  if (ch == ">" || ch == EOF) {
    // The doctype ends without a name: flag it, emit it, go back to data.
    _addToken(new ParseErrorToken(ch == ">"
        ? "expected-doctype-name-but-got-right-bracket"
        : "expected-doctype-name-but-got-eof"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.name = "\uFFFD";
  } else {
    currentDoctypeToken.name = ch;
  }
  state = doctypeNameState;
  return true;
}

/// Reads one character of the doctype name.
///
/// The name is lower-cased (via [asciiUpper2Lower]) whenever this state is
/// left.
///
/// Always returns `true`.
bool doctypeNameState() {
  final ch = stream.char();
  final doctype = currentDoctypeToken;

  if (isWhitespace(ch)) {
    doctype.name = asciiUpper2Lower(doctype.name);
    state = afterDoctypeNameState;
    return true;
  }
  if (ch == ">") {
    doctype.name = asciiUpper2Lower(doctype.name);
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    doctype.name = "${doctype.name}\uFFFD";
    state = doctypeNameState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype-name"));
    doctype.correct = false;
    doctype.name = asciiUpper2Lower(doctype.name);
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  doctype.name = '${doctype.name}$ch';
  return true;
}

/// Reads what follows the doctype name.
///
/// Skips whitespace, finishes the doctype on ">", and otherwise looks for a
/// case-insensitive `public` or `system` keyword. Anything else is reported
/// and handed to the bogus doctype state.
///
/// Always returns `true`.
bool afterDoctypeNameState() {
  var data = stream.char();
  if (isWhitespace(data)) {
    return true;
  } else if (data == ">") {
    _addToken(currentToken);
    state = dataState;
  } else if (data == EOF) {
    currentDoctypeToken.correct = false;
    stream.unget(data);
    _addToken(new ParseErrorToken("eof-in-doctype"));
    _addToken(currentToken);
    state = dataState;
  } else {
    if (data == "p" || data == "P") {
      // TODO(jmesserly): would be nice to have a helper for this.
      var matched = true;
      // Each entry lists the accepted lower/upper-case form of the next
      // letter.
      for (var expected in const ["uU", "bB", "lL", "iI", "cC"]) {
        data = stream.char();
        if (data == EOF || !expected.contains(data)) {
          matched = false;
          break;
        }
      }
      // Only checked once the whole keyword has been compared.
      if (matched) {
        state = afterDoctypePublicKeywordState;
        return true;
      }
    } else if (data == "s" || data == "S") {
      var matched = true;
      for (var expected in const ["yY", "sS", "tT", "eE", "mM"]) {
        data = stream.char();
        if (data == EOF || !expected.contains(data)) {
          matched = false;
          break;
        }
      }
      // Only checked once the whole keyword has been compared.
      if (matched) {
        state = afterDoctypeSystemKeywordState;
        return true;
      }
    }

    // All the characters read before the current 'data' will be
    // [a-zA-Z], so they're garbage in the bogus doctype and can be
    // discarded; only the latest character might be '>' or EOF
    // and needs to be ungetted
    stream.unget(data);
    _addToken(new ParseErrorToken(
        "expected-space-or-right-bracket-in-doctype",
        messageParams: {"data": data}));
    currentDoctypeToken.correct = false;
    state = bogusDoctypeState;
  }
  return true;
}

/// Reads the character right after the `public` keyword of a doctype.
///
/// Always returns `true`.
bool afterDoctypePublicKeywordState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = beforeDoctypePublicIdentifierState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Everything else is pushed back for the next state; a quote directly
  // after the keyword is additionally reported.
  if (ch == "'" || ch == '"') {
    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  }
  stream.unget(ch);
  state = beforeDoctypePublicIdentifierState;
  return true;
}

/// Reads one character before the doctype's public identifier.
///
/// Always returns `true`.
bool beforeDoctypePublicIdentifierState() {
  final ch = stream.char();

  // Whitespace is skipped without leaving this state.
  if (isWhitespace(ch)) return true;

  if (ch == "\"") {
    currentDoctypeToken.publicId = "";
    state = doctypePublicIdentifierDoubleQuotedState;
    return true;
  }
  if (ch == "'") {
    currentDoctypeToken.publicId = "";
    state = doctypePublicIdentifierSingleQuotedState;
    return true;
  }
  if (ch == ">" || ch == EOF) {
    // The doctype ends early: flag it, emit it, go back to data.
    _addToken(new ParseErrorToken(
        ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  state = bogusDoctypeState;
  return true;
}

/// Reads one character of a double-quoted doctype public identifier.
///
/// Always returns `true`.
bool doctypePublicIdentifierDoubleQuotedState() {
  final ch = stream.char();

  if (ch == '"') {
    state = afterDoctypePublicIdentifierState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
    return true;
  }
  if (ch == ">" || ch == EOF) {
    // The doctype ends early: flag it, emit it, go back to data.
    _addToken(new ParseErrorToken(
        ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$ch';
  return true;
}

/// Reads one character of a single-quoted doctype public identifier.
///
/// Always returns `true`.
bool doctypePublicIdentifierSingleQuotedState() {
  final ch = stream.char();

  if (ch == "'") {
    state = afterDoctypePublicIdentifierState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
    return true;
  }
  if (ch == ">" || ch == EOF) {
    // The doctype ends early: flag it, emit it, go back to data.
    _addToken(new ParseErrorToken(
        ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$ch';
  return true;
}

/// Reads the character right after the doctype's public identifier.
///
/// Always returns `true`.
bool afterDoctypePublicIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = betweenDoctypePublicAndSystemIdentifiersState;
    return true;
  }
  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == '"' || ch == "'") {
    // A system identifier starts with no space in between: reported, then
    // read normally.
    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
    currentDoctypeToken.systemId = "";
    state = ch == '"'
        ? doctypeSystemIdentifierDoubleQuotedState
        : doctypeSystemIdentifierSingleQuotedState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  state = bogusDoctypeState;
  return true;
}

/// Reads one character between the doctype's public and system identifiers.
///
/// Always returns `true`.
bool betweenDoctypePublicAndSystemIdentifiersState() {
  final ch = stream.char();

  // Whitespace is skipped without leaving this state.
  if (isWhitespace(ch)) return true;

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == '"' || ch == "'") {
    currentDoctypeToken.systemId = "";
    state = ch == '"'
        ? doctypeSystemIdentifierDoubleQuotedState
        : doctypeSystemIdentifierSingleQuotedState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  state = bogusDoctypeState;
  return true;
}

/// Reads the character right after the `system` keyword of a doctype.
///
/// Always returns `true`.
bool afterDoctypeSystemKeywordState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = beforeDoctypeSystemIdentifierState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Everything else is pushed back for the next state; a quote directly
  // after the keyword is additionally reported.
  if (ch == "'" || ch == '"') {
    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  }
  stream.unget(ch);
  state = beforeDoctypeSystemIdentifierState;
  return true;
}

/// Reads one character before the doctype's system identifier.
///
/// Always returns `true`.
bool beforeDoctypeSystemIdentifierState() {
  final ch = stream.char();

  // Whitespace is skipped without leaving this state.
  if (isWhitespace(ch)) return true;

  if (ch == "\"") {
    currentDoctypeToken.systemId = "";
    state = doctypeSystemIdentifierDoubleQuotedState;
    return true;
  }
  if (ch == "'") {
    currentDoctypeToken.systemId = "";
    state = doctypeSystemIdentifierSingleQuotedState;
    return true;
  }
  if (ch == ">" || ch == EOF) {
    // The doctype ends early: flag it, emit it, go back to data.
    _addToken(new ParseErrorToken(
        ch == ">" ? "unexpected-char-in-doctype" : "eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  state = bogusDoctypeState;
  return true;
}

/// Reads one character of a double-quoted doctype system identifier.
///
/// Always returns `true`.
bool doctypeSystemIdentifierDoubleQuotedState() {
  final ch = stream.char();

  if (ch == "\"") {
    state = afterDoctypeSystemIdentifierState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
    return true;
  }
  if (ch == ">" || ch == EOF) {
    // The doctype ends early: flag it, emit it, go back to data.
    _addToken(new ParseErrorToken(
        ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$ch';
  return true;
}

/// Reads one character of a single-quoted doctype system identifier.
///
/// Always returns `true`.
bool doctypeSystemIdentifierSingleQuotedState() {
  final ch = stream.char();

  if (ch == "'") {
    state = afterDoctypeSystemIdentifierState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
    return true;
  }
  if (ch == ">" || ch == EOF) {
    // The doctype ends early: flag it, emit it, go back to data.
    _addToken(new ParseErrorToken(
        ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$ch';
  return true;
}

/// Reads one character after the doctype's system identifier.
///
/// Always returns `true`.
bool afterDoctypeSystemIdentifierState() {
  final ch = stream.char();

  // Whitespace is skipped without leaving this state.
  if (isWhitespace(ch)) return true;

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Unlike the other doctype error paths, this one leaves the token's
  // `correct` flag untouched.
  _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  state = bogusDoctypeState;
  return true;
}

/// Discards characters of a malformed doctype until ">" or EOF, then emits
/// the current token and returns to the data state.
///
/// Always returns `true`.
bool bogusDoctypeState() {
  final ch = stream.char();

  // Everything except the terminator is silently dropped.
  if (ch != ">" && ch != EOF) return true;

  if (ch == EOF) {
    // XXX EMIT
    stream.unget(ch);
  }
  _addToken(currentToken);
  state = dataState;
  return true;
}

/// Collects characters until the terminator "]]>" or EOF and emits them as
/// one [CharactersToken], then returns to the data state.
///
/// NUL characters are reported as "invalid-codepoint" and replaced by
/// U+FFFD. The terminator itself is not part of the emitted text. No token
/// is emitted when nothing was collected.
///
/// Always returns `true`.
bool cdataSectionState() {
  var data = [];
  // Number of consecutive "]" characters at the end of [data], capped at 2.
  int matchedEnd = 0;
  while (true) {
    var ch = stream.char();
    if (ch == EOF) {
      break;
    }
    // Deal with null here rather than in the parser
    if (ch == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      ch = "\uFFFD";
    }
    data.add(ch);
    // TODO(jmesserly): it'd be nice if we had an easier way to match the end,
    // perhaps with a "peek" API.
    if (ch == "]") {
      // A third or later "]" still leaves "]]" at the end of the buffer, so
      // the count stays at 2 instead of being reset ("]]]>" must terminate).
      if (matchedEnd < 2) matchedEnd++;
    } else if (ch == ">" && matchedEnd == 2) {
      // Remove "]]>" from the end: all three characters were appended above.
      for (var i = 0; i < 3; i++) {
        data.removeLast();
      }
      break;
    } else {
      matchedEnd = 0;
    }
  }

  if (data.length > 0) {
    _addToken(new CharactersToken(data.join()));
  }
  state = dataState;
  return true;
}

« pkg/third_party/html5lib/html5lib.status ('K') | « pkg/third_party/html5lib/lib/src/token.dart ('k') | pkg/third_party/html5lib/lib/src/treebuilder.dart » ('j') | no next file with comments »