| OLD | NEW | 
|---|---|
|  | (Empty) | 
| 1 library tokenizer; |  | 
| 2 |  | 
| 3 import 'dart:collection'; |  | 
| 4 import 'package:html/parser.dart' show HtmlParser; |  | 
| 5 import 'constants.dart'; |  | 
| 6 import 'inputstream.dart'; |  | 
| 7 import 'token.dart'; |  | 
| 8 import 'utils.dart'; |  | 
| 9 |  | 
| 10 // Group entities by their first character, for faster lookups |  | 
| 11 |  | 
| 12 // TODO(jmesserly): we could use a better data structure here like a trie, if |  | 
| 13 // we had it implemented in Dart. |  | 
| 14 Map<String, List<String>> entitiesByFirstChar = (() { |  | 
| 15   var result = {}; |  | 
| 16   for (var k in entities.keys) { |  | 
| 17     result.putIfAbsent(k[0], () => []).add(k); |  | 
| 18   } |  | 
| 19   return result; |  | 
| 20 })(); |  | 
| 21 |  | 
| 22 // TODO(jmesserly): lots of ways to make this faster: |  | 
| 23 // - use char codes everywhere instead of 1-char strings |  | 
| 24 // - use switch instead of contains, indexOf |  | 
| 25 // - use switch instead of the sequential if tests |  | 
| 26 // - avoid string concat |  | 
| 27 |  | 
| 28 /// This class takes care of tokenizing HTML. |  | 
| 29 class HtmlTokenizer implements Iterator<Token> { |  | 
| 30   // TODO(jmesserly): a lot of these could be made private |  | 
| 31 |  | 
| 32   final HtmlInputStream stream; |  | 
| 33 |  | 
| 34   final bool lowercaseElementName; |  | 
| 35 |  | 
| 36   final bool lowercaseAttrName; |  | 
| 37 |  | 
| 38   /// True to generate spans for [Token.span]. |  | 
| 39   final bool generateSpans; |  | 
| 40 |  | 
| 41   /// True to generate spans for attributes. |  | 
| 42   final bool attributeSpans; |  | 
| 43 |  | 
| 44   /// This reference to the parser is used for correct CDATA handling. |  | 
| 45   /// The [HtmlParser] will set this at construction time. |  | 
| 46   HtmlParser parser; |  | 
| 47 |  | 
| 48   final Queue<Token> tokenQueue; |  | 
| 49 |  | 
| 50   /// Holds the token that is currently being processed. |  | 
| 51   Token currentToken; |  | 
| 52 |  | 
| 53   /// Holds a reference to the method to be invoked for the next parser state. |  | 
| 54   // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode |  | 
| 55   // bug prevents us from doing that. See http://dartbug.com/12465 |  | 
| 56   Function state; |  | 
| 57 |  | 
| 58   final StringBuffer _buffer = new StringBuffer(); |  | 
| 59 |  | 
| 60   int _lastOffset; |  | 
| 61 |  | 
| 62   // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add |  | 
| 63   // an item until it's ready. But the code doesn't have a clear notion of when |  | 
| 64   // it's "done" with the attribute. |  | 
| 65   List<TagAttribute> _attributes; |  | 
| 66   Set<String> _attributeNames; |  | 
| 67 |  | 
| 68   HtmlTokenizer(doc, {String encoding, bool parseMeta: true, |  | 
| 69       this.lowercaseElementName: true, this.lowercaseAttrName: true, |  | 
| 70       bool generateSpans: false, String sourceUrl, this.attributeSpans: false}) |  | 
| 71       : stream = new HtmlInputStream( |  | 
| 72           doc, encoding, parseMeta, generateSpans, sourceUrl), |  | 
| 73         tokenQueue = new Queue(), |  | 
| 74         generateSpans = generateSpans { |  | 
| 75     reset(); |  | 
| 76   } |  | 
| 77 |  | 
| 78   TagToken get currentTagToken => currentToken; |  | 
| 79   DoctypeToken get currentDoctypeToken => currentToken; |  | 
| 80   StringToken get currentStringToken => currentToken; |  | 
| 81 |  | 
| 82   Token _current; |  | 
| 83   Token get current => _current; |  | 
| 84 |  | 
| 85   final StringBuffer _attributeName = new StringBuffer(); |  | 
| 86   final StringBuffer _attributeValue = new StringBuffer(); |  | 
| 87 |  | 
| 88   void _markAttributeEnd(int offset) { |  | 
| 89     _attributes.last.value = '$_attributeValue'; |  | 
| 90     if (attributeSpans) _attributes.last.end = stream.position + offset; |  | 
| 91   } |  | 
| 92 |  | 
| 93   void _markAttributeValueStart(int offset) { |  | 
| 94     if (attributeSpans) _attributes.last.startValue = stream.position + offset; |  | 
| 95   } |  | 
| 96 |  | 
| 97   void _markAttributeValueEnd(int offset) { |  | 
| 98     if (attributeSpans) _attributes.last.endValue = stream.position + offset; |  | 
| 99     _markAttributeEnd(offset); |  | 
| 100   } |  | 
| 101 |  | 
| 102   // Note: we could track the name span here, if we need it. |  | 
| 103   void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset); |  | 
| 104 |  | 
| 105   void _addAttribute(String name) { |  | 
| 106     if (_attributes == null) _attributes = []; |  | 
| 107     _attributeName.clear(); |  | 
| 108     _attributeName.write(name); |  | 
| 109     _attributeValue.clear(); |  | 
| 110     var attr = new TagAttribute(); |  | 
| 111     _attributes.add(attr); |  | 
| 112     if (attributeSpans) attr.start = stream.position - name.length; |  | 
| 113   } |  | 
| 114 |  | 
| 115   /// This is where the magic happens. |  | 
| 116   /// |  | 
| 117   /// We do our usual processing through the states and when we have a token |  | 
| 118   /// to return we yield the token which pauses processing until the next token |  | 
| 119   /// is requested. |  | 
| 120   bool moveNext() { |  | 
| 121     // Start processing. When EOF is reached, state will return false |  | 
| 122     // instead of true and the loop will terminate. |  | 
| 123     while (stream.errors.length == 0 && tokenQueue.length == 0) { |  | 
| 124       if (!state()) { |  | 
| 125         _current = null; |  | 
| 126         return false; |  | 
| 127       } |  | 
| 128     } |  | 
| 129     if (stream.errors.length > 0) { |  | 
| 130       _current = new ParseErrorToken(stream.errors.removeFirst()); |  | 
| 131     } else { |  | 
| 132       assert(tokenQueue.length > 0); |  | 
| 133       _current = tokenQueue.removeFirst(); |  | 
| 134     } |  | 
| 135     return true; |  | 
| 136   } |  | 
| 137 |  | 
| 138   /// Resets the tokenizer state. Calling this does not reset the [stream] or |  | 
| 139   /// the [parser]. |  | 
| 140   void reset() { |  | 
| 141     _lastOffset = 0; |  | 
| 142     tokenQueue.clear(); |  | 
| 143     currentToken = null; |  | 
| 144     _buffer.clear(); |  | 
| 145     _attributes = null; |  | 
| 146     _attributeNames = null; |  | 
| 147     state = dataState; |  | 
| 148   } |  | 
| 149 |  | 
| 150   /// Adds a token to the queue. Sets the span if needed. |  | 
| 151   void _addToken(Token token) { |  | 
| 152     if (generateSpans && token.span == null) { |  | 
| 153       int offset = stream.position; |  | 
| 154       token.span = stream.fileInfo.span(_lastOffset, offset); |  | 
| 155       if (token is! ParseErrorToken) { |  | 
| 156         _lastOffset = offset; |  | 
| 157       } |  | 
| 158     } |  | 
| 159     tokenQueue.add(token); |  | 
| 160   } |  | 
| 161 |  | 
| 162   /// This function returns either U+FFFD or the character based on the |  | 
| 163   /// decimal or hexadecimal representation. It also discards ";" if present. |  | 
| 164   /// If not present it will add a [ParseErrorToken]. |  | 
| 165   String consumeNumberEntity(bool isHex) { |  | 
| 166     var allowed = isDigit; |  | 
| 167     var radix = 10; |  | 
| 168     if (isHex) { |  | 
| 169       allowed = isHexDigit; |  | 
| 170       radix = 16; |  | 
| 171     } |  | 
| 172 |  | 
| 173     var charStack = []; |  | 
| 174 |  | 
| 175     // Consume all the characters that are in range while making sure we |  | 
| 176     // don't hit an EOF. |  | 
| 177     var c = stream.char(); |  | 
| 178     while (allowed(c) && c != EOF) { |  | 
| 179       charStack.add(c); |  | 
| 180       c = stream.char(); |  | 
| 181     } |  | 
| 182 |  | 
| 183     // Convert the set of characters consumed to an int. |  | 
| 184     var charAsInt = parseIntRadix(charStack.join(), radix); |  | 
| 185 |  | 
| 186     // Certain characters get replaced with others |  | 
| 187     var char = replacementCharacters[charAsInt]; |  | 
| 188     if (char != null) { |  | 
| 189       _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity", |  | 
| 190           messageParams: {"charAsInt": charAsInt})); |  | 
| 191     } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) || |  | 
| 192         (charAsInt > 0x10FFFF)) { |  | 
| 193       char = "\uFFFD"; |  | 
| 194       _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity", |  | 
| 195           messageParams: {"charAsInt": charAsInt})); |  | 
| 196     } else { |  | 
| 197       // Should speed up this check somehow (e.g. move the set to a constant) |  | 
| 198       if ((0x0001 <= charAsInt && charAsInt <= 0x0008) || |  | 
| 199           (0x000E <= charAsInt && charAsInt <= 0x001F) || |  | 
| 200           (0x007F <= charAsInt && charAsInt <= 0x009F) || |  | 
| 201           (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) || |  | 
| 202           const [ |  | 
| 203         0x000B, |  | 
| 204         0xFFFE, |  | 
| 205         0xFFFF, |  | 
| 206         0x1FFFE, |  | 
| 207         0x1FFFF, |  | 
| 208         0x2FFFE, |  | 
| 209         0x2FFFF, |  | 
| 210         0x3FFFE, |  | 
| 211         0x3FFFF, |  | 
| 212         0x4FFFE, |  | 
| 213         0x4FFFF, |  | 
| 214         0x5FFFE, |  | 
| 215         0x5FFFF, |  | 
| 216         0x6FFFE, |  | 
| 217         0x6FFFF, |  | 
| 218         0x7FFFE, |  | 
| 219         0x7FFFF, |  | 
| 220         0x8FFFE, |  | 
| 221         0x8FFFF, |  | 
| 222         0x9FFFE, |  | 
| 223         0x9FFFF, |  | 
| 224         0xAFFFE, |  | 
| 225         0xAFFFF, |  | 
| 226         0xBFFFE, |  | 
| 227         0xBFFFF, |  | 
| 228         0xCFFFE, |  | 
| 229         0xCFFFF, |  | 
| 230         0xDFFFE, |  | 
| 231         0xDFFFF, |  | 
| 232         0xEFFFE, |  | 
| 233         0xEFFFF, |  | 
| 234         0xFFFFE, |  | 
| 235         0xFFFFF, |  | 
| 236         0x10FFFE, |  | 
| 237         0x10FFFF |  | 
| 238       ].contains(charAsInt)) { |  | 
| 239         _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity", |  | 
| 240             messageParams: {"charAsInt": charAsInt})); |  | 
| 241       } |  | 
| 242       char = new String.fromCharCodes([charAsInt]); |  | 
| 243     } |  | 
| 244 |  | 
| 245     // Discard the ; if present. Otherwise, put it back on the queue and |  | 
| 246     // invoke parseError on parser. |  | 
| 247     if (c != ";") { |  | 
| 248       _addToken(new ParseErrorToken("numeric-entity-without-semicolon")); |  | 
| 249       stream.unget(c); |  | 
| 250     } |  | 
| 251     return char; |  | 
| 252   } |  | 
| 253 |  | 
| 254   void consumeEntity({String allowedChar, bool fromAttribute: false}) { |  | 
| 255     // Initialise to the default output for when no entity is matched |  | 
| 256     var output = "&"; |  | 
| 257 |  | 
| 258     var charStack = [stream.char()]; |  | 
| 259     if (isWhitespace(charStack[0]) || |  | 
| 260         charStack[0] == '<' || |  | 
| 261         charStack[0] == '&' || |  | 
| 262         charStack[0] == EOF || |  | 
| 263         allowedChar == charStack[0]) { |  | 
| 264       stream.unget(charStack[0]); |  | 
| 265     } else if (charStack[0] == "#") { |  | 
| 266       // Read the next character to see if it's hex or decimal |  | 
| 267       bool hex = false; |  | 
| 268       charStack.add(stream.char()); |  | 
| 269       if (charStack.last == 'x' || charStack.last == 'X') { |  | 
| 270         hex = true; |  | 
| 271         charStack.add(stream.char()); |  | 
| 272       } |  | 
| 273 |  | 
| 274       // charStack.last should be the first digit |  | 
| 275       if (hex && isHexDigit(charStack.last) || |  | 
| 276           (!hex && isDigit(charStack.last))) { |  | 
| 277         // At least one digit found, so consume the whole number |  | 
| 278         stream.unget(charStack.last); |  | 
| 279         output = consumeNumberEntity(hex); |  | 
| 280       } else { |  | 
| 281         // No digits found |  | 
| 282         _addToken(new ParseErrorToken("expected-numeric-entity")); |  | 
| 283         stream.unget(charStack.removeLast()); |  | 
| 284         output = "&${charStack.join()}"; |  | 
| 285       } |  | 
| 286     } else { |  | 
| 287       // At this point in the process we might have a named entity. Entities |  | 
| 288       // are stored in the global variable "entities". |  | 
| 289       // |  | 
| 290       // Consume characters and compare them to a substring of the |  | 
| 291       // entity names in the list until the substring no longer matches. |  | 
| 292       var filteredEntityList = entitiesByFirstChar[charStack[0]]; |  | 
| 293       if (filteredEntityList == null) filteredEntityList = const []; |  | 
| 294 |  | 
| 295       while (charStack.last != EOF) { |  | 
| 296         var name = charStack.join(); |  | 
| 297         filteredEntityList = |  | 
| 298             filteredEntityList.where((e) => e.startsWith(name)).toList(); |  | 
| 299 |  | 
| 300         if (filteredEntityList.length == 0) { |  | 
| 301           break; |  | 
| 302         } |  | 
| 303         charStack.add(stream.char()); |  | 
| 304       } |  | 
| 305 |  | 
| 306       // At this point we have a string that starts with some characters |  | 
| 307       // that may match an entity |  | 
| 308       String entityName = null; |  | 
| 309 |  | 
| 310       // Try to find the longest entity the string will match to take care |  | 
| 311       // of &noti for instance. |  | 
| 312 |  | 
| 313       int entityLen; |  | 
| 314       for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) { |  | 
| 315         var possibleEntityName = charStack.sublist(0, entityLen).join(); |  | 
| 316         if (entities.containsKey(possibleEntityName)) { |  | 
| 317           entityName = possibleEntityName; |  | 
| 318           break; |  | 
| 319         } |  | 
| 320       } |  | 
| 321 |  | 
| 322       if (entityName != null) { |  | 
| 323         var lastChar = entityName[entityName.length - 1]; |  | 
| 324         if (lastChar != ";") { |  | 
| 325           _addToken(new ParseErrorToken("named-entity-without-semicolon")); |  | 
| 326         } |  | 
| 327         if (lastChar != ";" && |  | 
| 328             fromAttribute && |  | 
| 329             (isLetterOrDigit(charStack[entityLen]) || |  | 
| 330                 charStack[entityLen] == '=')) { |  | 
| 331           stream.unget(charStack.removeLast()); |  | 
| 332           output = "&${charStack.join()}"; |  | 
| 333         } else { |  | 
| 334           output = entities[entityName]; |  | 
| 335           stream.unget(charStack.removeLast()); |  | 
| 336           output = '${output}${slice(charStack, entityLen).join()}'; |  | 
| 337         } |  | 
| 338       } else { |  | 
| 339         _addToken(new ParseErrorToken("expected-named-entity")); |  | 
| 340         stream.unget(charStack.removeLast()); |  | 
| 341         output = "&${charStack.join()}"; |  | 
| 342       } |  | 
| 343     } |  | 
| 344     if (fromAttribute) { |  | 
| 345       _attributeValue.write(output); |  | 
| 346     } else { |  | 
| 347       var token; |  | 
| 348       if (isWhitespace(output)) { |  | 
| 349         token = new SpaceCharactersToken(output); |  | 
| 350       } else { |  | 
| 351         token = new CharactersToken(output); |  | 
| 352       } |  | 
| 353       _addToken(token); |  | 
| 354     } |  | 
| 355   } |  | 
| 356 |  | 
| 357   /// This method replaces the need for "entityInAttributeValueState". |  | 
| 358   void processEntityInAttribute(String allowedChar) { |  | 
| 359     consumeEntity(allowedChar: allowedChar, fromAttribute: true); |  | 
| 360   } |  | 
| 361 |  | 
| 362   /// This method is a generic handler for emitting the tags. It also sets |  | 
| 363   /// the state to "data" because that's what's needed after a token has been |  | 
| 364   /// emitted. |  | 
| 365   void emitCurrentToken() { |  | 
| 366     var token = currentToken; |  | 
| 367     // Add token to the queue to be yielded |  | 
| 368     if (token is TagToken) { |  | 
| 369       if (lowercaseElementName) { |  | 
| 370         token.name = asciiUpper2Lower(token.name); |  | 
| 371       } |  | 
| 372       if (token is EndTagToken) { |  | 
| 373         if (_attributes != null) { |  | 
| 374           _addToken(new ParseErrorToken("attributes-in-end-tag")); |  | 
| 375         } |  | 
| 376         if (token.selfClosing) { |  | 
| 377           _addToken(new ParseErrorToken("this-closing-flag-on-end-tag")); |  | 
| 378         } |  | 
| 379       } else if (token is StartTagToken) { |  | 
| 380         // HTML5 specific normalizations to the token stream. |  | 
| 381         // Convert the list into a map where first key wins. |  | 
| 382         token.data = new LinkedHashMap<Object, String>(); |  | 
| 383         if (_attributes != null) { |  | 
| 384           for (var attr in _attributes) { |  | 
| 385             token.data.putIfAbsent(attr.name, () => attr.value); |  | 
| 386           } |  | 
| 387           if (attributeSpans) token.attributeSpans = _attributes; |  | 
| 388         } |  | 
| 389       } |  | 
| 390       _attributes = null; |  | 
| 391       _attributeNames = null; |  | 
| 392     } |  | 
| 393     _addToken(token); |  | 
| 394     state = dataState; |  | 
| 395   } |  | 
| 396 |  | 
| 397   // Below are the various tokenizer states worked out. |  | 
| 398 |  | 
| 399   bool dataState() { |  | 
| 400     var data = stream.char(); |  | 
| 401     if (data == "&") { |  | 
| 402       state = entityDataState; |  | 
| 403     } else if (data == "<") { |  | 
| 404       state = tagOpenState; |  | 
| 405     } else if (data == "\u0000") { |  | 
| 406       _addToken(new ParseErrorToken("invalid-codepoint")); |  | 
| 407       _addToken(new CharactersToken("\u0000")); |  | 
| 408     } else if (data == EOF) { |  | 
| 409       // Tokenization ends. |  | 
| 410       return false; |  | 
| 411     } else if (isWhitespace(data)) { |  | 
| 412       // Directly after emitting a token you switch back to the "data |  | 
| 413       // state". At that point spaceCharacters are important so they are |  | 
| 414       // emitted separately. |  | 
| 415       _addToken(new SpaceCharactersToken( |  | 
| 416           '${data}${stream.charsUntil(spaceCharacters, true)}')); |  | 
| 417       // No need to update lastFourChars here, since the first space will |  | 
| 418       // have already been appended to lastFourChars and will have broken |  | 
| 419       // any <!-- or --> sequences |  | 
| 420     } else { |  | 
| 421       var chars = stream.charsUntil("&<\u0000"); |  | 
| 422       _addToken(new CharactersToken('${data}${chars}')); |  | 
| 423     } |  | 
| 424     return true; |  | 
| 425   } |  | 
| 426 |  | 
| 427   bool entityDataState() { |  | 
| 428     consumeEntity(); |  | 
| 429     state = dataState; |  | 
| 430     return true; |  | 
| 431   } |  | 
| 432 |  | 
| 433   bool rcdataState() { |  | 
| 434     var data = stream.char(); |  | 
| 435     if (data == "&") { |  | 
| 436       state = characterReferenceInRcdata; |  | 
| 437     } else if (data == "<") { |  | 
| 438       state = rcdataLessThanSignState; |  | 
| 439     } else if (data == EOF) { |  | 
| 440       // Tokenization ends. |  | 
| 441       return false; |  | 
| 442     } else if (data == "\u0000") { |  | 
| 443       _addToken(new ParseErrorToken("invalid-codepoint")); |  | 
| 444       _addToken(new CharactersToken("\uFFFD")); |  | 
| 445     } else if (isWhitespace(data)) { |  | 
| 446       // Directly after emitting a token you switch back to the "data |  | 
| 447       // state". At that point spaceCharacters are important so they are |  | 
| 448       // emitted separately. |  | 
| 449       _addToken(new SpaceCharactersToken( |  | 
| 450           '${data}${stream.charsUntil(spaceCharacters, true)}')); |  | 
| 451     } else { |  | 
| 452       var chars = stream.charsUntil("&<"); |  | 
| 453       _addToken(new CharactersToken('${data}${chars}')); |  | 
| 454     } |  | 
| 455     return true; |  | 
| 456   } |  | 
| 457 |  | 
| 458   bool characterReferenceInRcdata() { |  | 
| 459     consumeEntity(); |  | 
| 460     state = rcdataState; |  | 
| 461     return true; |  | 
| 462   } |  | 
| 463 |  | 
| 464   bool rawtextState() { |  | 
| 465     var data = stream.char(); |  | 
| 466     if (data == "<") { |  | 
| 467       state = rawtextLessThanSignState; |  | 
| 468     } else if (data == "\u0000") { |  | 
| 469       _addToken(new ParseErrorToken("invalid-codepoint")); |  | 
| 470       _addToken(new CharactersToken("\uFFFD")); |  | 
| 471     } else if (data == EOF) { |  | 
| 472       // Tokenization ends. |  | 
| 473       return false; |  | 
| 474     } else { |  | 
| 475       var chars = stream.charsUntil("<\u0000"); |  | 
| 476       _addToken(new CharactersToken("${data}${chars}")); |  | 
| 477     } |  | 
| 478     return true; |  | 
| 479   } |  | 
| 480 |  | 
| 481   bool scriptDataState() { |  | 
| 482     var data = stream.char(); |  | 
| 483     if (data == "<") { |  | 
| 484       state = scriptDataLessThanSignState; |  | 
| 485     } else if (data == "\u0000") { |  | 
| 486       _addToken(new ParseErrorToken("invalid-codepoint")); |  | 
| 487       _addToken(new CharactersToken("\uFFFD")); |  | 
| 488     } else if (data == EOF) { |  | 
| 489       // Tokenization ends. |  | 
| 490       return false; |  | 
| 491     } else { |  | 
| 492       var chars = stream.charsUntil("<\u0000"); |  | 
| 493       _addToken(new CharactersToken("${data}${chars}")); |  | 
| 494     } |  | 
| 495     return true; |  | 
| 496   } |  | 
| 497 |  | 
| 498   bool plaintextState() { |  | 
| 499     var data = stream.char(); |  | 
| 500     if (data == EOF) { |  | 
| 501       // Tokenization ends. |  | 
| 502       return false; |  | 
| 503     } else if (data == "\u0000") { |  | 
| 504       _addToken(new ParseErrorToken("invalid-codepoint")); |  | 
| 505       _addToken(new CharactersToken("\uFFFD")); |  | 
| 506     } else { |  | 
| 507       _addToken(new CharactersToken('${data}${stream.charsUntil("\u0000")}')); |  | 
| 508     } |  | 
| 509     return true; |  | 
| 510   } |  | 
| 511 |  | 
| 512   bool tagOpenState() { |  | 
| 513     var data = stream.char(); |  | 
| 514     if (data == "!") { |  | 
| 515       state = markupDeclarationOpenState; |  | 
| 516     } else if (data == "/") { |  | 
| 517       state = closeTagOpenState; |  | 
| 518     } else if (isLetter(data)) { |  | 
| 519       currentToken = new StartTagToken(data); |  | 
| 520       state = tagNameState; |  | 
| 521     } else if (data == ">") { |  | 
| 522       // XXX In theory it could be something besides a tag name. But |  | 
| 523       // do we really care? |  | 
| 524       _addToken(new ParseErrorToken("expected-tag-name-but-got-right-bracket")); |  | 
| 525       _addToken(new CharactersToken("<>")); |  | 
| 526       state = dataState; |  | 
| 527     } else if (data == "?") { |  | 
| 528       // XXX In theory it could be something besides a tag name. But |  | 
| 529       // do we really care? |  | 
| 530       _addToken(new ParseErrorToken("expected-tag-name-but-got-question-mark")); |  | 
| 531       stream.unget(data); |  | 
| 532       state = bogusCommentState; |  | 
| 533     } else { |  | 
| 534       // XXX |  | 
| 535       _addToken(new ParseErrorToken("expected-tag-name")); |  | 
| 536       _addToken(new CharactersToken("<")); |  | 
| 537       stream.unget(data); |  | 
| 538       state = dataState; |  | 
| 539     } |  | 
| 540     return true; |  | 
| 541   } |  | 
| 542 |  | 
| 543   bool closeTagOpenState() { |  | 
| 544     var data = stream.char(); |  | 
| 545     if (isLetter(data)) { |  | 
| 546       currentToken = new EndTagToken(data); |  | 
| 547       state = tagNameState; |  | 
| 548     } else if (data == ">") { |  | 
| 549       _addToken( |  | 
| 550           new ParseErrorToken("expected-closing-tag-but-got-right-bracket")); |  | 
| 551       state = dataState; |  | 
| 552     } else if (data == EOF) { |  | 
| 553       _addToken(new ParseErrorToken("expected-closing-tag-but-got-eof")); |  | 
| 554       _addToken(new CharactersToken("</")); |  | 
| 555       state = dataState; |  | 
| 556     } else { |  | 
| 557       // XXX data can be _'_... |  | 
| 558       _addToken(new ParseErrorToken("expected-closing-tag-but-got-char", |  | 
| 559           messageParams: {"data": data})); |  | 
| 560       stream.unget(data); |  | 
| 561       state = bogusCommentState; |  | 
| 562     } |  | 
| 563     return true; |  | 
| 564   } |  | 
| 565 |  | 
| 566   bool tagNameState() { |  | 
| 567     var data = stream.char(); |  | 
| 568     if (isWhitespace(data)) { |  | 
| 569       state = beforeAttributeNameState; |  | 
| 570     } else if (data == ">") { |  | 
| 571       emitCurrentToken(); |  | 
| 572     } else if (data == EOF) { |  | 
| 573       _addToken(new ParseErrorToken("eof-in-tag-name")); |  | 
| 574       state = dataState; |  | 
| 575     } else if (data == "/") { |  | 
| 576       state = selfClosingStartTagState; |  | 
| 577     } else if (data == "\u0000") { |  | 
| 578       _addToken(new ParseErrorToken("invalid-codepoint")); |  | 
| 579       currentTagToken.name = '${currentTagToken.name}\uFFFD'; |  | 
| 580     } else { |  | 
| 581       currentTagToken.name = '${currentTagToken.name}$data'; |  | 
| 582       // (Don't use charsUntil here, because tag names are |  | 
| 583       // very short and it's faster to not do anything fancy) |  | 
| 584     } |  | 
| 585     return true; |  | 
| 586   } |  | 
| 587 |  | 
| 588   bool rcdataLessThanSignState() { |  | 
| 589     var data = stream.char(); |  | 
| 590     if (data == "/") { |  | 
| 591       _buffer.clear(); |  | 
| 592       state = rcdataEndTagOpenState; |  | 
| 593     } else { |  | 
| 594       _addToken(new CharactersToken("<")); |  | 
| 595       stream.unget(data); |  | 
| 596       state = rcdataState; |  | 
| 597     } |  | 
| 598     return true; |  | 
| 599   } |  | 
| 600 |  | 
| 601   bool rcdataEndTagOpenState() { |  | 
| 602     var data = stream.char(); |  | 
| 603     if (isLetter(data)) { |  | 
| 604       _buffer.write(data); |  | 
| 605       state = rcdataEndTagNameState; |  | 
| 606     } else { |  | 
| 607       _addToken(new CharactersToken("</")); |  | 
| 608       stream.unget(data); |  | 
| 609       state = rcdataState; |  | 
| 610     } |  | 
| 611     return true; |  | 
| 612   } |  | 
| 613 |  | 
| 614   bool _tokenIsAppropriate() { |  | 
| 615     // TODO(jmesserly): this should use case insensitive compare instead. |  | 
| 616     return currentToken is TagToken && |  | 
| 617         currentTagToken.name.toLowerCase() == '$_buffer'.toLowerCase(); |  | 
| 618   } |  | 
| 619 |  | 
| 620   bool rcdataEndTagNameState() { |  | 
| 621     var appropriate = _tokenIsAppropriate(); |  | 
| 622     var data = stream.char(); |  | 
| 623     if (isWhitespace(data) && appropriate) { |  | 
| 624       currentToken = new EndTagToken('$_buffer'); |  | 
| 625       state = beforeAttributeNameState; |  | 
| 626     } else if (data == "/" && appropriate) { |  | 
| 627       currentToken = new EndTagToken('$_buffer'); |  | 
| 628       state = selfClosingStartTagState; |  | 
| 629     } else if (data == ">" && appropriate) { |  | 
| 630       currentToken = new EndTagToken('$_buffer'); |  | 
| 631       emitCurrentToken(); |  | 
| 632       state = dataState; |  | 
| 633     } else if (isLetter(data)) { |  | 
| 634       _buffer.write(data); |  | 
| 635     } else { |  | 
| 636       _addToken(new CharactersToken("</$_buffer")); |  | 
| 637       stream.unget(data); |  | 
| 638       state = rcdataState; |  | 
| 639     } |  | 
| 640     return true; |  | 
| 641   } |  | 
| 642 |  | 
| 643   bool rawtextLessThanSignState() { |  | 
| 644     var data = stream.char(); |  | 
| 645     if (data == "/") { |  | 
| 646       _buffer.clear(); |  | 
| 647       state = rawtextEndTagOpenState; |  | 
| 648     } else { |  | 
| 649       _addToken(new CharactersToken("<")); |  | 
| 650       stream.unget(data); |  | 
| 651       state = rawtextState; |  | 
| 652     } |  | 
| 653     return true; |  | 
| 654   } |  | 
| 655 |  | 
| 656   bool rawtextEndTagOpenState() { |  | 
| 657     var data = stream.char(); |  | 
| 658     if (isLetter(data)) { |  | 
| 659       _buffer.write(data); |  | 
| 660       state = rawtextEndTagNameState; |  | 
| 661     } else { |  | 
| 662       _addToken(new CharactersToken("</")); |  | 
| 663       stream.unget(data); |  | 
| 664       state = rawtextState; |  | 
| 665     } |  | 
| 666     return true; |  | 
| 667   } |  | 
| 668 |  | 
| 669   bool rawtextEndTagNameState() { |  | 
| 670     var appropriate = _tokenIsAppropriate(); |  | 
| 671     var data = stream.char(); |  | 
| 672     if (isWhitespace(data) && appropriate) { |  | 
| 673       currentToken = new EndTagToken('$_buffer'); |  | 
| 674       state = beforeAttributeNameState; |  | 
| 675     } else if (data == "/" && appropriate) { |  | 
| 676       currentToken = new EndTagToken('$_buffer'); |  | 
| 677       state = selfClosingStartTagState; |  | 
| 678     } else if (data == ">" && appropriate) { |  | 
| 679       currentToken = new EndTagToken('$_buffer'); |  | 
| 680       emitCurrentToken(); |  | 
| 681       state = dataState; |  | 
| 682     } else if (isLetter(data)) { |  | 
| 683       _buffer.write(data); |  | 
| 684     } else { |  | 
| 685       _addToken(new CharactersToken("</$_buffer")); |  | 
| 686       stream.unget(data); |  | 
| 687       state = rawtextState; |  | 
| 688     } |  | 
| 689     return true; |  | 
| 690   } |  | 
| 691 |  | 
| 692   bool scriptDataLessThanSignState() { |  | 
| 693     var data = stream.char(); |  | 
| 694     if (data == "/") { |  | 
| 695       _buffer.clear(); |  | 
| 696       state = scriptDataEndTagOpenState; |  | 
| 697     } else if (data == "!") { |  | 
| 698       _addToken(new CharactersToken("<!")); |  | 
| 699       state = scriptDataEscapeStartState; |  | 
| 700     } else { |  | 
| 701       _addToken(new CharactersToken("<")); |  | 
| 702       stream.unget(data); |  | 
| 703       state = scriptDataState; |  | 
| 704     } |  | 
| 705     return true; |  | 
| 706   } |  | 
| 707 |  | 
| 708   bool scriptDataEndTagOpenState() { |  | 
| 709     var data = stream.char(); |  | 
| 710     if (isLetter(data)) { |  | 
| 711       _buffer.write(data); |  | 
| 712       state = scriptDataEndTagNameState; |  | 
| 713     } else { |  | 
| 714       _addToken(new CharactersToken("</")); |  | 
| 715       stream.unget(data); |  | 
| 716       state = scriptDataState; |  | 
| 717     } |  | 
| 718     return true; |  | 
| 719   } |  | 
| 720 |  | 
| 721   bool scriptDataEndTagNameState() { |  | 
| 722     var appropriate = _tokenIsAppropriate(); |  | 
| 723     var data = stream.char(); |  | 
| 724     if (isWhitespace(data) && appropriate) { |  | 
| 725       currentToken = new EndTagToken('$_buffer'); |  | 
| 726       state = beforeAttributeNameState; |  | 
| 727     } else if (data == "/" && appropriate) { |  | 
| 728       currentToken = new EndTagToken('$_buffer'); |  | 
| 729       state = selfClosingStartTagState; |  | 
| 730     } else if (data == ">" && appropriate) { |  | 
| 731       currentToken = new EndTagToken('$_buffer'); |  | 
| 732       emitCurrentToken(); |  | 
| 733       state = dataState; |  | 
| 734     } else if (isLetter(data)) { |  | 
| 735       _buffer.write(data); |  | 
| 736     } else { |  | 
| 737       _addToken(new CharactersToken("</$_buffer")); |  | 
| 738       stream.unget(data); |  | 
| 739       state = scriptDataState; |  | 
| 740     } |  | 
| 741     return true; |  | 
| 742   } |  | 
| 743 |  | 
/// Consumes one character at the start of a possible script escape.
///
/// A "-" is emitted as text and moves to [scriptDataEscapeStartDashState];
/// any other character is pushed back for [scriptDataState].
bool scriptDataEscapeStartState() {
  final ch = stream.char();
  if (ch != "-") {
    stream.unget(ch);
    state = scriptDataState;
    return true;
  }
  _addToken(new CharactersToken("-"));
  state = scriptDataEscapeStartDashState;
  return true;
}
| 755 |  | 
/// Consumes the character after the first "-" of a possible script escape.
///
/// A second "-" is emitted as text and moves to
/// [scriptDataEscapedDashDashState]; any other character is pushed back for
/// [scriptDataState].
bool scriptDataEscapeStartDashState() {
  final ch = stream.char();
  if (ch != "-") {
    stream.unget(ch);
    state = scriptDataState;
    return true;
  }
  _addToken(new CharactersToken("-"));
  state = scriptDataEscapedDashDashState;
  return true;
}
| 767 |  | 
/// Tokenizes escaped script data.
///
/// "-" and "<" switch to the dash / less-than sub-states, U+0000 is
/// reported and replaced with U+FFFD, EOF returns to [dataState], and any
/// other text is emitted in bulk up to the next "<", "-" or U+0000.
bool scriptDataEscapedState() {
  final ch = stream.char();
  if (ch == EOF) {
    state = dataState;
    return true;
  }
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    state = scriptDataEscapedDashState;
    return true;
  }
  if (ch == "<") {
    state = scriptDataEscapedLessThanSignState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    return true;
  }
  // Plain text: take the whole run up to the next significant character.
  final rest = stream.charsUntil("<-\u0000");
  _addToken(new CharactersToken("$ch$rest"));
  return true;
}
| 786 |  | 
/// Handles the character after a single "-" in escaped script data.
///
/// Another "-" moves to [scriptDataEscapedDashDashState], "<" to
/// [scriptDataEscapedLessThanSignState], EOF to [dataState]; everything
/// else is emitted (U+0000 as U+FFFD with a parse error) and returns to
/// [scriptDataEscapedState].
bool scriptDataEscapedDashState() {
  final ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    state = scriptDataEscapedDashDashState;
    return true;
  }
  if (ch == "<") {
    state = scriptDataEscapedLessThanSignState;
    return true;
  }
  if (ch == EOF) {
    state = dataState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
  } else {
    _addToken(new CharactersToken(ch));
  }
  state = scriptDataEscapedState;
  return true;
}
| 806 |  | 
/// Handles the character after "--" in escaped script data.
///
/// Further dashes are emitted without leaving this state, "<" moves to
/// [scriptDataEscapedLessThanSignState], ">" is emitted and returns to
/// [scriptDataState], and EOF moves to [dataState]. Any other character is
/// emitted (U+0000 as U+FFFD with a parse error) and returns to
/// [scriptDataEscapedState].
bool scriptDataEscapedDashDashState() {
  final ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    return true;
  }
  if (ch == "<") {
    state = scriptDataEscapedLessThanSignState;
    return true;
  }
  if (ch == ">") {
    _addToken(new CharactersToken(">"));
    state = scriptDataState;
    return true;
  }
  if (ch == EOF) {
    state = dataState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
  } else {
    _addToken(new CharactersToken(ch));
  }
  state = scriptDataEscapedState;
  return true;
}
| 828 |  | 
/// Handles the character after "<" in escaped script data.
///
/// "/" resets [_buffer] and moves to [scriptDataEscapedEndTagOpenState]. A
/// letter emits "<" plus that letter, restarts [_buffer] with it and moves
/// to [scriptDataDoubleEscapeStartState]. Anything else emits "<" and is
/// reprocessed by [scriptDataEscapedState].
bool scriptDataEscapedLessThanSignState() {
  final ch = stream.char();
  if (ch == "/") {
    _buffer.clear();
    state = scriptDataEscapedEndTagOpenState;
    return true;
  }
  if (!isLetter(ch)) {
    _addToken(new CharactersToken("<"));
    stream.unget(ch);
    state = scriptDataEscapedState;
    return true;
  }
  _addToken(new CharactersToken("<$ch"));
  _buffer.clear();
  _buffer.write(ch);
  state = scriptDataDoubleEscapeStartState;
  return true;
}
| 846 |  | 
/// Consumes the character that follows `</` in escaped script data.
///
/// A letter restarts [_buffer] with that letter and moves to
/// [scriptDataEscapedEndTagNameState]; anything else emits `</` as text
/// and is reprocessed by [scriptDataEscapedState].
bool scriptDataEscapedEndTagOpenState() {
  final ch = stream.char();
  if (!isLetter(ch)) {
    _addToken(new CharactersToken("</"));
    stream.unget(ch);
    state = scriptDataEscapedState;
    return true;
  }
  _buffer.clear();
  _buffer.write(ch);
  state = scriptDataEscapedEndTagNameState;
  return true;
}
| 860 |  | 
/// Collects the name of a possible end tag inside escaped script data.
///
/// Letters are appended to [_buffer]. Whitespace, "/" or ">" finish the
/// name and create an [EndTagToken], but only when [_tokenIsAppropriate]
/// reports true. Every other input emits `</` plus the buffered text as
/// characters and returns to [scriptDataEscapedState].
bool scriptDataEscapedEndTagNameState() {
  // Evaluated before consuming the next character, as in the original order.
  final isAppropriate = _tokenIsAppropriate();
  final ch = stream.char();

  if (isLetter(ch)) {
    _buffer.write(ch);
    return true;
  }

  if (isAppropriate) {
    if (isWhitespace(ch)) {
      currentToken = new EndTagToken('$_buffer');
      state = beforeAttributeNameState;
      return true;
    }
    if (ch == "/") {
      currentToken = new EndTagToken('$_buffer');
      state = selfClosingStartTagState;
      return true;
    }
    if (ch == ">") {
      currentToken = new EndTagToken('$_buffer');
      emitCurrentToken();
      state = dataState;
      return true;
    }
  }

  // Not a usable end tag: the consumed text stays escaped script data.
  _addToken(new CharactersToken("</$_buffer"));
  stream.unget(ch);
  state = scriptDataEscapedState;
  return true;
}
| 883 |  | 
/// Reads a tag-like name after "<" in escaped script data.
///
/// Letters are emitted and appended to [_buffer]. Whitespace, "/" or ">"
/// end the name: if the buffer spells "script" (ignoring case) the
/// tokenizer enters [scriptDataDoubleEscapedState], otherwise it goes back
/// to [scriptDataEscapedState]. Any other character is pushed back for
/// [scriptDataEscapedState].
bool scriptDataDoubleEscapeStartState() {
  final ch = stream.char();
  if (isWhitespace(ch) || ch == "/" || ch == ">") {
    _addToken(new CharactersToken(ch));
    final isScript = _buffer.toString().toLowerCase() == "script";
    state = isScript ? scriptDataDoubleEscapedState : scriptDataEscapedState;
    return true;
  }
  if (isLetter(ch)) {
    _addToken(new CharactersToken(ch));
    _buffer.write(ch);
    return true;
  }
  stream.unget(ch);
  state = scriptDataEscapedState;
  return true;
}
| 902 |  | 
/// Tokenizes double-escaped script data one character at a time.
///
/// "-" and "<" are emitted and switch to the matching sub-state, U+0000 is
/// reported and replaced with U+FFFD, EOF is a parse error that returns to
/// [dataState], and every other character is emitted unchanged.
bool scriptDataDoubleEscapedState() {
  final ch = stream.char();
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-script-in-script"));
    state = dataState;
    return true;
  }
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    state = scriptDataDoubleEscapedDashState;
    return true;
  }
  if (ch == "<") {
    _addToken(new CharactersToken("<"));
    state = scriptDataDoubleEscapedLessThanSignState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    return true;
  }
  _addToken(new CharactersToken(ch));
  return true;
}
| 922 |  | 
/// Handles the character after a single "-" in double-escaped script data.
///
/// "-" and "<" are emitted and move to the dash-dash / less-than states,
/// EOF is a parse error that returns to [dataState], and anything else is
/// emitted (U+0000 as U+FFFD with a parse error) before returning to
/// [scriptDataDoubleEscapedState].
bool scriptDataDoubleEscapedDashState() {
  final ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    state = scriptDataDoubleEscapedDashDashState;
    return true;
  }
  if (ch == "<") {
    _addToken(new CharactersToken("<"));
    state = scriptDataDoubleEscapedLessThanSignState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-script-in-script"));
    state = dataState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
  } else {
    _addToken(new CharactersToken(ch));
  }
  state = scriptDataDoubleEscapedState;
  return true;
}
| 944 |  | 
// TODO(jmesserly): report bug in original code: the method this was ported
// from was named "...Dash" instead of "...DashDash".

/// Handles the character after "--" in double-escaped script data.
///
/// Further dashes are emitted without leaving this state, "<" is emitted
/// and moves to [scriptDataDoubleEscapedLessThanSignState], ">" is emitted
/// and returns to [scriptDataState], and EOF is a parse error that returns
/// to [dataState]. Anything else is emitted (U+0000 as U+FFFD with a parse
/// error) before returning to [scriptDataDoubleEscapedState].
bool scriptDataDoubleEscapedDashDashState() {
  final ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    return true;
  }
  if (ch == "<") {
    _addToken(new CharactersToken("<"));
    state = scriptDataDoubleEscapedLessThanSignState;
    return true;
  }
  if (ch == ">") {
    _addToken(new CharactersToken(">"));
    state = scriptDataState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-script-in-script"));
    state = dataState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
  } else {
    _addToken(new CharactersToken(ch));
  }
  state = scriptDataDoubleEscapedState;
  return true;
}
| 970 |  | 
/// Handles the character after "<" in double-escaped script data.
///
/// "/" is emitted, [_buffer] is reset and the tokenizer moves to
/// [scriptDataDoubleEscapeEndState]; anything else is pushed back for
/// [scriptDataDoubleEscapedState].
bool scriptDataDoubleEscapedLessThanSignState() {
  final ch = stream.char();
  if (ch != "/") {
    stream.unget(ch);
    state = scriptDataDoubleEscapedState;
    return true;
  }
  _addToken(new CharactersToken("/"));
  _buffer.clear();
  state = scriptDataDoubleEscapeEndState;
  return true;
}
| 983 |  | 
/// Reads a tag-like name after `</` in double-escaped script data.
///
/// Letters are emitted and appended to [_buffer]. Whitespace, "/" or ">"
/// end the name: if the buffer spells "script" (ignoring case) the
/// tokenizer drops back to [scriptDataEscapedState], otherwise it stays
/// double-escaped. Any other character is pushed back for
/// [scriptDataDoubleEscapedState].
bool scriptDataDoubleEscapeEndState() {
  final ch = stream.char();
  if (isWhitespace(ch) || ch == "/" || ch == ">") {
    _addToken(new CharactersToken(ch));
    final isScript = _buffer.toString().toLowerCase() == "script";
    state = isScript ? scriptDataEscapedState : scriptDataDoubleEscapedState;
    return true;
  }
  if (isLetter(ch)) {
    _addToken(new CharactersToken(ch));
    _buffer.write(ch);
    return true;
  }
  stream.unget(ch);
  state = scriptDataDoubleEscapedState;
  return true;
}
| 1002 |  | 
/// Skips whitespace inside a tag and starts the next attribute.
///
/// ">" emits the current token, "/" moves to [selfClosingStartTagState] and
/// EOF is a parse error that returns to [dataState]. Every other character
/// begins a new attribute via [_addAttribute] and moves to
/// [attributeNameState]; quote, "=" and "<" characters as well as U+0000
/// additionally raise a parse error, and U+0000 is replaced by U+FFFD.
bool beforeAttributeNameState() {
  final ch = stream.char();
  if (isWhitespace(ch)) {
    // Swallow the remainder of the whitespace run.
    stream.charsUntil(spaceCharacters, true);
    return true;
  }
  if (ch == ">") {
    emitCurrentToken();
    return true;
  }
  if (ch == "/") {
    state = selfClosingStartTagState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));
    state = dataState;
    return true;
  }

  // Everything that is left opens a new attribute name.
  var firstChar = ch;
  if ("'\"=<".contains(ch)) {
    _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
  } else if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    firstChar = "\uFFFD";
  }
  _addAttribute(firstChar);
  state = attributeNameState;
  return true;
}
| 1031 |  | 
/// Accumulates the current attribute name and finalizes it when it ends.
///
/// Name characters are appended to [_attributeName]. When "=", ">",
/// whitespace, "/" or EOF ends the name, the (optionally lower-cased) name
/// is stored on the last attribute, a "duplicate-attribute" parse error is
/// queued if the name was already seen for this tag, and for ">" the
/// current token is emitted afterwards.
bool attributeNameState() {
  final ch = stream.char();
  var emitAfterwards = false;

  if (ch == "=") {
    state = beforeAttributeValueState;
  } else if (ch == ">") {
    // Emitting is deferred until the name has been recorded below, so the
    // attribute is complete before the token leaves the tokenizer.
    emitAfterwards = true;
  } else if (isWhitespace(ch)) {
    state = afterAttributeNameState;
  } else if (ch == "/") {
    state = selfClosingStartTagState;
  } else if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-name"));
    state = dataState;
  } else {
    // Still inside the name: append and stay in this state.
    if (isLetter(ch)) {
      _attributeName.write(ch);
      _attributeName.write(stream.charsUntil(asciiLetters, true));
    } else if (ch == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _attributeName.write('\uFFFD');
    } else {
      if ("'\"<".contains(ch)) {
        _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
      }
      _attributeName.write(ch);
    }
    return true;
  }

  _markAttributeNameEnd(-1);

  // Attributes are not dropped at this stage. That happens when the start
  // tag token is emitted, so values can still be safely appended to
  // attributes; the duplicate is only reported here, in time.
  var finishedName = _attributeName.toString();
  if (lowercaseAttrName) {
    finishedName = asciiUpper2Lower(finishedName);
  }
  _attributes.last.name = finishedName;
  if (_attributeNames == null) _attributeNames = new Set();
  if (_attributeNames.contains(finishedName)) {
    _addToken(new ParseErrorToken("duplicate-attribute"));
  }
  _attributeNames.add(finishedName);

  if (emitAfterwards) {
    emitCurrentToken();
  }
  return true;
}
| 1091 |  | 
/// Decides what follows a completed attribute name.
///
/// Whitespace is skipped, "=" moves to [beforeAttributeValueState], ">"
/// emits the current token, "/" moves to [selfClosingStartTagState] and EOF
/// is a parse error that returns to [dataState]. Every other character
/// begins a new attribute via [_addAttribute]; quote and "<" characters as
/// well as U+0000 additionally raise a parse error, and U+0000 is replaced
/// by U+FFFD.
bool afterAttributeNameState() {
  final ch = stream.char();
  if (isWhitespace(ch)) {
    stream.charsUntil(spaceCharacters, true);
    return true;
  }
  if (ch == "=") {
    state = beforeAttributeValueState;
    return true;
  }
  if (ch == ">") {
    emitCurrentToken();
    return true;
  }
  if (ch == "/") {
    state = selfClosingStartTagState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof"));
    state = dataState;
    return true;
  }

  // Everything that is left opens a new attribute name.
  var firstChar = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    firstChar = "\uFFFD";
  } else if ("'\"<".contains(ch)) {
    _addToken(new ParseErrorToken("invalid-character-after-attribute-name"));
  }
  _addAttribute(firstChar);
  state = attributeNameState;
  return true;
}
| 1122 |  | 
/// Determines how the value of the current attribute is delimited.
///
/// Whitespace is skipped. A double or single quote selects the matching
/// quoted state, and "&" is pushed back and handled by the unquoted state.
/// ">" is a parse error that emits the current token, and EOF is a parse
/// error that returns to [dataState]. Any other character becomes the first
/// character of an unquoted value; U+0000 (replaced by U+FFFD) and "=", "<"
/// or "`" also raise a parse error.
bool beforeAttributeValueState() {
  final ch = stream.char();
  if (isWhitespace(ch)) {
    stream.charsUntil(spaceCharacters, true);
    return true;
  }
  if (ch == "\"") {
    _markAttributeValueStart(0);
    state = attributeValueDoubleQuotedState;
    return true;
  }
  if (ch == "'") {
    _markAttributeValueStart(0);
    state = attributeValueSingleQuotedState;
    return true;
  }
  if (ch == "&") {
    state = attributeValueUnQuotedState;
    // Push the "&" back before marking the start, as the original does.
    stream.unget(ch);
    _markAttributeValueStart(0);
    return true;
  }
  if (ch == ">") {
    _addToken(new ParseErrorToken(
        "expected-attribute-value-but-got-right-bracket"));
    emitCurrentToken();
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof"));
    state = dataState;
    return true;
  }

  // The character itself starts an unquoted value.
  var firstChar = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    firstChar = '\uFFFD';
  } else if ("=<`".contains(ch)) {
    _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value"));
  }
  _markAttributeValueStart(-1);
  _attributeValue.write(firstChar);
  state = attributeValueUnQuotedState;
  return true;
}
| 1161 |  | 
/// Reads the body of a double-quoted attribute value.
///
/// The closing quote ends the value and moves to
/// [afterAttributeValueState]; "&" is handed to [processEntityInAttribute];
/// U+0000 raises a parse error and is replaced by U+FFFD; EOF is a parse
/// error that returns to [dataState]. Any other text is appended to
/// [_attributeValue] in bulk.
bool attributeValueDoubleQuotedState() {
  var data = stream.char();
  if (data == "\"") {
    _markAttributeValueEnd(-1);
    _markAttributeEnd(0);
    state = afterAttributeValueState;
  } else if (data == "&") {
    processEntityInAttribute('"');
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue.write('\uFFFD');
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote"));
    _markAttributeValueEnd(-1);
    state = dataState;
  } else {
    _attributeValue.write(data);
    // The bulk read must also stop at U+0000; otherwise a NUL that follows
    // ordinary text is copied verbatim and never reaches the replacement
    // branch above.
    _attributeValue.write(stream.charsUntil("\"&\u0000"));
  }
  return true;
}
| 1183 |  | 
/// Reads the body of a single-quoted attribute value.
///
/// The closing quote ends the value and moves to
/// [afterAttributeValueState]; "&" is handed to [processEntityInAttribute];
/// U+0000 raises a parse error and is replaced by U+FFFD; EOF is a parse
/// error that returns to [dataState]. Any other text is appended to
/// [_attributeValue] in bulk.
bool attributeValueSingleQuotedState() {
  var data = stream.char();
  if (data == "'") {
    _markAttributeValueEnd(-1);
    _markAttributeEnd(0);
    state = afterAttributeValueState;
  } else if (data == "&") {
    processEntityInAttribute("'");
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue.write('\uFFFD');
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote"));
    _markAttributeValueEnd(-1);
    state = dataState;
  } else {
    _attributeValue.write(data);
    // The bulk read must also stop at U+0000; otherwise a NUL that follows
    // ordinary text is copied verbatim and never reaches the replacement
    // branch above.
    _attributeValue.write(stream.charsUntil("\'&\u0000"));
  }
  return true;
}
| 1205 |  | 
/// Reads the body of an unquoted attribute value.
///
/// Whitespace ends the value and moves to [beforeAttributeNameState]; ">"
/// ends it and emits the current token; "&" is handed to
/// [processEntityInAttribute]; EOF is a parse error that returns to
/// [dataState]. Quote, "=", "<" and "`" characters are kept but raise a
/// parse error, and U+0000 raises a parse error and is replaced by U+FFFD.
/// Any other text is appended to [_attributeValue] in bulk.
bool attributeValueUnQuotedState() {
  var data = stream.char();
  if (isWhitespace(data)) {
    _markAttributeValueEnd(-1);
    state = beforeAttributeNameState;
  } else if (data == "&") {
    processEntityInAttribute(">");
  } else if (data == ">") {
    _markAttributeValueEnd(-1);
    emitCurrentToken();
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes"));
    _markAttributeValueEnd(-1);
    state = dataState;
  } else if ('"\'=<`'.contains(data)) {
    _addToken(new ParseErrorToken(
        "unexpected-character-in-unquoted-attribute-value"));
    _attributeValue.write(data);
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue.write('\uFFFD');
  } else {
    _attributeValue.write(data);
    // The bulk read must also stop at U+0000; otherwise a NUL that follows
    // ordinary text is copied verbatim and never reaches the replacement
    // branch above.
    _attributeValue
        .write(stream.charsUntil("&>\"\'=<`\u0000$spaceCharacters"));
  }
  return true;
}
| 1233 |  | 
/// Handles the character that follows a quoted attribute value.
///
/// Whitespace moves to [beforeAttributeNameState], ">" emits the current
/// token and "/" moves to [selfClosingStartTagState]. Anything else is a
/// parse error and is pushed back: EOF is then handled by [dataState],
/// other characters by [beforeAttributeNameState].
bool afterAttributeValueState() {
  final ch = stream.char();
  if (isWhitespace(ch)) {
    state = beforeAttributeNameState;
    return true;
  }
  if (ch == ">") {
    emitCurrentToken();
    return true;
  }
  if (ch == "/") {
    state = selfClosingStartTagState;
    return true;
  }

  final atEof = ch == EOF;
  _addToken(new ParseErrorToken(atEof
      ? "unexpected-EOF-after-attribute-value"
      : "unexpected-character-after-attribute-value"));
  stream.unget(ch);
  state = atEof ? dataState : beforeAttributeNameState;
  return true;
}
| 1254 |  | 
/// Handles the character after "/" inside a tag.
///
/// ">" marks the current tag token as self-closing and emits it. Anything
/// else is a parse error and is pushed back: EOF is then handled by
/// [dataState], other characters by [beforeAttributeNameState].
bool selfClosingStartTagState() {
  final ch = stream.char();
  if (ch == ">") {
    currentTagToken.selfClosing = true;
    emitCurrentToken();
    return true;
  }

  final atEof = ch == EOF;
  // NOTE: the "soldius" spelling is the error code as used at runtime.
  _addToken(new ParseErrorToken(atEof
      ? "unexpected-EOF-after-solidus-in-tag"
      : "unexpected-character-after-soldius-in-tag"));
  stream.unget(ch);
  state = atEof ? dataState : beforeAttributeNameState;
  return true;
}
| 1272 |  | 
/// Emits everything up to the next ">" (or EOF) as a single comment token.
///
/// U+0000 characters in the comment text are replaced by U+FFFD. The
/// terminating character is consumed and the tokenizer returns to
/// [dataState].
bool bogusCommentState() {
  // charsUntil stops at EOF on its own, so no explicit EOF check is needed.
  final text = stream.charsUntil(">").replaceAll("\u0000", "\uFFFD");
  _addToken(new CommentToken(text));

  // Discard the terminator that follows the comment: either ">" or EOF.
  stream.char();
  state = dataState;
  return true;
}
| 1287 |  | 
/// Dispatches on the text read at the start of a markup declaration.
///
/// "--" starts a comment, a case-insensitive "doctype" starts a doctype
/// token, and "[CDATA[" starts a CDATA section, but only when a [parser] is
/// attached and its innermost open element is outside the tree's default
/// namespace. If none of these match, a parse error is queued, every
/// consumed character is pushed back and the input is treated as a bogus
/// comment.
bool markupDeclarationOpenState() {
  // Everything read here is remembered so it can be pushed back on failure.
  final consumed = [stream.char()];
  final first = consumed.last;

  if (first == "-") {
    final second = stream.char();
    consumed.add(second);
    if (second == "-") {
      currentToken = new CommentToken();
      state = commentStartState;
      return true;
    }
  } else if (first == 'd' || first == 'D') {
    // Each entry lists the accepted lower/upper-case form of one letter.
    var isDoctype = true;
    for (var accepted in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) {
      final next = stream.char();
      consumed.add(next);
      if (next == EOF || !accepted.contains(next)) {
        isDoctype = false;
        break;
      }
    }
    if (isDoctype) {
      currentToken = new DoctypeToken(correct: true);
      state = doctypeState;
      return true;
    }
  } else if (first == "[" &&
      parser != null &&
      parser.tree.openElements.isNotEmpty &&
      parser.tree.openElements.last.namespaceUri !=
          parser.tree.defaultNamespace) {
    const keyword = "CDATA[";
    var isCdata = true;
    for (var i = 0; i < keyword.length; i++) {
      consumed.add(stream.char());
      if (consumed.last != keyword[i]) {
        isCdata = false;
        break;
      }
    }
    if (isCdata) {
      state = cdataSectionState;
      return true;
    }
  }

  _addToken(new ParseErrorToken("expected-dashes-or-doctype"));

  // Push the characters back in reverse order of consumption.
  while (consumed.isNotEmpty) {
    stream.unget(consumed.removeLast());
  }
  state = bogusCommentState;
  return true;
}
| 1339 |  | 
/// Handles the first character after `<!--`.
///
/// "-" moves to [commentStartDashState]. ">" and EOF are parse errors that
/// emit the (empty) comment and return to [dataState]. U+0000 raises a
/// parse error and is stored as U+FFFD. Any other character is appended to
/// the comment. Both of the latter continue in [commentState].
bool commentStartState() {
  var data = stream.char();
  if (data == "-") {
    state = commentStartDashState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.add('\uFFFD');
    // The comment now has content, so continue in the regular comment
    // state. Staying here would make a following ">" end the comment as an
    // "incorrect-comment" instead of being part of its text.
    state = commentState;
  } else if (data == ">") {
    _addToken(new ParseErrorToken("incorrect-comment"));
    _addToken(currentToken);
    state = dataState;
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
  } else {
    currentStringToken.add(data);
    state = commentState;
  }
  return true;
}
| 1361 |  | 
/// Handles the character after `<!---` (comment opener plus one dash).
///
/// A second "-" moves to [commentEndState]. ">" and EOF are parse errors
/// that emit the comment and return to [dataState]. U+0000 raises a parse
/// error and is stored, after the pending dash, as U+FFFD. Any other
/// character is appended after the pending dash. Both of the latter
/// continue in [commentState].
bool commentStartDashState() {
  var data = stream.char();
  if (data == "-") {
    state = commentEndState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.add('-\uFFFD');
    // The pending dash has been flushed into the comment text, so continue
    // in the regular comment state. Staying here would treat the next "-"
    // as if that dash were still pending.
    state = commentState;
  } else if (data == ">") {
    _addToken(new ParseErrorToken("incorrect-comment"));
    _addToken(currentToken);
    state = dataState;
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
  } else {
    currentStringToken.add('-').add(data);
    state = commentState;
  }
  return true;
}
| 1383 |  | 
/// Accumulates the body of a comment.
///
/// "-" moves to [commentEndDashState], U+0000 raises a parse error and is
/// stored as U+FFFD, and EOF is a parse error that emits the comment and
/// returns to [dataState]. Other text is appended in bulk up to the next
/// "-" or U+0000.
bool commentState() {
  final ch = stream.char();
  if (ch == "-") {
    state = commentEndDashState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.add('\uFFFD');
    return true;
  }
  currentStringToken.add(ch).add(stream.charsUntil("-\u0000"));
  return true;
}
| 1400 |  | 
/// Handles the character after a single "-" inside a comment.
///
/// A second "-" moves to [commentEndState]; EOF is a parse error that
/// emits the comment and returns to [dataState]. Otherwise the pending
/// dash and the character (U+0000 as U+FFFD, with a parse error) are
/// appended and the tokenizer returns to [commentState].
bool commentEndDashState() {
  final ch = stream.char();
  if (ch == "-") {
    state = commentEndState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-end-dash"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.add('-\uFFFD');
  } else {
    currentStringToken.add('-').add(ch);
  }
  state = commentState;
  return true;
}
| 1419 |  | 
/// Handles the character after "--" inside a comment.
///
/// ">" emits the comment and returns to [dataState]. "!" is a parse error
/// that moves to [commentEndBangState]; an extra "-" is a parse error and
/// is appended without leaving this state; EOF is a parse error that emits
/// the comment. Any other character is a parse error: the two pending
/// dashes plus the character (U+0000 as U+FFFD) are appended and the
/// tokenizer returns to [commentState].
bool commentEndState() {
  final ch = stream.char();
  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == "!") {
    _addToken(
        new ParseErrorToken("unexpected-bang-after-double-dash-in-comment"));
    state = commentEndBangState;
    return true;
  }
  if (ch == "-") {
    _addToken(
        new ParseErrorToken("unexpected-dash-after-double-dash-in-comment"));
    currentStringToken.add(ch);
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-double-dash"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // The "--" did not close the comment after all; put it back in the text.
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.add('--\uFFFD');
  } else {
    _addToken(new ParseErrorToken("unexpected-char-in-comment"));
    currentStringToken.add('--').add(ch);
  }
  state = commentState;
  return true;
}
| 1449 |  | 
/// Handles the character after "--!" inside a comment.
///
/// ">" emits the comment and returns to [dataState]. "-" appends "--!" and
/// moves to [commentEndDashState]. EOF is a parse error that emits the
/// comment. Otherwise "--!" plus the character (U+0000 as U+FFFD, with a
/// parse error) are appended and the tokenizer returns to [commentState].
bool commentEndBangState() {
  final ch = stream.char();
  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == "-") {
    currentStringToken.add('--!');
    state = commentEndDashState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-end-bang-state"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.add('--!\uFFFD');
  } else {
    currentStringToken.add('--!').add(ch);
  }
  state = commentState;
  return true;
}
| 1472 |  | 
/// Handles the character right after the "doctype" keyword.
///
/// Whitespace moves to [beforeDoctypeNameState]. EOF is a parse error: the
/// doctype is flagged as not correct, emitted, and the tokenizer returns to
/// [dataState]. Any other character is a parse error and is pushed back
/// for [beforeDoctypeNameState].
bool doctypeState() {
  final ch = stream.char();
  if (ch == EOF) {
    _addToken(new ParseErrorToken("expected-doctype-name-but-got-eof"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (!isWhitespace(ch)) {
    _addToken(new ParseErrorToken("need-space-after-doctype"));
    stream.unget(ch);
  }
  state = beforeDoctypeNameState;
  return true;
}
| 1489 |  | 
/// Skips whitespace before the doctype name and starts the name.
///
/// `>` and EOF both report a parse error, clear the doctype token's
/// `correct` flag, emit the token and return to [dataState]. Otherwise the
/// character read becomes the first character of the name (U+0000 is
/// reported as "invalid-codepoint" and stored as U+FFFD) and the tokenizer
/// moves to [doctypeNameState].
///
/// Always returns `true`.
bool beforeDoctypeNameState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == ">" || ch == EOF) {
    final reason = ch == ">"
        ? "expected-doctype-name-but-got-right-bracket"
        : "expected-doctype-name-but-got-eof";
    _addToken(new ParseErrorToken(reason));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Everything else begins the name.
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.name = "\uFFFD";
  } else {
    currentDoctypeToken.name = ch;
  }
  state = doctypeNameState;
  return true;
}
| 1515 |  | 
/// Accumulates the doctype name one character at a time.
///
/// The name is passed through [asciiUpper2Lower] whenever it ends: on
/// whitespace (continuing in [afterDoctypeNameState]), on `>` (token
/// emitted) and on EOF (reported as "eof-in-doctype-name", `correct`
/// cleared, token emitted). U+0000 is reported as "invalid-codepoint" and
/// appended as U+FFFD; any other character is appended unchanged.
///
/// Always returns `true`.
bool doctypeNameState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
    state = afterDoctypeNameState;
    return true;
  }

  if (ch == ">") {
    currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.name = "${currentDoctypeToken.name}\uFFFD";
    state = doctypeNameState;
    return true;
  }

  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype-name"));
    currentDoctypeToken.correct = false;
    currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Plain name character; stay in this state.
  currentDoctypeToken.name = '${currentDoctypeToken.name}$ch';
  return true;
}
| 1540 |  | 
/// Handles the input that follows the doctype name.
///
/// Whitespace is skipped and `>` emits the doctype token. EOF clears
/// `correct`, is pushed back onto the stream, reports "eof-in-doctype" and
/// emits the token. A case-insensitive `PUBLIC` or `SYSTEM` keyword moves to
/// [afterDoctypePublicKeywordState] or [afterDoctypeSystemKeywordState].
/// Anything else is reported as
/// "expected-space-or-right-bracket-in-doctype", clears `correct` and
/// continues in [bogusDoctypeState].
///
/// Always returns `true`.
bool afterDoctypeNameState() {
  var ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == EOF) {
    currentDoctypeToken.correct = false;
    stream.unget(ch);
    _addToken(new ParseErrorToken("eof-in-doctype"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Reads one character per entry of [alternatives], always leaving the
  // most recently read character in [ch]. Gives up at the first character
  // that is EOF or is not one of the letters in its entry.
  // TODO(jmesserly): would be nice to have a helper for this.
  bool consumeKeywordTail(List<String> alternatives) {
    for (var choices in alternatives) {
      ch = stream.char();
      if (ch == EOF || !choices.contains(ch)) {
        return false;
      }
    }
    return true;
  }

  if (ch == "p" || ch == "P") {
    if (consumeKeywordTail(const ["uU", "bB", "lL", "iI", "cC"])) {
      state = afterDoctypePublicKeywordState;
      return true;
    }
  } else if (ch == "s" || ch == "S") {
    if (consumeKeywordTail(const ["yY", "sS", "tT", "eE", "mM"])) {
      state = afterDoctypeSystemKeywordState;
      return true;
    }
  }

  // Every character consumed before the current one was a letter, so it is
  // just garbage inside the bogus doctype and can be dropped. Only the most
  // recent character might be '>' or EOF, so only it has to be pushed back.
  stream.unget(ch);
  _addToken(new ParseErrorToken(
      "expected-space-or-right-bracket-in-doctype",
      messageParams: {"data": ch}));
  currentDoctypeToken.correct = false;
  state = bogusDoctypeState;
  return true;
}
| 1597 |  | 
/// Handles the character right after the doctype `PUBLIC` keyword.
///
/// Whitespace moves to [beforeDoctypePublicIdentifierState]. EOF reports
/// "eof-in-doctype", clears `correct`, emits the token and returns to
/// [dataState]. Any other character is pushed back for
/// [beforeDoctypePublicIdentifierState] to read; a quote character is
/// additionally reported as "unexpected-char-in-doctype" first.
///
/// Always returns `true`.
bool afterDoctypePublicKeywordState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = beforeDoctypePublicIdentifierState;
    return true;
  }

  final isQuote = ch == "'" || ch == '"';
  if (!isQuote && ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // A quote directly after the keyword is an error, but like every other
  // remaining character it is re-read by the next state.
  if (isQuote) {
    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  }
  stream.unget(ch);
  state = beforeDoctypePublicIdentifierState;
  return true;
}
| 1617 |  | 
/// Skips whitespace and waits for the quote that opens the public
/// identifier.
///
/// A quote resets `publicId` to the empty string and enters the matching
/// quoted state. Every other non-whitespace input reports a parse error and
/// clears `correct`: `>` and EOF then emit the token and return to
/// [dataState], anything else continues in [bogusDoctypeState].
///
/// Always returns `true`.
bool beforeDoctypePublicIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == "\"") {
    currentDoctypeToken.publicId = "";
    state = doctypePublicIdentifierDoubleQuotedState;
    return true;
  }
  if (ch == "'") {
    currentDoctypeToken.publicId = "";
    state = doctypePublicIdentifierSingleQuotedState;
    return true;
  }

  // Only error cases remain; they differ in the message and in whether the
  // token is emitted now.
  String message;
  if (ch == ">") {
    message = "unexpected-end-of-doctype";
  } else if (ch == EOF) {
    message = "eof-in-doctype";
  } else {
    message = "unexpected-char-in-doctype";
  }
  final endsDoctype = ch == ">" || ch == EOF;

  _addToken(new ParseErrorToken(message));
  currentDoctypeToken.correct = false;
  if (endsDoctype) {
    _addToken(currentToken);
    state = dataState;
  } else {
    state = bogusDoctypeState;
  }
  return true;
}
| 1645 |  | 
/// Accumulates a double-quoted doctype public identifier.
///
/// The closing `"` moves to [afterDoctypePublicIdentifierState]. `>` and EOF
/// report a parse error, clear `correct`, emit the token and return to
/// [dataState]. U+0000 is reported as "invalid-codepoint" and appended as
/// U+FFFD; any other character is appended to `publicId` unchanged.
///
/// Always returns `true`.
bool doctypePublicIdentifierDoubleQuotedState() {
  final ch = stream.char();

  if (ch == '"') {
    state = afterDoctypePublicIdentifierState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
    return true;
  }

  if (ch == ">" || ch == EOF) {
    // The identifier was never closed.
    final message = ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype";
    _addToken(new ParseErrorToken(message));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$ch';
  return true;
}
| 1668 |  | 
/// Accumulates a single-quoted doctype public identifier.
///
/// The closing `'` moves to [afterDoctypePublicIdentifierState]. `>` and EOF
/// report a parse error, clear `correct`, emit the token and return to
/// [dataState]. U+0000 is reported as "invalid-codepoint" and appended as
/// U+FFFD; any other character is appended to `publicId` unchanged.
///
/// Always returns `true`.
bool doctypePublicIdentifierSingleQuotedState() {
  final ch = stream.char();

  if (ch == "'") {
    state = afterDoctypePublicIdentifierState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
    return true;
  }

  if (ch == ">" || ch == EOF) {
    // The identifier was never closed.
    final message = ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype";
    _addToken(new ParseErrorToken(message));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$ch';
  return true;
}
| 1691 |  | 
/// Handles the character right after the closing quote of the public
/// identifier.
///
/// Whitespace moves to [betweenDoctypePublicAndSystemIdentifiersState] and
/// `>` emits the token. A quote is reported as "unexpected-char-in-doctype"
/// but still opens the system identifier (`systemId` reset to the empty
/// string). EOF and any other character report a parse error and clear
/// `correct`; EOF emits the token, other characters continue in
/// [bogusDoctypeState].
///
/// Always returns `true`.
bool afterDoctypePublicIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = betweenDoctypePublicAndSystemIdentifiersState;
    return true;
  }

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == '"' || ch == "'") {
    // The separating whitespace is missing, but the identifier is still read.
    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
    currentDoctypeToken.systemId = "";
    state = ch == '"'
        ? doctypeSystemIdentifierDoubleQuotedState
        : doctypeSystemIdentifierSingleQuotedState;
    return true;
  }

  final atEof = ch == EOF;
  _addToken(new ParseErrorToken(
      atEof ? "eof-in-doctype" : "unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  if (atEof) {
    _addToken(currentToken);
    state = dataState;
  } else {
    state = bogusDoctypeState;
  }
  return true;
}
| 1719 |  | 
/// Skips whitespace between the public and system identifiers.
///
/// `>` emits the token. A quote resets `systemId` to the empty string and
/// enters the matching quoted system-identifier state. EOF and any other
/// character report a parse error and clear `correct`; EOF emits the token,
/// other characters continue in [bogusDoctypeState].
///
/// Always returns `true`.
bool betweenDoctypePublicAndSystemIdentifiersState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == '"' || ch == "'") {
    currentDoctypeToken.systemId = "";
    state = ch == '"'
        ? doctypeSystemIdentifierDoubleQuotedState
        : doctypeSystemIdentifierSingleQuotedState;
    return true;
  }

  final atEof = ch == EOF;
  _addToken(new ParseErrorToken(
      atEof ? "eof-in-doctype" : "unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  if (atEof) {
    _addToken(currentToken);
    state = dataState;
  } else {
    state = bogusDoctypeState;
  }
  return true;
}
| 1745 |  | 
/// Handles the character right after the doctype `SYSTEM` keyword.
///
/// Whitespace moves to [beforeDoctypeSystemIdentifierState]. EOF reports
/// "eof-in-doctype", clears `correct`, emits the token and returns to
/// [dataState]. Any other character is pushed back for
/// [beforeDoctypeSystemIdentifierState] to read; a quote character is
/// additionally reported as "unexpected-char-in-doctype" first.
///
/// Always returns `true`.
bool afterDoctypeSystemKeywordState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = beforeDoctypeSystemIdentifierState;
    return true;
  }

  final isQuote = ch == "'" || ch == '"';
  if (!isQuote && ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // A quote directly after the keyword is an error, but like every other
  // remaining character it is re-read by the next state.
  if (isQuote) {
    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  }
  stream.unget(ch);
  state = beforeDoctypeSystemIdentifierState;
  return true;
}
| 1765 |  | 
/// Skips whitespace and waits for the quote that opens the system
/// identifier.
///
/// A quote resets `systemId` to the empty string and enters the matching
/// quoted state. Every other non-whitespace input reports a parse error and
/// clears `correct`: `>` and EOF then emit the token and return to
/// [dataState], anything else continues in [bogusDoctypeState].
///
/// Always returns `true`.
bool beforeDoctypeSystemIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == "\"") {
    currentDoctypeToken.systemId = "";
    state = doctypeSystemIdentifierDoubleQuotedState;
    return true;
  }
  if (ch == "'") {
    currentDoctypeToken.systemId = "";
    state = doctypeSystemIdentifierSingleQuotedState;
    return true;
  }

  // Only error cases remain. Note that '>' shares its message with the
  // generic unexpected-character case here.
  final atEof = ch != ">" && ch == EOF;
  final endsDoctype = ch == ">" || atEof;

  _addToken(new ParseErrorToken(
      atEof ? "eof-in-doctype" : "unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  if (endsDoctype) {
    _addToken(currentToken);
    state = dataState;
  } else {
    state = bogusDoctypeState;
  }
  return true;
}
| 1793 |  | 
/// Accumulates a double-quoted doctype system identifier.
///
/// The closing `"` moves to [afterDoctypeSystemIdentifierState]. `>` and EOF
/// report a parse error, clear `correct`, emit the token and return to
/// [dataState]. U+0000 is reported as "invalid-codepoint" and appended as
/// U+FFFD; any other character is appended to `systemId` unchanged.
///
/// Always returns `true`.
bool doctypeSystemIdentifierDoubleQuotedState() {
  final ch = stream.char();

  if (ch == "\"") {
    state = afterDoctypeSystemIdentifierState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
    return true;
  }

  if (ch == ">" || ch == EOF) {
    // The identifier was never closed.
    final message = ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype";
    _addToken(new ParseErrorToken(message));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$ch';
  return true;
}
| 1816 |  | 
/// Accumulates a single-quoted doctype system identifier.
///
/// The closing `'` moves to [afterDoctypeSystemIdentifierState]. `>` and EOF
/// report a parse error, clear `correct`, emit the token and return to
/// [dataState]. U+0000 is reported as "invalid-codepoint" and appended as
/// U+FFFD; any other character is appended to `systemId` unchanged.
///
/// Always returns `true`.
bool doctypeSystemIdentifierSingleQuotedState() {
  final ch = stream.char();

  if (ch == "'") {
    state = afterDoctypeSystemIdentifierState;
    return true;
  }

  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
    return true;
  }

  if (ch == ">" || ch == EOF) {
    // The identifier was never closed.
    final message = ch == ">" ? "unexpected-end-of-doctype" : "eof-in-doctype";
    _addToken(new ParseErrorToken(message));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$ch';
  return true;
}
| 1839 |  | 
/// Handles input after the closing quote of the system identifier.
///
/// Whitespace is skipped and `>` emits the token. EOF reports
/// "eof-in-doctype", clears `correct` and emits the token. Any other
/// character reports "unexpected-char-in-doctype" and continues in
/// [bogusDoctypeState]; `correct` is left untouched in that case.
///
/// Always returns `true`.
bool afterDoctypeSystemIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Trailing junk: reported, then skipped by the bogus doctype state.
  _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  state = bogusDoctypeState;
  return true;
}
| 1858 |  | 
/// Discards characters of a malformed doctype until `>` or EOF.
///
/// Both terminators emit the current token and return to [dataState]; EOF
/// is pushed back onto the stream first. All other characters are dropped.
///
/// Always returns `true`.
bool bogusDoctypeState() {
  final ch = stream.char();

  // Anything that does not end the doctype is simply skipped.
  if (ch != ">" && ch != EOF) return true;

  // XXX EMIT
  if (ch != ">") {
    // Only EOF reaches this point; put it back on the stream.
    stream.unget(ch);
  }
  _addToken(currentToken);
  state = dataState;
  return true;
}
| 1872 |  | 
/// Reads the contents of a CDATA section up to the closing `]]>` or EOF.
///
/// Each U+0000 is reported as an "invalid-codepoint" parse error and
/// replaced by U+FFFD. The text read before the terminator is emitted as a
/// single [CharactersToken] (nothing is emitted when it is empty), and the
/// tokenizer returns to [dataState].
///
/// Always returns `true`.
bool cdataSectionState() {
  var data = <String>[];
  // Number of consecutive "]" characters at the end of [data], capped at 2.
  var matchedEnd = 0;
  while (true) {
    var ch = stream.char();
    if (ch == EOF) {
      break;
    }
    // Deal with null here rather than in the parser
    if (ch == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      ch = "\uFFFD";
    }
    data.add(ch);
    // TODO(jmesserly): it'd be nice if we had an easier way to match the end,
    // perhaps with a "peek" API.
    if (ch == "]") {
      // Keep the count at 2 rather than resetting it when a third (or later)
      // "]" arrives: in "]]]>" the last two brackets still form the
      // terminator and the first one is ordinary text. Resetting here made
      // such input run on until EOF.
      if (matchedEnd < 2) {
        matchedEnd++;
      }
    } else if (ch == ">" && matchedEnd == 2) {
      // Remove "]]>" from the end.
      data.removeRange(data.length - 3, data.length);
      break;
    } else {
      matchedEnd = 0;
    }
  }

  if (data.isNotEmpty) {
    _addToken(new CharactersToken(data.join()));
  }
  state = dataState;
  return true;
}
| 1908 } |  | 
| OLD | NEW | 
|---|