Chromium Code Reviews

Side by Side Diff: pkg/third_party/html5lib/lib/src/tokenizer.dart

Issue 22375011: move html5lib code into dart svn repo (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: change location of html5lib to pkg/third_party/html5lib Created 7 years, 4 months ago
1 library tokenizer;
2
3 import 'dart:collection';
4 import 'dart:math';
5 import 'package:html5lib/parser.dart' show HtmlParser;
6 import 'package:source_maps/span.dart' show Span, FileSpan;
7 import 'constants.dart';
8 import 'inputstream.dart';
9 import 'token.dart';
10 import 'utils.dart';
11
12 // Group entities by their first character, for faster lookups
13
14 // TODO(jmesserly): we could use a better data structure here like a trie, if
15 // we had it implemented in Dart.
16 Map<String, List<String>> entitiesByFirstChar = (() {
17 var result = {};
18 for (var k in entities.keys) {
19 result.putIfAbsent(k[0], () => []).add(k);
20 }
21 return result;
22 })();
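// For example, resolving "&amp;" only needs to scan names that begin with
// "a". A rough sketch of the narrowing done in consumeEntity below, assuming
// the standard named entity table from constants.dart:
//
//     var candidates = entitiesByFirstChar['a'];     // "amp", "amp;", ...
//     candidates = candidates
//         .where((name) => name.startsWith('amp;'))  // prefix grows per char
//         .toList();                                 // -> ['amp;']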
23
24 // TODO(jmesserly): lots of ways to make this faster:
25 // - use char codes everywhere instead of 1-char strings
26 // - use switch instead of contains, indexOf
27 // - use switch instead of the sequential if tests
28 // - avoid string concat
29
30 /**
31 * This class takes care of tokenizing HTML.
32 */
33 class HtmlTokenizer implements Iterator<Token> {
34 // TODO(jmesserly): a lot of these could be made private
35
36 final HtmlInputStream stream;
37
38 final bool lowercaseElementName;
39
40 final bool lowercaseAttrName;
41
42 /** True to generate spans for [Token.span]. */
43 final bool generateSpans;
44
45 /** True to generate spans for attributes. */
46 final bool attributeSpans;
47
48 /**
49 * This reference to the parser is used for correct CDATA handling.
50 * The [HtmlParser] will set this at construction time.
51 */
52 HtmlParser parser;
53
54 final Queue<Token> tokenQueue;
55
56 /** Holds the token that is currently being processed. */
57 Token currentToken;
58
59 /**
60 * Holds a reference to the method to be invoked for the next parser state.
61 */
62 Predicate state;
63
64 String temporaryBuffer;
65
66 int _lastOffset;
67
68 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add
69 // an item until it's ready. But the code doesn't have a clear notion of when
70 // it's "done" with the attribute.
71 List<TagAttribute> _attributes;
72 Set<String> _attributeNames;
73
74 HtmlTokenizer(doc, {String encoding, bool parseMeta: true,
75 this.lowercaseElementName: true, this.lowercaseAttrName: true,
76 bool generateSpans: false, String sourceUrl, this.attributeSpans: false})
77 : stream = new HtmlInputStream(
78 doc, encoding, parseMeta, generateSpans, sourceUrl),
79 tokenQueue = new Queue(),
80 generateSpans = generateSpans {
81 reset();
82 }
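  // A minimal usage sketch (the input string and variable names below are
  // only illustrative): the tokenizer implements [Iterator], so callers
  // construct it and then drive it with [moveNext] and [current].
  //
  //     var tokenizer = new HtmlTokenizer('<p class="greeting">hi</p>',
  //         generateSpans: true);
  //     while (tokenizer.moveNext()) {
  //       var token = tokenizer.current;
  //       // token is a StartTagToken, CharactersToken, EndTagToken, ...
  //     }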
83
84 TagToken get currentTagToken => currentToken;
85 DoctypeToken get currentDoctypeToken => currentToken;
86 StringToken get currentStringToken => currentToken;
87
88 Token _current;
89 Token get current => _current;
90
91 String get _attributeName => _attributes.last.name;
92 set _attributeName(String value) {
93 _attributes.last.name = value;
94 }
95
96 String get _attributeValue => _attributes.last.value;
97 set _attributeValue(String value) {
98 _attributes.last.value = value;
99 }
100
101 void _markAttributeEnd(int offset) {
102 if (attributeSpans) _attributes.last.end = stream.position + offset;
103 }
104
105 void _markAttributeValueStart(int offset) {
106 if (attributeSpans) _attributes.last.startValue = stream.position + offset;
107 }
108
109 void _markAttributeValueEnd(int offset) {
110 if (attributeSpans) {
111 _attributes.last.endValue = stream.position + offset;
112 _markAttributeEnd(offset);
113 }
114 }
115
116 // Note: we could track the name span here, if we need it.
117 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset);
118
119 void _addAttribute(String name) {
120 if (_attributes == null) _attributes = [];
121 var attr = new TagAttribute(name);
122 _attributes.add(attr);
123 if (attributeSpans) attr.start = stream.position - name.length;
124 }
125
126 /**
127 * This is where the magic happens.
128 *
129 * We do our usual processing through the states, and when we have a token
130 * to return we yield it, which pauses processing until the next token is
131 * requested.
132 */
133 bool moveNext() {
134 // Start processing. When EOF is reached, state will return false instead
135 // of true and the loop will terminate.
136 while (stream.errors.length == 0 && tokenQueue.length == 0) {
137 if (!state()) {
138 _current = null;
139 return false;
140 }
141 }
142 if (stream.errors.length > 0) {
143 _current = new ParseErrorToken(stream.errors.removeFirst());
144 } else {
145 assert (tokenQueue.length > 0);
146 _current = tokenQueue.removeFirst();
147 }
148 return true;
149 }
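  // Note the priority above: errors reported by [stream] are surfaced first;
  // parse errors detected by the tokenizer itself arrive through the queue as
  // [ParseErrorToken]s. Roughly, for a malformed fragment (illustrative
  // input):
  //
  //     var t = new HtmlTokenizer('</>');
  //     t.moveNext();
  //     // t.current is a ParseErrorToken
  //     // ("expected-closing-tag-but-got-right-bracket").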
150
151 /**
152 * Resets the tokenizer state. Calling this does not reset the [stream] or
153 * the [parser].
154 */
155 void reset() {
156 _lastOffset = 0;
157 tokenQueue.clear();
158 currentToken = null;
159 temporaryBuffer = null;
160 _attributes = null;
161 _attributeNames = null;
162 state = dataState;
163 }
164
165 /** Adds a token to the queue. Sets the span if needed. */
166 void _addToken(Token token) {
167 if (generateSpans && token.span == null) {
168 int offset = stream.position;
169 token.span = new FileSpan(stream.fileInfo, _lastOffset, offset);
170 if (token is! ParseErrorToken) {
171 _lastOffset = offset;
172 }
173 }
174 tokenQueue.add(token);
175 }
176
177 /**
178 * This function returns either U+FFFD or the character corresponding to
179 * the decimal or hexadecimal representation. It also discards ";" if
180 * present; if it is not, it adds a [ParseErrorToken].
181 */
182 String consumeNumberEntity(bool isHex) {
183 var allowed = isDigit;
184 var radix = 10;
185 if (isHex) {
186 allowed = isHexDigit;
187 radix = 16;
188 }
189
190 var charStack = [];
191
192 // Consume all the characters that are in range while making sure we
193 // don't hit an EOF.
194 var c = stream.char();
195 while (allowed(c) && c != EOF) {
196 charStack.add(c);
197 c = stream.char();
198 }
199
200 // Convert the set of characters consumed to an int.
201 var charAsInt = parseIntRadix(charStack.join(), radix);
202
203 // Certain characters get replaced with others
204 var char = replacementCharacters[charAsInt];
205 if (char != null) {
206 _addToken(new ParseErrorToken(
207 "illegal-codepoint-for-numeric-entity",
208 messageParams: {"charAsInt": charAsInt}));
209 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF)
210 || (charAsInt > 0x10FFFF)) {
211 char = "\uFFFD";
212 _addToken(new ParseErrorToken(
213 "illegal-codepoint-for-numeric-entity",
214 messageParams: {"charAsInt": charAsInt}));
215 } else {
216 // Should speed up this check somehow (e.g. move the set to a constant)
217 if ((0x0001 <= charAsInt && charAsInt <= 0x0008) ||
218 (0x000E <= charAsInt && charAsInt <= 0x001F) ||
219 (0x007F <= charAsInt && charAsInt <= 0x009F) ||
220 (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) ||
221 const [0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
222 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
223 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
224 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
225 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
226 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
227 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
228 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
229 0xFFFFF, 0x10FFFE, 0x10FFFF].contains(charAsInt)) {
230 _addToken(new ParseErrorToken(
231 "illegal-codepoint-for-numeric-entity",
232 messageParams: {"charAsInt": charAsInt}));
233 }
234 char = new String.fromCharCodes([charAsInt]);
235 }
236
237 // Discard the ; if present. Otherwise, put the character back on the
238 // stream and add a ParseErrorToken.
239 if (c != ";") {
240 _addToken(new ParseErrorToken(
241 "numeric-entity-without-semicolon"));
242 stream.unget(c);
243 }
244 return char;
245 }
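  // A few sketched outcomes (the exact substitutions come from
  // [replacementCharacters] in constants.dart; the inputs are illustrative):
  //
  //     // "&#65;"  -> consumeNumberEntity(false) returns "A"
  //     // "&#x48;" -> consumeNumberEntity(true)  returns "H"
  //     // "&#65"   -> returns "A" and also queues a
  //     //             "numeric-entity-without-semicolon" ParseErrorToken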
246
247 void consumeEntity({String allowedChar, bool fromAttribute: false}) {
248 // Initialise to the default output for when no entity is matched
249 var output = "&";
250
251 var charStack = [stream.char()];
252 if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&'
253 || charStack[0] == EOF || allowedChar == charStack[0]) {
254 stream.unget(charStack[0]);
255 } else if (charStack[0] == "#") {
256 // Read the next character to see if it's hex or decimal
257 bool hex = false;
258 charStack.add(stream.char());
259 if (charStack.last == 'x' || charStack.last == 'X') {
260 hex = true;
261 charStack.add(stream.char());
262 }
263
264 // charStack.last should be the first digit
265 if (hex && isHexDigit(charStack.last) ||
266 (!hex && isDigit(charStack.last))) {
267 // At least one digit found, so consume the whole number
268 stream.unget(charStack.last);
269 output = consumeNumberEntity(hex);
270 } else {
271 // No digits found
272 _addToken(new ParseErrorToken("expected-numeric-entity"));
273 stream.unget(charStack.removeLast());
274 output = "&${charStack.join()}";
275 }
276 } else {
277 // At this point in the process we might have a named entity. Entities
278 // are stored in the global variable "entities".
279 //
280 // Consume characters and compare them to a substring of the
281 // entity names in the list until the substring no longer matches.
282 var filteredEntityList = entitiesByFirstChar[charStack[0]];
283 if (filteredEntityList == null) filteredEntityList = const [];
284
285 while (charStack.last != EOF) {
286 var name = charStack.join();
287 filteredEntityList = filteredEntityList.where(
288 (e) => e.startsWith(name)).toList();
289
290 if (filteredEntityList.length == 0) {
291 break;
292 }
293 charStack.add(stream.char());
294 }
295
296 // At this point we have a string that starts with some characters
297 // that may match an entity
298 String entityName = null;
299
300 // Try to find the longest entity the string will match to take care
301 // of &noti for instance.
302
303 int entityLen;
304 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) {
305 var possibleEntityName = charStack.sublist(0, entityLen).join();
306 if (entities.containsKey(possibleEntityName)) {
307 entityName = possibleEntityName;
308 break;
309 }
310 }
311
312 if (entityName != null) {
313 var lastChar = entityName[entityName.length - 1];
314 if (lastChar != ";") {
315 _addToken(new ParseErrorToken(
316 "named-entity-without-semicolon"));
317 }
318 if (lastChar != ";" && fromAttribute &&
319 (isLetterOrDigit(charStack[entityLen]) ||
320 charStack[entityLen] == '=')) {
321 stream.unget(charStack.removeLast());
322 output = "&${charStack.join()}";
323 } else {
324 output = entities[entityName];
325 stream.unget(charStack.removeLast());
326 output = '${output}${slice(charStack, entityLen).join()}';
327 }
328 } else {
329 _addToken(new ParseErrorToken("expected-named-entity"));
330 stream.unget(charStack.removeLast());
331 output = "&${charStack.join()}";
332 }
333 }
334 if (fromAttribute) {
335 _attributeValue = '$_attributeValue$output';
336 } else {
337 var token;
338 if (isWhitespace(output)) {
339 token = new SpaceCharactersToken(output);
340 } else {
341 token = new CharactersToken(output);
342 }
343 _addToken(token);
344 }
345 }
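  // Longest-match resolution, sketched with the standard entity table (which
  // contains both "not;" and "notin;", plus a legacy "not" without a
  // semicolon):
  //
  //     // "&amp;"   -> produces "&"
  //     // "&notin;" -> produces "\u2209" (the full "notin;" match wins)
  //     // "&noti"   -> produces "\u00AC" + "i" (longest match is "not") and
  //     //              queues "named-entity-without-semicolon"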
346
347 /** This method replaces the need for "entityInAttributeValueState". */
348 void processEntityInAttribute(String allowedChar) {
349 consumeEntity(allowedChar: allowedChar, fromAttribute: true);
350 }
351
352 /**
353 * This method is a generic handler for emitting the tags. It also sets
354 * the state to "data" because that's what's needed after a token has been
355 * emitted.
356 */
357 void emitCurrentToken() {
358 var token = currentToken;
359 // Add token to the queue to be yielded
360 if (token is TagToken) {
361 if (lowercaseElementName) {
362 token.name = asciiUpper2Lower(token.name);
363 }
364 if (token is EndTagToken) {
365 if (_attributes != null) {
366 _addToken(new ParseErrorToken("attributes-in-end-tag"));
367 }
368 if (token.selfClosing) {
369 _addToken(new ParseErrorToken("this-closing-flag-on-end-tag"));
370 }
371 } else if (token is StartTagToken) {
372 // HTML5 specific normalizations to the token stream.
373 // Convert the list into a map where first key wins.
374 token.data = new LinkedHashMap<Object, String>();
375 if (_attributes != null) {
376 for (var attr in _attributes) {
377 token.data.putIfAbsent(attr.name, () => attr.value);
378 }
379 if (attributeSpans) token.attributeSpans = _attributes;
380 }
381 }
382 _attributes = null;
383 _attributeNames = null;
384 }
385 _addToken(token);
386 state = dataState;
387 }
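  // "First key wins" above means a duplicate attribute keeps its first value.
  // Sketched for an illustrative start tag:
  //
  //     // <a href="one" href="two">
  //     //   -> ParseErrorToken("duplicate-attribute")  (from attributeNameState)
  //     //   -> StartTagToken with data == {"href": "one"}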
388
389 // Below are the various tokenizer states worked out.
390
391 bool dataState() {
392 var data = stream.char();
393 if (data == "&") {
394 state = entityDataState;
395 } else if (data == "<") {
396 state = tagOpenState;
397 } else if (data == "\u0000") {
398 _addToken(new ParseErrorToken("invalid-codepoint"));
399 _addToken(new CharactersToken("\u0000"));
400 } else if (data == EOF) {
401 // Tokenization ends.
402 return false;
403 } else if (isWhitespace(data)) {
404 // Directly after emitting a token you switch back to the "data
405 // state". At that point spaceCharacters are important so they are
406 // emitted separately.
407 _addToken(new SpaceCharactersToken(
408 '${data}${stream.charsUntil(spaceCharacters, true)}'));
409 // No need to update lastFourChars here, since the first space will
410 // have already been appended to lastFourChars and will have broken
411 // any <!-- or --> sequences
412 } else {
413 var chars = stream.charsUntil("&<\u0000");
414 _addToken(new CharactersToken('${data}${chars}'));
415 }
416 return true;
417 }
418
419 bool entityDataState() {
420 consumeEntity();
421 state = dataState;
422 return true;
423 }
424
425 bool rcdataState() {
426 var data = stream.char();
427 if (data == "&") {
428 state = characterReferenceInRcdata;
429 } else if (data == "<") {
430 state = rcdataLessThanSignState;
431 } else if (data == EOF) {
432 // Tokenization ends.
433 return false;
434 } else if (data == "\u0000") {
435 _addToken(new ParseErrorToken("invalid-codepoint"));
436 _addToken(new CharactersToken("\uFFFD"));
437 } else if (isWhitespace(data)) {
438 // Directly after emitting a token you switch back to the "data
439 // state". At that point spaceCharacters are important so they are
440 // emitted separately.
441 _addToken(new SpaceCharactersToken(
442 '${data}${stream.charsUntil(spaceCharacters, true)}'));
443 } else {
444 var chars = stream.charsUntil("&<");
445 _addToken(new CharactersToken('${data}${chars}'));
446 }
447 return true;
448 }
449
450 bool characterReferenceInRcdata() {
451 consumeEntity();
452 state = rcdataState;
453 return true;
454 }
455
456 bool rawtextState() {
457 var data = stream.char();
458 if (data == "<") {
459 state = rawtextLessThanSignState;
460 } else if (data == "\u0000") {
461 _addToken(new ParseErrorToken("invalid-codepoint"));
462 _addToken(new CharactersToken("\uFFFD"));
463 } else if (data == EOF) {
464 // Tokenization ends.
465 return false;
466 } else {
467 var chars = stream.charsUntil("<\u0000");
468 _addToken(new CharactersToken("${data}${chars}"));
469 }
470 return true;
471 }
472
473 bool scriptDataState() {
474 var data = stream.char();
475 if (data == "<") {
476 state = scriptDataLessThanSignState;
477 } else if (data == "\u0000") {
478 _addToken(new ParseErrorToken("invalid-codepoint"));
479 _addToken(new CharactersToken("\uFFFD"));
480 } else if (data == EOF) {
481 // Tokenization ends.
482 return false;
483 } else {
484 var chars = stream.charsUntil("<\u0000");
485 _addToken(new CharactersToken("${data}${chars}"));
486 }
487 return true;
488 }
489
490 bool plaintextState() {
491 var data = stream.char();
492 if (data == EOF) {
493 // Tokenization ends.
494 return false;
495 } else if (data == "\u0000") {
496 _addToken(new ParseErrorToken("invalid-codepoint"));
497 _addToken(new CharactersToken("\uFFFD"));
498 } else {
499 _addToken(new CharactersToken(
500 '${data}${stream.charsUntil("\u0000")}'));
501 }
502 return true;
503 }
504
505 bool tagOpenState() {
506 var data = stream.char();
507 if (data == "!") {
508 state = markupDeclarationOpenState;
509 } else if (data == "/") {
510 state = closeTagOpenState;
511 } else if (isLetter(data)) {
512 currentToken = new StartTagToken(data);
513 state = tagNameState;
514 } else if (data == ">") {
515 // XXX In theory it could be something besides a tag name. But
516 // do we really care?
517 _addToken(new ParseErrorToken(
518 "expected-tag-name-but-got-right-bracket"));
519 _addToken(new CharactersToken("<>"));
520 state = dataState;
521 } else if (data == "?") {
522 // XXX In theory it could be something besides a tag name. But
523 // do we really care?
524 _addToken(new ParseErrorToken(
525 "expected-tag-name-but-got-question-mark"));
526 stream.unget(data);
527 state = bogusCommentState;
528 } else {
529 // XXX
530 _addToken(new ParseErrorToken("expected-tag-name"));
531 _addToken(new CharactersToken("<"));
532 stream.unget(data);
533 state = dataState;
534 }
535 return true;
536 }
537
538 bool closeTagOpenState() {
539 var data = stream.char();
540 if (isLetter(data)) {
541 currentToken = new EndTagToken(data);
542 state = tagNameState;
543 } else if (data == ">") {
544 _addToken(new ParseErrorToken(
545 "expected-closing-tag-but-got-right-bracket"));
546 state = dataState;
547 } else if (data == EOF) {
548 _addToken(new ParseErrorToken(
549 "expected-closing-tag-but-got-eof"));
550 _addToken(new CharactersToken("</"));
551 state = dataState;
552 } else {
553 // XXX data can be _'_...
554 _addToken(new ParseErrorToken(
555 "expected-closing-tag-but-got-char", messageParams: {"data": data}));
556 stream.unget(data);
557 state = bogusCommentState;
558 }
559 return true;
560 }
561
562 bool tagNameState() {
563 var data = stream.char();
564 if (isWhitespace(data)) {
565 state = beforeAttributeNameState;
566 } else if (data == ">") {
567 emitCurrentToken();
568 } else if (data == EOF) {
569 _addToken(new ParseErrorToken("eof-in-tag-name"));
570 state = dataState;
571 } else if (data == "/") {
572 state = selfClosingStartTagState;
573 } else if (data == "\u0000") {
574 _addToken(new ParseErrorToken("invalid-codepoint"));
575 currentTagToken.name = '${currentTagToken.name}\uFFFD';
576 } else {
577 currentTagToken.name = '${currentTagToken.name}$data';
578 // (Don't use charsUntil here, because tag names are
579 // very short and it's faster to not do anything fancy)
580 }
581 return true;
582 }
583
584 bool rcdataLessThanSignState() {
585 var data = stream.char();
586 if (data == "/") {
587 temporaryBuffer = "";
588 state = rcdataEndTagOpenState;
589 } else {
590 _addToken(new CharactersToken("<"));
591 stream.unget(data);
592 state = rcdataState;
593 }
594 return true;
595 }
596
597 bool rcdataEndTagOpenState() {
598 var data = stream.char();
599 if (isLetter(data)) {
600 temporaryBuffer = '${temporaryBuffer}$data';
601 state = rcdataEndTagNameState;
602 } else {
603 _addToken(new CharactersToken("</"));
604 stream.unget(data);
605 state = rcdataState;
606 }
607 return true;
608 }
609
610 bool _tokenIsAppropriate() {
611 return currentToken is TagToken &&
612 currentTagToken.name.toLowerCase() == temporaryBuffer.toLowerCase();
613 }
614
615 bool rcdataEndTagNameState() {
616 var appropriate = _tokenIsAppropriate();
617 var data = stream.char();
618 if (isWhitespace(data) && appropriate) {
619 currentToken = new EndTagToken(temporaryBuffer);
620 state = beforeAttributeNameState;
621 } else if (data == "/" && appropriate) {
622 currentToken = new EndTagToken(temporaryBuffer);
623 state = selfClosingStartTagState;
624 } else if (data == ">" && appropriate) {
625 currentToken = new EndTagToken(temporaryBuffer);
626 emitCurrentToken();
627 state = dataState;
628 } else if (isLetter(data)) {
629 temporaryBuffer = '${temporaryBuffer}$data';
630 } else {
631 _addToken(new CharactersToken("</$temporaryBuffer"));
632 stream.unget(data);
633 state = rcdataState;
634 }
635 return true;
636 }
637
638 bool rawtextLessThanSignState() {
639 var data = stream.char();
640 if (data == "/") {
641 temporaryBuffer = "";
642 state = rawtextEndTagOpenState;
643 } else {
644 _addToken(new CharactersToken("<"));
645 stream.unget(data);
646 state = rawtextState;
647 }
648 return true;
649 }
650
651 bool rawtextEndTagOpenState() {
652 var data = stream.char();
653 if (isLetter(data)) {
654 temporaryBuffer = '${temporaryBuffer}$data';
655 state = rawtextEndTagNameState;
656 } else {
657 _addToken(new CharactersToken("</"));
658 stream.unget(data);
659 state = rawtextState;
660 }
661 return true;
662 }
663
664 bool rawtextEndTagNameState() {
665 var appropriate = _tokenIsAppropriate();
666 var data = stream.char();
667 if (isWhitespace(data) && appropriate) {
668 currentToken = new EndTagToken(temporaryBuffer);
669 state = beforeAttributeNameState;
670 } else if (data == "/" && appropriate) {
671 currentToken = new EndTagToken(temporaryBuffer);
672 state = selfClosingStartTagState;
673 } else if (data == ">" && appropriate) {
674 currentToken = new EndTagToken(temporaryBuffer);
675 emitCurrentToken();
676 state = dataState;
677 } else if (isLetter(data)) {
678 temporaryBuffer = '${temporaryBuffer}$data';
679 } else {
680 _addToken(new CharactersToken("</$temporaryBuffer"));
681 stream.unget(data);
682 state = rawtextState;
683 }
684 return true;
685 }
686
687 bool scriptDataLessThanSignState() {
688 var data = stream.char();
689 if (data == "/") {
690 temporaryBuffer = "";
691 state = scriptDataEndTagOpenState;
692 } else if (data == "!") {
693 _addToken(new CharactersToken("<!"));
694 state = scriptDataEscapeStartState;
695 } else {
696 _addToken(new CharactersToken("<"));
697 stream.unget(data);
698 state = scriptDataState;
699 }
700 return true;
701 }
702
703 bool scriptDataEndTagOpenState() {
704 var data = stream.char();
705 if (isLetter(data)) {
706 temporaryBuffer = '${temporaryBuffer}$data';
707 state = scriptDataEndTagNameState;
708 } else {
709 _addToken(new CharactersToken("</"));
710 stream.unget(data);
711 state = scriptDataState;
712 }
713 return true;
714 }
715
716 bool scriptDataEndTagNameState() {
717 var appropriate = _tokenIsAppropriate();
718 var data = stream.char();
719 if (isWhitespace(data) && appropriate) {
720 currentToken = new EndTagToken(temporaryBuffer);
721 state = beforeAttributeNameState;
722 } else if (data == "/" && appropriate) {
723 currentToken = new EndTagToken(temporaryBuffer);
724 state = selfClosingStartTagState;
725 } else if (data == ">" && appropriate) {
726 currentToken = new EndTagToken(temporaryBuffer);
727 emitCurrentToken();
728 state = dataState;
729 } else if (isLetter(data)) {
730 temporaryBuffer = '${temporaryBuffer}$data';
731 } else {
732 _addToken(new CharactersToken("</$temporaryBuffer"));
733 stream.unget(data);
734 state = scriptDataState;
735 }
736 return true;
737 }
738
739 bool scriptDataEscapeStartState() {
740 var data = stream.char();
741 if (data == "-") {
742 _addToken(new CharactersToken("-"));
743 state = scriptDataEscapeStartDashState;
744 } else {
745 stream.unget(data);
746 state = scriptDataState;
747 }
748 return true;
749 }
750
751 bool scriptDataEscapeStartDashState() {
752 var data = stream.char();
753 if (data == "-") {
754 _addToken(new CharactersToken("-"));
755 state = scriptDataEscapedDashDashState;
756 } else {
757 stream.unget(data);
758 state = scriptDataState;
759 }
760 return true;
761 }
762
763 bool scriptDataEscapedState() {
764 var data = stream.char();
765 if (data == "-") {
766 _addToken(new CharactersToken("-"));
767 state = scriptDataEscapedDashState;
768 } else if (data == "<") {
769 state = scriptDataEscapedLessThanSignState;
770 } else if (data == "\u0000") {
771 _addToken(new ParseErrorToken("invalid-codepoint"));
772 _addToken(new CharactersToken("\uFFFD"));
773 } else if (data == EOF) {
774 state = dataState;
775 } else {
776 var chars = stream.charsUntil("<-\u0000");
777 _addToken(new CharactersToken("${data}${chars}"));
778 }
779 return true;
780 }
781
782 bool scriptDataEscapedDashState() {
783 var data = stream.char();
784 if (data == "-") {
785 _addToken(new CharactersToken("-"));
786 state = scriptDataEscapedDashDashState;
787 } else if (data == "<") {
788 state = scriptDataEscapedLessThanSignState;
789 } else if (data == "\u0000") {
790 _addToken(new ParseErrorToken("invalid-codepoint"));
791 _addToken(new CharactersToken("\uFFFD"));
792 state = scriptDataEscapedState;
793 } else if (data == EOF) {
794 state = dataState;
795 } else {
796 _addToken(new CharactersToken(data));
797 state = scriptDataEscapedState;
798 }
799 return true;
800 }
801
802 bool scriptDataEscapedDashDashState() {
803 var data = stream.char();
804 if (data == "-") {
805 _addToken(new CharactersToken("-"));
806 } else if (data == "<") {
807 state = scriptDataEscapedLessThanSignState;
808 } else if (data == ">") {
809 _addToken(new CharactersToken(">"));
810 state = scriptDataState;
811 } else if (data == "\u0000") {
812 _addToken(new ParseErrorToken("invalid-codepoint"));
813 _addToken(new CharactersToken("\uFFFD"));
814 state = scriptDataEscapedState;
815 } else if (data == EOF) {
816 state = dataState;
817 } else {
818 _addToken(new CharactersToken(data));
819 state = scriptDataEscapedState;
820 }
821 return true;
822 }
823
824 bool scriptDataEscapedLessThanSignState() {
825 var data = stream.char();
826 if (data == "/") {
827 temporaryBuffer = "";
828 state = scriptDataEscapedEndTagOpenState;
829 } else if (isLetter(data)) {
830 _addToken(new CharactersToken("<$data"));
831 temporaryBuffer = data;
832 state = scriptDataDoubleEscapeStartState;
833 } else {
834 _addToken(new CharactersToken("<"));
835 stream.unget(data);
836 state = scriptDataEscapedState;
837 }
838 return true;
839 }
840
841 bool scriptDataEscapedEndTagOpenState() {
842 var data = stream.char();
843 if (isLetter(data)) {
844 temporaryBuffer = data;
845 state = scriptDataEscapedEndTagNameState;
846 } else {
847 _addToken(new CharactersToken("</"));
848 stream.unget(data);
849 state = scriptDataEscapedState;
850 }
851 return true;
852 }
853
854 bool scriptDataEscapedEndTagNameState() {
855 var appropriate = _tokenIsAppropriate();
856 var data = stream.char();
857 if (isWhitespace(data) && appropriate) {
858 currentToken = new EndTagToken(temporaryBuffer);
859 state = beforeAttributeNameState;
860 } else if (data == "/" && appropriate) {
861 currentToken = new EndTagToken(temporaryBuffer);
862 state = selfClosingStartTagState;
863 } else if (data == ">" && appropriate) {
864 currentToken = new EndTagToken(temporaryBuffer);
865 emitCurrentToken();
866 state = dataState;
867 } else if (isLetter(data)) {
868 temporaryBuffer = '${temporaryBuffer}$data';
869 } else {
870 _addToken(new CharactersToken("</$temporaryBuffer"));
871 stream.unget(data);
872 state = scriptDataEscapedState;
873 }
874 return true;
875 }
876
877 bool scriptDataDoubleEscapeStartState() {
878 var data = stream.char();
879 if (isWhitespace(data) || data == "/" || data == ">") {
880 _addToken(new CharactersToken(data));
881 if (temporaryBuffer.toLowerCase() == "script") {
882 state = scriptDataDoubleEscapedState;
883 } else {
884 state = scriptDataEscapedState;
885 }
886 } else if (isLetter(data)) {
887 _addToken(new CharactersToken(data));
888 temporaryBuffer = '${temporaryBuffer}$data';
889 } else {
890 stream.unget(data);
891 state = scriptDataEscapedState;
892 }
893 return true;
894 }
895
896 bool scriptDataDoubleEscapedState() {
897 var data = stream.char();
898 if (data == "-") {
899 _addToken(new CharactersToken("-"));
900 state = scriptDataDoubleEscapedDashState;
901 } else if (data == "<") {
902 _addToken(new CharactersToken("<"));
903 state = scriptDataDoubleEscapedLessThanSignState;
904 } else if (data == "\u0000") {
905 _addToken(new ParseErrorToken("invalid-codepoint"));
906 _addToken(new CharactersToken("\uFFFD"));
907 } else if (data == EOF) {
908 _addToken(new ParseErrorToken("eof-in-script-in-script"));
909 state = dataState;
910 } else {
911 _addToken(new CharactersToken(data));
912 }
913 return true;
914 }
915
916 bool scriptDataDoubleEscapedDashState() {
917 var data = stream.char();
918 if (data == "-") {
919 _addToken(new CharactersToken("-"));
920 state = scriptDataDoubleEscapedDashDashState;
921 } else if (data == "<") {
922 _addToken(new CharactersToken("<"));
923 state = scriptDataDoubleEscapedLessThanSignState;
924 } else if (data == "\u0000") {
925 _addToken(new ParseErrorToken("invalid-codepoint"));
926 _addToken(new CharactersToken("\uFFFD"));
927 state = scriptDataDoubleEscapedState;
928 } else if (data == EOF) {
929 _addToken(new ParseErrorToken("eof-in-script-in-script"));
930 state = dataState;
931 } else {
932 _addToken(new CharactersToken(data));
933 state = scriptDataDoubleEscapedState;
934 }
935 return true;
936 }
937
938 // TODO(jmesserly): report bug in original code
939 // (was "Dash" instead of "DashDash")
940 bool scriptDataDoubleEscapedDashDashState() {
941 var data = stream.char();
942 if (data == "-") {
943 _addToken(new CharactersToken("-"));
944 } else if (data == "<") {
945 _addToken(new CharactersToken("<"));
946 state = scriptDataDoubleEscapedLessThanSignState;
947 } else if (data == ">") {
948 _addToken(new CharactersToken(">"));
949 state = scriptDataState;
950 } else if (data == "\u0000") {
951 _addToken(new ParseErrorToken("invalid-codepoint"));
952 _addToken(new CharactersToken("\uFFFD"));
953 state = scriptDataDoubleEscapedState;
954 } else if (data == EOF) {
955 _addToken(new ParseErrorToken("eof-in-script-in-script"));
956 state = dataState;
957 } else {
958 _addToken(new CharactersToken(data));
959 state = scriptDataDoubleEscapedState;
960 }
961 return true;
962 }
963
964 bool scriptDataDoubleEscapedLessThanSignState() {
965 var data = stream.char();
966 if (data == "/") {
967 _addToken(new CharactersToken("/"));
968 temporaryBuffer = "";
969 state = scriptDataDoubleEscapeEndState;
970 } else {
971 stream.unget(data);
972 state = scriptDataDoubleEscapedState;
973 }
974 return true;
975 }
976
977 bool scriptDataDoubleEscapeEndState() {
978 var data = stream.char();
979 if (isWhitespace(data) || data == "/" || data == ">") {
980 _addToken(new CharactersToken(data));
981 if (temporaryBuffer.toLowerCase() == "script") {
982 state = scriptDataEscapedState;
983 } else {
984 state = scriptDataDoubleEscapedState;
985 }
986 } else if (isLetter(data)) {
987 _addToken(new CharactersToken(data));
988 temporaryBuffer = '${temporaryBuffer}$data';
989 } else {
990 stream.unget(data);
991 state = scriptDataDoubleEscapedState;
992 }
993 return true;
994 }
995
996 bool beforeAttributeNameState() {
997 var data = stream.char();
998 if (isWhitespace(data)) {
999 stream.charsUntil(spaceCharacters, true);
1000 } else if (isLetter(data)) {
1001 _addAttribute(data);
1002 state = attributeNameState;
1003 } else if (data == ">") {
1004 emitCurrentToken();
1005 } else if (data == "/") {
1006 state = selfClosingStartTagState;
1007 } else if (data == EOF) {
1008 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));
1009 state = dataState;
1010 } else if ("'\"=<".contains(data)) {
1011 _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
1012 _addAttribute(data);
1013 state = attributeNameState;
1014 } else if (data == "\u0000") {
1015 _addToken(new ParseErrorToken("invalid-codepoint"));
1016 _addAttribute("\uFFFD");
1017 state = attributeNameState;
1018 } else {
1019 _addAttribute(data);
1020 state = attributeNameState;
1021 }
1022 return true;
1023 }
1024
1025 bool attributeNameState() {
1026 var data = stream.char();
1027 bool leavingThisState = true;
1028 bool emitToken = false;
1029 if (data == "=") {
1030 state = beforeAttributeValueState;
1031 } else if (isLetter(data)) {
1032 _attributeName = '$_attributeName$data'
1033 '${stream.charsUntil(asciiLetters, true)}';
1034 leavingThisState = false;
1035 } else if (data == ">") {
1036 // XXX If we emit here, the attributes are converted to a map
1037 // without being checked, and when the code below runs we would error
1038 // because data is a map, not a list.
1039 emitToken = true;
1040 } else if (isWhitespace(data)) {
1041 state = afterAttributeNameState;
1042 } else if (data == "/") {
1043 state = selfClosingStartTagState;
1044 } else if (data == "\u0000") {
1045 _addToken(new ParseErrorToken("invalid-codepoint"));
1046 _attributeName = '${_attributeName}\uFFFD';
1047 leavingThisState = false;
1048 } else if (data == EOF) {
1049 _addToken(new ParseErrorToken("eof-in-attribute-name"));
1050 state = dataState;
1051 } else if ("'\"<".contains(data)) {
1052 _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
1053 _attributeName = '$_attributeName$data';
1054 leavingThisState = false;
1055 } else {
1056 _attributeName = '$_attributeName$data';
1057 leavingThisState = false;
1058 }
1059
1060 if (leavingThisState) {
1061 _markAttributeNameEnd(-1);
1062
1063 // Attributes are not dropped at this stage. That happens when the
1064 // start tag token is emitted so values can still be safely appended
1065 // to attributes, but we do want to report the parse error in time.
1066 if (lowercaseAttrName) {
1067 _attributeName = asciiUpper2Lower(_attributeName);
1068 }
1069 if (_attributeNames == null) _attributeNames = new Set();
1070 if (_attributeNames.contains(_attributeName)) {
1071 _addToken(new ParseErrorToken("duplicate-attribute"));
1072 }
1073 _attributeNames.add(_attributeName);
1074
1075 // XXX Fix for above XXX
1076 if (emitToken) {
1077 emitCurrentToken();
1078 }
1079 }
1080 return true;
1081 }
1082
1083 bool afterAttributeNameState() {
1084 var data = stream.char();
1085 if (isWhitespace(data)) {
1086 stream.charsUntil(spaceCharacters, true);
1087 } else if (data == "=") {
1088 state = beforeAttributeValueState;
1089 } else if (data == ">") {
1090 emitCurrentToken();
1091 } else if (isLetter(data)) {
1092 _addAttribute(data);
1093 state = attributeNameState;
1094 } else if (data == "/") {
1095 state = selfClosingStartTagState;
1096 } else if (data == "\u0000") {
1097 _addToken(new ParseErrorToken("invalid-codepoint"));
1098 _addAttribute("\uFFFD");
1099 state = attributeNameState;
1100 } else if (data == EOF) {
1101 _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof"));
1102 state = dataState;
1103 } else if ("'\"<".contains(data)) {
1104 _addToken(new ParseErrorToken("invalid-character-after-attribute-name"));
1105 _addAttribute(data);
1106 state = attributeNameState;
1107 } else {
1108 _addAttribute(data);
1109 state = attributeNameState;
1110 }
1111 return true;
1112 }
1113
1114 bool beforeAttributeValueState() {
1115 var data = stream.char();
1116 if (isWhitespace(data)) {
1117 stream.charsUntil(spaceCharacters, true);
1118 } else if (data == "\"") {
1119 _markAttributeValueStart(0);
1120 state = attributeValueDoubleQuotedState;
1121 } else if (data == "&") {
1122 state = attributeValueUnQuotedState;
1123 stream.unget(data);
1124 _markAttributeValueStart(0);
1125 } else if (data == "'") {
1126 _markAttributeValueStart(0);
1127 state = attributeValueSingleQuotedState;
1128 } else if (data == ">") {
1129 _addToken(new ParseErrorToken(
1130 "expected-attribute-value-but-got-right-bracket"));
1131 emitCurrentToken();
1132 } else if (data == "\u0000") {
1133 _addToken(new ParseErrorToken("invalid-codepoint"));
1134 _markAttributeValueStart(-1);
1135 _attributeValue = '${_attributeValue}\uFFFD';
1136 state = attributeValueUnQuotedState;
1137 } else if (data == EOF) {
1138 _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof"));
1139 state = dataState;
1140 } else if ("=<`".contains(data)) {
1141 _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value"));
1142 _markAttributeValueStart(-1);
1143 _attributeValue = '$_attributeValue$data';
1144 state = attributeValueUnQuotedState;
1145 } else {
1146 _markAttributeValueStart(-1);
1147 _attributeValue = '$_attributeValue$data';
1148 state = attributeValueUnQuotedState;
1149 }
1150 return true;
1151 }
1152
1153 bool attributeValueDoubleQuotedState() {
1154 var data = stream.char();
1155 if (data == "\"") {
1156 _markAttributeValueEnd(-1);
1157 _markAttributeEnd(0);
1158 state = afterAttributeValueState;
1159 } else if (data == "&") {
1160 processEntityInAttribute('"');
1161 } else if (data == "\u0000") {
1162 _addToken(new ParseErrorToken("invalid-codepoint"));
1163 _attributeValue = '${_attributeValue}\uFFFD';
1164 } else if (data == EOF) {
1165 _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote"));
1166 _markAttributeValueEnd(-1);
1167 state = dataState;
1168 } else {
1169 _attributeValue = '$_attributeValue$data${stream.charsUntil("\"&")}';
1170 }
1171 return true;
1172 }
1173
1174 bool attributeValueSingleQuotedState() {
1175 var data = stream.char();
1176 if (data == "'") {
1177 _markAttributeValueEnd(-1);
1178 _markAttributeEnd(0);
1179 state = afterAttributeValueState;
1180 } else if (data == "&") {
1181 processEntityInAttribute("'");
1182 } else if (data == "\u0000") {
1183 _addToken(new ParseErrorToken("invalid-codepoint"));
1184 _attributeValue = '${_attributeValue}\uFFFD';
1185 } else if (data == EOF) {
1186 _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote"));
1187 _markAttributeValueEnd(-1);
1188 state = dataState;
1189 } else {
1190 _attributeValue = '$_attributeValue$data${stream.charsUntil("\'&")}';
1191 }
1192 return true;
1193 }
1194
1195 bool attributeValueUnQuotedState() {
1196 var data = stream.char();
1197 if (isWhitespace(data)) {
1198 _markAttributeValueEnd(-1);
1199 state = beforeAttributeNameState;
1200 } else if (data == "&") {
1201 processEntityInAttribute(">");
1202 } else if (data == ">") {
1203 _markAttributeValueEnd(-1);
1204 emitCurrentToken();
1205 } else if (data == EOF) {
1206 _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes"));
1207 _markAttributeValueEnd(-1);
1208 state = dataState;
1209 } else if ('"\'=<`'.contains(data)) {
1210 _addToken(new ParseErrorToken(
1211 "unexpected-character-in-unquoted-attribute-value"));
1212 _attributeValue = '$_attributeValue$data';
1213 } else if (data == "\u0000") {
1214 _addToken(new ParseErrorToken("invalid-codepoint"));
1215 _attributeValue = '${_attributeValue}\uFFFD';
1216 } else {
1217 _attributeValue = '$_attributeValue$data'
1218 '${stream.charsUntil("&>\"\'=<`$spaceCharacters")}';
1219 }
1220 return true;
1221 }
1222
1223 bool afterAttributeValueState() {
1224 var data = stream.char();
1225 if (isWhitespace(data)) {
1226 state = beforeAttributeNameState;
1227 } else if (data == ">") {
1228 emitCurrentToken();
1229 } else if (data == "/") {
1230 state = selfClosingStartTagState;
1231 } else if (data == EOF) {
1232 _addToken(new ParseErrorToken("unexpected-EOF-after-attribute-value"));
1233 stream.unget(data);
1234 state = dataState;
1235 } else {
1236 _addToken(new ParseErrorToken(
1237 "unexpected-character-after-attribute-value"));
1238 stream.unget(data);
1239 state = beforeAttributeNameState;
1240 }
1241 return true;
1242 }
1243
1244 bool selfClosingStartTagState() {
1245 var data = stream.char();
1246 if (data == ">") {
1247 currentTagToken.selfClosing = true;
1248 emitCurrentToken();
1249 } else if (data == EOF) {
1250 _addToken(new ParseErrorToken("unexpected-EOF-after-solidus-in-tag"));
1251 stream.unget(data);
1252 state = dataState;
1253 } else {
1254 _addToken(new ParseErrorToken(
1255 "unexpected-character-after-soldius-in-tag"));
1256 stream.unget(data);
1257 state = beforeAttributeNameState;
1258 }
1259 return true;
1260 }
1261
1262 bool bogusCommentState() {
1263 // Make a new comment token and give it as value all the characters
1264 // until the first > or EOF (charsUntil checks for EOF automatically)
1265 // and emit it.
1266 var data = stream.charsUntil(">");
1267 data = data.replaceAll("\u0000", "\uFFFD");
1268 _addToken(new CommentToken(data));
1269
1270 // Eat the character directly after the bogus comment which is either a
1271 // ">" or an EOF.
1272 stream.char();
1273 state = dataState;
1274 return true;
1275 }
1276
1277 bool markupDeclarationOpenState() {
1278 var charStack = [stream.char()];
1279 if (charStack.last == "-") {
1280 charStack.add(stream.char());
1281 if (charStack.last == "-") {
1282 currentToken = new CommentToken("");
1283 state = commentStartState;
1284 return true;
1285 }
1286 } else if (charStack.last == 'd' || charStack.last == 'D') {
1287 var matched = true;
1288 for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) {
1289 var char = stream.char();
1290 charStack.add(char);
1291 if (char == EOF || !expected.contains(char)) {
1292 matched = false;
1293 break;
1294 }
1295 }
1296 if (matched) {
1297 currentToken = new DoctypeToken(correct: true);
1298 state = doctypeState;
1299 return true;
1300 }
1301 } else if (charStack.last == "[" &&
1302 parser != null && parser.tree.openElements.length > 0 &&
1303 parser.tree.openElements.last.namespace
1304 != parser.tree.defaultNamespace) {
1305 var matched = true;
1306 for (var expected in const ["C", "D", "A", "T", "A", "["]) {
1307 charStack.add(stream.char());
1308 if (charStack.last != expected) {
1309 matched = false;
1310 break;
1311 }
1312 }
1313 if (matched) {
1314 state = cdataSectionState;
1315 return true;
1316 }
1317 }
1318
1319 _addToken(new ParseErrorToken("expected-dashes-or-doctype"));
1320
1321 while (charStack.length > 0) {
1322 stream.unget(charStack.removeLast());
1323 }
1324 state = bogusCommentState;
1325 return true;
1326 }
1327
1328 bool commentStartState() {
1329 var data = stream.char();
1330 if (data == "-") {
1331 state = commentStartDashState;
1332 } else if (data == "\u0000") {
1333 _addToken(new ParseErrorToken("invalid-codepoint"));
1334 currentStringToken.data = '${currentStringToken.data}\uFFFD';
1335 } else if (data == ">") {
1336 _addToken(new ParseErrorToken("incorrect-comment"));
1337 _addToken(currentToken);
1338 state = dataState;
1339 } else if (data == EOF) {
1340 _addToken(new ParseErrorToken("eof-in-comment"));
1341 _addToken(currentToken);
1342 state = dataState;
1343 } else {
1344 currentStringToken.data = '${currentStringToken.data}$data';
1345 state = commentState;
1346 }
1347 return true;
1348 }
1349
1350 bool commentStartDashState() {
1351 var data = stream.char();
1352 if (data == "-") {
1353 state = commentEndState;
1354 } else if (data == "\u0000") {
1355 _addToken(new ParseErrorToken("invalid-codepoint"));
1356 currentStringToken.data = '${currentStringToken.data}-\uFFFD';
1357 } else if (data == ">") {
1358 _addToken(new ParseErrorToken("incorrect-comment"));
1359 _addToken(currentToken);
1360 state = dataState;
1361 } else if (data == EOF) {
1362 _addToken(new ParseErrorToken("eof-in-comment"));
1363 _addToken(currentToken);
1364 state = dataState;
1365 } else {
1366 currentStringToken.data = '${currentStringToken.data}-${data}';
1367 state = commentState;
1368 }
1369 return true;
1370 }
1371
1372 bool commentState() {
1373 var data = stream.char();
1374 if (data == "-") {
1375 state = commentEndDashState;
1376 } else if (data == "\u0000") {
1377 _addToken(new ParseErrorToken("invalid-codepoint"));
1378 currentStringToken.data = '${currentStringToken.data}\uFFFD';
1379 } else if (data == EOF) {
1380 _addToken(new ParseErrorToken("eof-in-comment"));
1381 _addToken(currentToken);
1382 state = dataState;
1383 } else {
1384 currentStringToken.data = '${currentStringToken.data}$data'
1385 '${stream.charsUntil("-\u0000")}';
1386 }
1387 return true;
1388 }
1389
1390 bool commentEndDashState() {
1391 var data = stream.char();
1392 if (data == "-") {
1393 state = commentEndState;
1394 } else if (data == "\u0000") {
1395 _addToken(new ParseErrorToken("invalid-codepoint"));
1396 currentStringToken.data = "${currentStringToken.data}-\uFFFD";
1397 state = commentState;
1398 } else if (data == EOF) {
1399 _addToken(new ParseErrorToken("eof-in-comment-end-dash"));
1400 _addToken(currentToken);
1401 state = dataState;
1402 } else {
1403 currentStringToken.data = "${currentStringToken.data}-${data}";
1404 state = commentState;
1405 }
1406 return true;
1407 }
1408
1409 bool commentEndState() {
1410 var data = stream.char();
1411 if (data == ">") {
1412 _addToken(currentToken);
1413 state = dataState;
1414 } else if (data == "\u0000") {
1415 _addToken(new ParseErrorToken("invalid-codepoint"));
1416 currentStringToken.data = '${currentStringToken.data}--\uFFFD';
1417 state = commentState;
1418 } else if (data == "!") {
1419 _addToken(new ParseErrorToken(
1420 "unexpected-bang-after-double-dash-in-comment"));
1421 state = commentEndBangState;
1422 } else if (data == "-") {
1423 _addToken(new ParseErrorToken(
1424 "unexpected-dash-after-double-dash-in-comment"));
1425 currentStringToken.data = '${currentStringToken.data}$data';
1426 } else if (data == EOF) {
1427 _addToken(new ParseErrorToken("eof-in-comment-double-dash"));
1428 _addToken(currentToken);
1429 state = dataState;
1430 } else {
1431 // XXX
1432 _addToken(new ParseErrorToken("unexpected-char-in-comment"));
1433 currentStringToken.data = "${currentStringToken.data}--${data}";
1434 state = commentState;
1435 }
1436 return true;
1437 }
1438
1439 bool commentEndBangState() {
1440 var data = stream.char();
1441 if (data == ">") {
1442 _addToken(currentToken);
1443 state = dataState;
1444 } else if (data == "-") {
1445 currentStringToken.data = '${currentStringToken.data}--!';
1446 state = commentEndDashState;
1447 } else if (data == "\u0000") {
1448 _addToken(new ParseErrorToken("invalid-codepoint"));
1449 currentStringToken.data = '${currentStringToken.data}--!\uFFFD';
1450 state = commentState;
1451 } else if (data == EOF) {
1452 _addToken(new ParseErrorToken("eof-in-comment-end-bang-state"));
1453 _addToken(currentToken);
1454 state = dataState;
1455 } else {
1456 currentStringToken.data = "${currentStringToken.data}--!${data}";
1457 state = commentState;
1458 }
1459 return true;
1460 }
1461
1462 bool doctypeState() {
1463 var data = stream.char();
1464 if (isWhitespace(data)) {
1465 state = beforeDoctypeNameState;
1466 } else if (data == EOF) {
1467 _addToken(new ParseErrorToken(
1468 "expected-doctype-name-but-got-eof"));
1469 currentDoctypeToken.correct = false;
1470 _addToken(currentToken);
1471 state = dataState;
1472 } else {
1473 _addToken(new ParseErrorToken("need-space-after-doctype"));
1474 stream.unget(data);
1475 state = beforeDoctypeNameState;
1476 }
1477 return true;
1478 }
1479
1480 bool beforeDoctypeNameState() {
1481 var data = stream.char();
1482 if (isWhitespace(data)) {
1483 return true;
1484 } else if (data == ">") {
1485 _addToken(new ParseErrorToken(
1486 "expected-doctype-name-but-got-right-bracket"));
1487 currentDoctypeToken.correct = false;
1488 _addToken(currentToken);
1489 state = dataState;
1490 } else if (data == "\u0000") {
1491 _addToken(new ParseErrorToken("invalid-codepoint"));
1492 currentDoctypeToken.name = "\uFFFD";
1493 state = doctypeNameState;
1494 } else if (data == EOF) {
1495 _addToken(new ParseErrorToken(
1496 "expected-doctype-name-but-got-eof"));
1497 currentDoctypeToken.correct = false;
1498 _addToken(currentToken);
1499 state = dataState;
1500 } else {
1501 currentDoctypeToken.name = data;
1502 state = doctypeNameState;
1503 }
1504 return true;
1505 }
1506
1507 bool doctypeNameState() {
1508 var data = stream.char();
1509 if (isWhitespace(data)) {
1510 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1511 state = afterDoctypeNameState;
1512 } else if (data == ">") {
1513 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1514 _addToken(currentToken);
1515 state = dataState;
1516 } else if (data == "\u0000") {
1517 _addToken(new ParseErrorToken("invalid-codepoint"));
1518 currentDoctypeToken.name = "${currentDoctypeToken.name}\uFFFD";
1519 state = doctypeNameState;
1520 } else if (data == EOF) {
1521 _addToken(new ParseErrorToken("eof-in-doctype-name"));
1522 currentDoctypeToken.correct = false;
1523 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1524 _addToken(currentToken);
1525 state = dataState;
1526 } else {
1527 currentDoctypeToken.name = '${currentDoctypeToken.name}$data';
1528 }
1529 return true;
1530 }
1531
1532 bool afterDoctypeNameState() {
1533 var data = stream.char();
1534 if (isWhitespace(data)) {
1535 return true;
1536 } else if (data == ">") {
1537 _addToken(currentToken);
1538 state = dataState;
1539 } else if (data == EOF) {
1540 currentDoctypeToken.correct = false;
1541 stream.unget(data);
1542 _addToken(new ParseErrorToken("eof-in-doctype"));
1543 _addToken(currentToken);
1544 state = dataState;
1545 } else {
1546 if (data == "p" || data == "P") {
1547 // TODO(jmesserly): would be nice to have a helper for this.
1548 var matched = true;
1549 for (var expected in const ["uU", "bB", "lL", "iI", "cC"]) {
1550 data = stream.char();
1551 if (data == EOF || !expected.contains(data)) {
1552 matched = false;
1553 break;
1554 }
1555 }
1556 if (matched) {
1557 state = afterDoctypePublicKeywordState;
1558 return true;
1559 }
1560 } else if (data == "s" || data == "S") {
1561 var matched = true;
1562 for (var expected in const ["yY", "sS", "tT", "eE", "mM"]) {
1563 data = stream.char();
1564 if (data == EOF || !expected.contains(data)) {
1565 matched = false;
1566 break;
1567 }
1568 }
1569 if (matched) {
1570 state = afterDoctypeSystemKeywordState;
1571 return true;
1572 }
1573 }
1574
1575 // All the characters read before the current 'data' will be
1576 // [a-zA-Z], so they're garbage in the bogus doctype and can be
1577 // discarded; only the latest character might be '>' or EOF
1578 // and needs to be ungetted
1579 stream.unget(data);
1580 _addToken(new ParseErrorToken(
1581 "expected-space-or-right-bracket-in-doctype",
1582 messageParams: {"data": data}));
1583 currentDoctypeToken.correct = false;
1584 state = bogusDoctypeState;
1585 }
1586 return true;
1587 }
1588
1589 bool afterDoctypePublicKeywordState() {
1590 var data = stream.char();
1591 if (isWhitespace(data)) {
1592 state = beforeDoctypePublicIdentifierState;
1593 } else if (data == "'" || data == '"') {
1594 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1595 stream.unget(data);
1596 state = beforeDoctypePublicIdentifierState;
1597 } else if (data == EOF) {
1598 _addToken(new ParseErrorToken("eof-in-doctype"));
1599 currentDoctypeToken.correct = false;
1600 _addToken(currentToken);
1601 state = dataState;
1602 } else {
1603 stream.unget(data);
1604 state = beforeDoctypePublicIdentifierState;
1605 }
1606 return true;
1607 }
1608
1609 bool beforeDoctypePublicIdentifierState() {
1610 var data = stream.char();
1611 if (isWhitespace(data)) {
1612 return true;
1613 } else if (data == "\"") {
1614 currentDoctypeToken.publicId = "";
1615 state = doctypePublicIdentifierDoubleQuotedState;
1616 } else if (data == "'") {
1617 currentDoctypeToken.publicId = "";
1618 state = doctypePublicIdentifierSingleQuotedState;
1619 } else if (data == ">") {
1620 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1621 currentDoctypeToken.correct = false;
1622 _addToken(currentToken);
1623 state = dataState;
1624 } else if (data == EOF) {
1625 _addToken(new ParseErrorToken("eof-in-doctype"));
1626 currentDoctypeToken.correct = false;
1627 _addToken(currentToken);
1628 state = dataState;
1629 } else {
1630 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1631 currentDoctypeToken.correct = false;
1632 state = bogusDoctypeState;
1633 }
1634 return true;
1635 }
1636
1637 bool doctypePublicIdentifierDoubleQuotedState() {
1638 var data = stream.char();
1639 if (data == '"') {
1640 state = afterDoctypePublicIdentifierState;
1641 } else if (data == "\u0000") {
1642 _addToken(new ParseErrorToken("invalid-codepoint"));
1643 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
1644 } else if (data == ">") {
1645 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1646 currentDoctypeToken.correct = false;
1647 _addToken(currentToken);
1648 state = dataState;
1649 } else if (data == EOF) {
1650 _addToken(new ParseErrorToken("eof-in-doctype"));
1651 currentDoctypeToken.correct = false;
1652 _addToken(currentToken);
1653 state = dataState;
1654 } else {
1655 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data';
1656 }
1657 return true;
1658 }
1659
1660 bool doctypePublicIdentifierSingleQuotedState() {
1661 var data = stream.char();
1662 if (data == "'") {
1663 state = afterDoctypePublicIdentifierState;
1664 } else if (data == "\u0000") {
1665 _addToken(new ParseErrorToken("invalid-codepoint"));
1666 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
1667 } else if (data == ">") {
1668 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1669 currentDoctypeToken.correct = false;
1670 _addToken(currentToken);
1671 state = dataState;
1672 } else if (data == EOF) {
1673 _addToken(new ParseErrorToken("eof-in-doctype"));
1674 currentDoctypeToken.correct = false;
1675 _addToken(currentToken);
1676 state = dataState;
1677 } else {
1678 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data';
1679 }
1680 return true;
1681 }
1682
1683 bool afterDoctypePublicIdentifierState() {
1684 var data = stream.char();
1685 if (isWhitespace(data)) {
1686 state = betweenDoctypePublicAndSystemIdentifiersState;
1687 } else if (data == ">") {
1688 _addToken(currentToken);
1689 state = dataState;
1690 } else if (data == '"') {
1691 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1692 currentDoctypeToken.systemId = "";
1693 state = doctypeSystemIdentifierDoubleQuotedState;
1694 } else if (data == "'") {
1695 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1696 currentDoctypeToken.systemId = "";
1697 state = doctypeSystemIdentifierSingleQuotedState;
1698 } else if (data == EOF) {
1699 _addToken(new ParseErrorToken("eof-in-doctype"));
1700 currentDoctypeToken.correct = false;
1701 _addToken(currentToken);
1702 state = dataState;
1703 } else {
1704 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1705 currentDoctypeToken.correct = false;
1706 state = bogusDoctypeState;
1707 }
1708 return true;
1709 }
1710
1711 bool betweenDoctypePublicAndSystemIdentifiersState() {
1712 var data = stream.char();
1713 if (isWhitespace(data)) {
1714 return true;
1715 } else if (data == ">") {
1716 _addToken(currentToken);
1717 state = dataState;
1718 } else if (data == '"') {
1719 currentDoctypeToken.systemId = "";
1720 state = doctypeSystemIdentifierDoubleQuotedState;
1721 } else if (data == "'") {
1722 currentDoctypeToken.systemId = "";
1723 state = doctypeSystemIdentifierSingleQuotedState;
1724 } else if (data == EOF) {
1725 _addToken(new ParseErrorToken("eof-in-doctype"));
1726 currentDoctypeToken.correct = false;
1727 _addToken(currentToken);
1728 state = dataState;
1729 } else {
1730 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1731 currentDoctypeToken.correct = false;
1732 state = bogusDoctypeState;
1733 }
1734 return true;
1735 }
1736
1737 bool afterDoctypeSystemKeywordState() {
1738 var data = stream.char();
1739 if (isWhitespace(data)) {
1740 state = beforeDoctypeSystemIdentifierState;
1741 } else if (data == "'" || data == '"') {
1742 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1743 stream.unget(data);
1744 state = beforeDoctypeSystemIdentifierState;
1745 } else if (data == EOF) {
1746 _addToken(new ParseErrorToken("eof-in-doctype"));
1747 currentDoctypeToken.correct = false;
1748 _addToken(currentToken);
1749 state = dataState;
1750 } else {
1751 stream.unget(data);
1752 state = beforeDoctypeSystemIdentifierState;
1753 }
1754 return true;
1755 }
1756
1757 bool beforeDoctypeSystemIdentifierState() {
1758 var data = stream.char();
1759 if (isWhitespace(data)) {
1760 return true;
1761 } else if (data == "\"") {
1762 currentDoctypeToken.systemId = "";
1763 state = doctypeSystemIdentifierDoubleQuotedState;
1764 } else if (data == "'") {
1765 currentDoctypeToken.systemId = "";
1766 state = doctypeSystemIdentifierSingleQuotedState;
1767 } else if (data == ">") {
1768 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1769 currentDoctypeToken.correct = false;
1770 _addToken(currentToken);
1771 state = dataState;
1772 } else if (data == EOF) {
1773 _addToken(new ParseErrorToken("eof-in-doctype"));
1774 currentDoctypeToken.correct = false;
1775 _addToken(currentToken);
1776 state = dataState;
1777 } else {
1778 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1779 currentDoctypeToken.correct = false;
1780 state = bogusDoctypeState;
1781 }
1782 return true;
1783 }
1784
1785 bool doctypeSystemIdentifierDoubleQuotedState() {
1786 var data = stream.char();
1787 if (data == "\"") {
1788 state = afterDoctypeSystemIdentifierState;
1789 } else if (data == "\u0000") {
1790 _addToken(new ParseErrorToken("invalid-codepoint"));
1791 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
1792 } else if (data == ">") {
1793 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1794 currentDoctypeToken.correct = false;
1795 _addToken(currentToken);
1796 state = dataState;
1797 } else if (data == EOF) {
1798 _addToken(new ParseErrorToken("eof-in-doctype"));
1799 currentDoctypeToken.correct = false;
1800 _addToken(currentToken);
1801 state = dataState;
1802 } else {
1803 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data';
1804 }
1805 return true;
1806 }
1807
1808 bool doctypeSystemIdentifierSingleQuotedState() {
1809 var data = stream.char();
1810 if (data == "'") {
1811 state = afterDoctypeSystemIdentifierState;
1812 } else if (data == "\u0000") {
1813 _addToken(new ParseErrorToken("invalid-codepoint"));
1814 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
1815 } else if (data == ">") {
1816 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1817 currentDoctypeToken.correct = false;
1818 _addToken(currentToken);
1819 state = dataState;
1820 } else if (data == EOF) {
1821 _addToken(new ParseErrorToken("eof-in-doctype"));
1822 currentDoctypeToken.correct = false;
1823 _addToken(currentToken);
1824 state = dataState;
1825 } else {
1826 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data';
1827 }
1828 return true;
1829 }
1830
1831 bool afterDoctypeSystemIdentifierState() {
1832 var data = stream.char();
1833 if (isWhitespace(data)) {
1834 return true;
1835 } else if (data == ">") {
1836 _addToken(currentToken);
1837 state = dataState;
1838 } else if (data == EOF) {
1839 _addToken(new ParseErrorToken("eof-in-doctype"));
1840 currentDoctypeToken.correct = false;
1841 _addToken(currentToken);
1842 state = dataState;
1843 } else {
1844 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1845 state = bogusDoctypeState;
1846 }
1847 return true;
1848 }
1849
1850 bool bogusDoctypeState() {
1851 var data = stream.char();
1852 if (data == ">") {
1853 _addToken(currentToken);
1854 state = dataState;
1855 } else if (data == EOF) {
1856 // XXX EMIT
1857 stream.unget(data);
1858 _addToken(currentToken);
1859 state = dataState;
1860 }
1861 return true;
1862 }
1863
1864 bool cdataSectionState() {
1865 var data = [];
1866 int matchedEnd = 0;
1867 while (true) {
1868 var ch = stream.char();
1869 if (ch == EOF) {
1870 break;
1871 }
1872 // Deal with null here rather than in the parser
1873 if (ch == "\u0000") {
1874 _addToken(new ParseErrorToken("invalid-codepoint"));
1875 ch = "\uFFFD";
1876 }
1877 data.add(ch);
1878 // TODO(jmesserly): it'd be nice if we had an easier way to match the end,
1879 // perhaps with a "peek" API.
1880 if (ch == "]" && matchedEnd < 2) {
1881 matchedEnd++;
1882 } else if (ch == ">" && matchedEnd == 2) {
1883 // Remove "]]>" from the end.
1884 data.removeLast();
1885 data.removeLast();
1886 data.removeLast();
1887 break;
1888 } else {
1889 matchedEnd = 0;
1890 }
1891 }
1892
1893 if (data.length > 0) {
1894 _addToken(new CharactersToken(data.join()));
1895 }
1896 state = dataState;
1897 return true;
1898 }
1899 }
1900