Chromium Code Reviews (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out

Side by Side Diff: pkg/third_party/html5lib/lib/src/tokenizer.dart

Issue 814113004: Pull args, intl, logging, shelf, and source_maps out of the SDK. (Closed) Base URL:
Patch Set: Also csslib. Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
1 library tokenizer;
3 import 'dart:collection';
4 import 'package:html5lib/parser.dart' show HtmlParser;
5 import 'constants.dart';
6 import 'inputstream.dart';
7 import 'token.dart';
8 import 'utils.dart';
/// Entity names from [entities], bucketed by their first character so that a
/// lookup only has to scan the names that can possibly match.
// TODO(jmesserly): we could use a better data structure here like a trie, if
// we had it implemented in Dart.
Map<String, List<String>> entitiesByFirstChar = _groupEntitiesByFirstChar();

/// Builds the value of [entitiesByFirstChar] from the keys of [entities].
Map<String, List<String>> _groupEntitiesByFirstChar() {
  var buckets = {};
  for (var name in entities.keys) {
    var initial = name[0];
    var bucket = buckets[initial];
    if (bucket == null) {
      bucket = [];
      buckets[initial] = bucket;
    }
    bucket.add(name);
  }
  return buckets;
}
22 // TODO(jmesserly): lots of ways to make this faster:
23 // - use char codes everywhere instead of 1-char strings
24 // - use switch instead of contains, indexOf
25 // - use switch instead of the sequential if tests
26 // - avoid string concat
28 /// This class takes care of tokenizing HTML.
29 class HtmlTokenizer implements Iterator<Token> {
30 // TODO(jmesserly): a lot of these could be made private
32 final HtmlInputStream stream;
34 final bool lowercaseElementName;
36 final bool lowercaseAttrName;
38 /// True to generate spans for [Token.span].
39 final bool generateSpans;
41 /// True to generate spans for attributes.
42 final bool attributeSpans;
44 /// This reference to the parser is used for correct CDATA handling.
45 /// The [HtmlParser] will set this at construction time.
46 HtmlParser parser;
48 final Queue<Token> tokenQueue;
50 /// Holds the token that is currently being processed.
51 Token currentToken;
53 /// Holds a reference to the method to be invoked for the next parser state.
54 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode
55 // bug prevents us from doing that (the bug reference is missing here).
56 Function state;
58 String temporaryBuffer;
60 int _lastOffset;
62 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add
63 // an item until it's ready. But the code doesn't have a clear notion of when
64 // it's "done" with the attribute.
65 List<TagAttribute> _attributes;
66 Set<String> _attributeNames;
/// Creates a tokenizer that reads from [doc].
///
/// [doc], [encoding], [parseMeta], [generateSpans] and [sourceUrl] are handed
/// to the [HtmlInputStream] that backs [stream]. The remaining named
/// arguments initialize the fields of the same name. The constructor finishes
/// by calling [reset], which leaves the tokenizer in [dataState].
HtmlTokenizer(doc,
    {String encoding,
    bool parseMeta: true,
    this.lowercaseElementName: true,
    this.lowercaseAttrName: true,
    bool generateSpans: false,
    String sourceUrl,
    this.attributeSpans: false})
    : stream = new HtmlInputStream(
          doc, encoding, parseMeta, generateSpans, sourceUrl),
      tokenQueue = new Queue(),
      generateSpans = generateSpans {
  reset();
}
/// [currentToken], typed as a [TagToken].
TagToken get currentTagToken {
  return currentToken;
}

/// [currentToken], typed as a [DoctypeToken].
DoctypeToken get currentDoctypeToken {
  return currentToken;
}

/// [currentToken], typed as a [StringToken].
StringToken get currentStringToken {
  return currentToken;
}
Token _current;

/// The token most recently selected by [moveNext], or `null` if [moveNext]
/// has reported the end of the token stream.
Token get current {
  return _current;
}
/// The name of the attribute currently being built, i.e. the name of the
/// last entry in [_attributes].
///
/// Both accessors assume [_attributes] is non-empty; [_addAttribute] appends
/// the entry they operate on.
String get _attributeName => _attributes.last.name;
set _attributeName(String value) {
  _attributes.last.name = value;
}
/// The value of the attribute currently being built, i.e. the value of the
/// last entry in [_attributes].
String get _attributeValue {
  return _attributes.last.value;
}

set _attributeValue(String newValue) {
  var attr = _attributes.last;
  attr.value = newValue;
}
/// Records the end position of the current attribute as the stream position
/// shifted by [offset]. Does nothing unless [attributeSpans] is set.
void _markAttributeEnd(int offset) {
  if (!attributeSpans) return;
  _attributes.last.end = stream.position + offset;
}
/// Records where the current attribute's value starts: the stream position
/// shifted by [offset]. Does nothing unless [attributeSpans] is set.
void _markAttributeValueStart(int offset) {
  if (!attributeSpans) return;
  _attributes.last.startValue = stream.position + offset;
}
/// Records where the current attribute's value ends and, through
/// [_markAttributeEnd], where the attribute itself ends. Both positions are
/// the stream position shifted by [offset]. Does nothing unless
/// [attributeSpans] is set.
void _markAttributeValueEnd(int offset) {
  if (!attributeSpans) return;
  _attributes.last.endValue = stream.position + offset;
  _markAttributeEnd(offset);
}
/// Marks the end of the current attribute's name.
///
/// Only the end of the attribute as a whole is recorded; a separate name
/// span could be tracked here if it were ever needed.
void _markAttributeNameEnd(int offset) {
  _markAttributeEnd(offset);
}
/// Starts a new attribute called [name] and appends it to [_attributes],
/// creating the list on first use.
///
/// When [attributeSpans] is set, the attribute's start is recorded as the
/// current stream position minus the length of [name].
void _addAttribute(String name) {
  var created = new TagAttribute(name);
  if (attributeSpans) {
    created.start = stream.position - name.length;
  }
  if (_attributes == null) {
    _attributes = [];
  }
  _attributes.add(created);
}
/// Advances to the next token and stores it in [current].
///
/// The current [state] function is run repeatedly until either the stream
/// has a pending error or a token has been queued. Pending stream errors take
/// priority and are surfaced as [ParseErrorToken]s; otherwise the oldest
/// queued token is used. Returns false, with [current] set to `null`, as
/// soon as a state function returns false.
bool moveNext() {
  // Keep stepping the state machine until there is something to hand out.
  while (stream.errors.isEmpty && tokenQueue.isEmpty) {
    var keepGoing = state();
    if (!keepGoing) {
      _current = null;
      return false;
    }
  }

  if (stream.errors.isNotEmpty) {
    _current = new ParseErrorToken(stream.errors.removeFirst());
    return true;
  }

  assert(tokenQueue.isNotEmpty);
  _current = tokenQueue.removeFirst();
  return true;
}
/// Puts the tokenizer back into its initial configuration: no queued tokens,
/// no token or attribute under construction, and [state] set to [dataState].
///
/// Neither the [stream] nor the [parser] is reset.
void reset() {
  state = dataState;
  currentToken = null;
  temporaryBuffer = null;
  _attributes = null;
  _attributeNames = null;
  _lastOffset = 0;
  tokenQueue.clear();
}
/// Appends [token] to [tokenQueue].
///
/// When [generateSpans] is set and the token has no span yet, it is given a
/// span running from [_lastOffset] to the current stream position.
/// [_lastOffset] is then advanced, except for [ParseErrorToken]s, which leave
/// it untouched.
void _addToken(Token token) {
  var needsSpan = generateSpans && token.span == null;
  if (needsSpan) {
    final end = stream.position;
    token.span = stream.fileInfo.span(_lastOffset, end);
    if (token is! ParseErrorToken) _lastOffset = end;
  }
  tokenQueue.add(token);
}
/// Consumes the digits of a numeric character reference from [stream] and
/// returns the character it stands for.
///
/// Digits are read as hexadecimal when [isHex] is true and as decimal
/// otherwise. Code points that have an entry in [replacementCharacters] are
/// mapped to that entry; surrogates and values above U+10FFFF become U+FFFD.
/// Those cases, as well as the control characters and noncharacters listed
/// below, queue an "illegal-codepoint-for-numeric-entity" [ParseErrorToken].
///
/// A trailing ";" is discarded. If it is missing, a
/// "numeric-entity-without-semicolon" error is queued and the character that
/// was read in its place is pushed back onto the stream.
String consumeNumberEntity(bool isHex) {
  var allowed = isDigit;
  var radix = 10;
  if (isHex) {
    allowed = isHexDigit;
    radix = 16;
  }

  var charStack = [];

  // Consume all the characters that are in range while making sure we
  // don't hit an EOF.
  var c = stream.char();
  while (allowed(c) && c != EOF) {
    charStack.add(c);
    c = stream.char();
  }

  // Convert the set of characters consumed to an int.
  var charAsInt = parseIntRadix(charStack.join(), radix);

  // Certain characters get replaced with others
  var char = replacementCharacters[charAsInt];
  if (char != null) {
    _addToken(new ParseErrorToken(
        "illegal-codepoint-for-numeric-entity",
        messageParams: {"charAsInt": charAsInt}));
  } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF)
      || (charAsInt > 0x10FFFF)) {
    char = "\uFFFD";
    _addToken(new ParseErrorToken(
        "illegal-codepoint-for-numeric-entity",
        messageParams: {"charAsInt": charAsInt}));
  } else {
    // Control characters and noncharacters are reported but still emitted.
    // The list covers U+000B plus the last two code points of every plane
    // (U+xFFFE / U+xFFFF for planes 0 through 16).
    // Should speed up this check somehow (e.g. move the set to a constant)
    if ((0x0001 <= charAsInt && charAsInt <= 0x0008) ||
        (0x000E <= charAsInt && charAsInt <= 0x001F) ||
        (0x007F <= charAsInt && charAsInt <= 0x009F) ||
        (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) ||
        const [0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
            0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
            0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
            0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
            0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
            0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
            0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
            0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
            0xFFFFF, 0x10FFFE, 0x10FFFF].contains(charAsInt)) {
      _addToken(new ParseErrorToken(
          "illegal-codepoint-for-numeric-entity",
          messageParams: {"charAsInt": charAsInt}));
    }
    char = new String.fromCharCodes([charAsInt]);
  }

  // Discard the ; if present. Otherwise, put it back on the queue and
  // invoke parseError on parser.
  if (c != ";") {
    _addToken(new ParseErrorToken(
        "numeric-entity-without-semicolon"));
    stream.unget(c);
  }
  return char;
}
/// Consumes a character reference (the part after "&") from [stream] and
/// emits the resulting text.
///
/// If the next character is whitespace, "<", "&", EOF or [allowedChar], no
/// reference is consumed and a literal "&" is produced. A leading "#" starts
/// a numeric reference, handled by [consumeNumberEntity]. Anything else is
/// matched against the names in [entities], preferring the longest match.
///
/// With [fromAttribute] set the text is appended to the current attribute
/// value; otherwise it is queued as a [SpaceCharactersToken] or
/// [CharactersToken].
void consumeEntity({String allowedChar, bool fromAttribute: false}) {
  // Default output for when no entity is matched.
  var output = "&";

  var consumed = [stream.char()];
  var first = consumed[0];
  if (isWhitespace(first) ||
      first == '<' ||
      first == '&' ||
      first == EOF ||
      allowedChar == first) {
    // Not a character reference at all: leave the character for the caller.
    stream.unget(first);
  } else if (first == "#") {
    // Peek one character further to tell hex from decimal.
    consumed.add(stream.char());
    var hex = consumed.last == 'x' || consumed.last == 'X';
    if (hex) {
      consumed.add(stream.char());
    }

    // consumed.last should now be the first digit.
    var startsWithDigit =
        hex ? isHexDigit(consumed.last) : isDigit(consumed.last);
    if (startsWithDigit) {
      // At least one digit found, so consume the whole number.
      stream.unget(consumed.last);
      output = consumeNumberEntity(hex);
    } else {
      // No digits found.
      _addToken(new ParseErrorToken("expected-numeric-entity"));
      stream.unget(consumed.removeLast());
      output = "&${consumed.join()}";
    }
  } else {
    // Possibly a named entity. Keep reading characters for as long as what
    // has been read so far is still a prefix of some known entity name.
    var candidates = entitiesByFirstChar[first];
    if (candidates == null) candidates = const [];

    while (consumed.last != EOF) {
      var prefix = consumed.join();
      candidates =
          candidates.where((entity) => entity.startsWith(prefix)).toList();
      if (candidates.isEmpty) break;
      consumed.add(stream.char());
    }

    // Look for the longest known entity name at the start of what was read
    // (this takes care of "&noti", for instance).
    String entityName = null;
    int entityLen;
    for (entityLen = consumed.length - 1; entityLen > 1; entityLen--) {
      var candidate = consumed.sublist(0, entityLen).join();
      if (entities.containsKey(candidate)) {
        entityName = candidate;
        break;
      }
    }

    if (entityName == null) {
      _addToken(new ParseErrorToken("expected-named-entity"));
      stream.unget(consumed.removeLast());
      output = "&${consumed.join()}";
    } else {
      var missingSemicolon = entityName[entityName.length - 1] != ";";
      if (missingSemicolon) {
        _addToken(new ParseErrorToken(
            "named-entity-without-semicolon"));
      }
      if (missingSemicolon &&
          fromAttribute &&
          (isLetterOrDigit(consumed[entityLen]) ||
              consumed[entityLen] == '=')) {
        // Inside an attribute, an unterminated name followed by a letter,
        // digit or "=" is kept as literal text.
        stream.unget(consumed.removeLast());
        output = "&${consumed.join()}";
      } else {
        var replacement = entities[entityName];
        stream.unget(consumed.removeLast());
        output = '${replacement}${slice(consumed, entityLen).join()}';
      }
    }
  }

  if (fromAttribute) {
    _attributeValue = '$_attributeValue$output';
    return;
  }
  _addToken(isWhitespace(output)
      ? new SpaceCharactersToken(output)
      : new CharactersToken(output));
}
/// Consumes a character reference inside an attribute value and appends the
/// result to that value; [allowedChar] is forwarded to [consumeEntity].
///
/// This takes the place of a dedicated "entityInAttributeValueState".
void processEntityInAttribute(String allowedChar) =>
    consumeEntity(allowedChar: allowedChar, fromAttribute: true);
/// Queues [currentToken] and switches back to [dataState].
///
/// For tag tokens this also:
///  * lowercases the tag name when [lowercaseElementName] is set;
///  * reports "attributes-in-end-tag" / "this-closing-flag-on-end-tag" for
///    end tags that carry attributes or a self-closing flag;
///  * converts the collected [_attributes] of a start tag into an
///    insertion-ordered map in which the first occurrence of a name wins;
///  * clears the per-tag attribute bookkeeping.
void emitCurrentToken() {
  var token = currentToken;
  // Add token to the queue to be yielded
  if (token is TagToken) {
    if (lowercaseElementName) {
      token.name = asciiUpper2Lower(token.name);
    }
    if (token is EndTagToken) {
      if (_attributes != null) {
        _addToken(new ParseErrorToken("attributes-in-end-tag"));
      }
      if (token.selfClosing) {
        _addToken(new ParseErrorToken("this-closing-flag-on-end-tag"));
      }
    } else if (token is StartTagToken) {
      // HTML5 specific normalizations to the token stream.
      // Convert the list into a map where first key wins.
      token.data = new LinkedHashMap<Object, String>();
      if (_attributes != null) {
        for (var attr in _attributes) {
          token.data.putIfAbsent(attr.name, () => attr.value);
        }
        if (attributeSpans) token.attributeSpans = _attributes;
      }
    }
    _attributes = null;
    _attributeNames = null;
  }
  _addToken(token);
  state = dataState;
}
375 // Below are the various tokenizer states worked out.
/// The "data" state: dispatches on the next character of [stream].
///
/// Returns false at EOF, which ends tokenization, and true otherwise.
bool dataState() {
  var ch = stream.char();
  if (ch == "&") {
    state = entityDataState;
    return true;
  }
  if (ch == "<") {
    state = tagOpenState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\u0000"));
    return true;
  }
  if (ch == EOF) {
    // Tokenization ends.
    return false;
  }
  if (isWhitespace(ch)) {
    // The tokenizer returns to this state right after emitting a token, and
    // whitespace matters at that point, so a run of space characters gets a
    // token of its own.
    var moreSpaces = stream.charsUntil(spaceCharacters, true);
    _addToken(new SpaceCharactersToken('${ch}${moreSpaces}'));
    return true;
  }
  var text = stream.charsUntil("&<\u0000");
  _addToken(new CharactersToken('${ch}${text}'));
  return true;
}
/// Consumes a character reference found in the data state, then returns to
/// [dataState]. Always returns true.
bool entityDataState() {
  state = dataState;
  consumeEntity();
  return true;
}
/// The "RCDATA" state: text in which character references and "<" are still
/// recognized.
///
/// Returns false at EOF, which ends tokenization, and true otherwise.
bool rcdataState() {
  var ch = stream.char();
  if (ch == "&") {
    state = characterReferenceInRcdata;
    return true;
  }
  if (ch == "<") {
    state = rcdataLessThanSignState;
    return true;
  }
  if (ch == EOF) {
    // Tokenization ends.
    return false;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    return true;
  }
  if (isWhitespace(ch)) {
    // A run of space characters is emitted as its own token.
    var moreSpaces = stream.charsUntil(spaceCharacters, true);
    _addToken(new SpaceCharactersToken('${ch}${moreSpaces}'));
    return true;
  }
  var text = stream.charsUntil("&<");
  _addToken(new CharactersToken('${ch}${text}'));
  return true;
}
/// Consumes a character reference found in RCDATA, then returns to
/// [rcdataState]. Always returns true.
bool characterReferenceInRcdata() {
  state = rcdataState;
  consumeEntity();
  return true;
}
/// The "RAWTEXT" state: only "<" and U+0000 get special treatment.
///
/// Returns false at EOF, which ends tokenization, and true otherwise.
bool rawtextState() {
  var ch = stream.char();
  if (ch == "<") {
    state = rawtextLessThanSignState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    return true;
  }
  if (ch == EOF) {
    // Tokenization ends.
    return false;
  }
  var text = stream.charsUntil("<\u0000");
  _addToken(new CharactersToken("${ch}${text}"));
  return true;
}
/// The "script data" state: only "<" and U+0000 get special treatment.
///
/// Returns false at EOF, which ends tokenization, and true otherwise.
bool scriptDataState() {
  var ch = stream.char();
  if (ch == "<") {
    state = scriptDataLessThanSignState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    return true;
  }
  if (ch == EOF) {
    // Tokenization ends.
    return false;
  }
  var text = stream.charsUntil("<\u0000");
  _addToken(new CharactersToken("${ch}${text}"));
  return true;
}
/// The "PLAINTEXT" state: everything up to EOF is character data, with
/// U+0000 replaced by U+FFFD.
///
/// Returns false at EOF, which ends tokenization, and true otherwise.
bool plaintextState() {
  var ch = stream.char();
  if (ch == EOF) {
    // Tokenization ends.
    return false;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    return true;
  }
  var text = stream.charsUntil("\u0000");
  _addToken(new CharactersToken('${ch}${text}'));
  return true;
}
/// The "tag open" state, entered after a "<" in the data state.
///
/// Starts a markup declaration, an end tag or a start tag depending on the
/// next character. ">" and any unexpected character are reported and turned
/// back into character data; "?" is reported and handled as a bogus comment.
/// Always returns true.
bool tagOpenState() {
  var ch = stream.char();
  if (ch == "!") {
    state = markupDeclarationOpenState;
    return true;
  }
  if (ch == "/") {
    state = closeTagOpenState;
    return true;
  }
  if (isLetter(ch)) {
    currentToken = new StartTagToken(ch);
    state = tagNameState;
    return true;
  }
  if (ch == ">") {
    // XXX In theory it could be something besides a tag name. But
    // do we really care?
    _addToken(new ParseErrorToken(
        "expected-tag-name-but-got-right-bracket"));
    _addToken(new CharactersToken("<>"));
    state = dataState;
    return true;
  }
  if (ch == "?") {
    // XXX In theory it could be something besides a tag name. But
    // do we really care?
    _addToken(new ParseErrorToken(
        "expected-tag-name-but-got-question-mark"));
    stream.unget(ch);
    state = bogusCommentState;
    return true;
  }
  // XXX
  _addToken(new ParseErrorToken("expected-tag-name"));
  _addToken(new CharactersToken("<"));
  stream.unget(ch);
  state = dataState;
  return true;
}
/// The "close tag open" state, entered after "</" in the data state.
///
/// A letter starts an [EndTagToken]. ">" and EOF are reported and lead back
/// to [dataState] (EOF also emits the literal "</"). Any other character is
/// reported and handled as a bogus comment. Always returns true.
bool closeTagOpenState() {
  var ch = stream.char();
  if (isLetter(ch)) {
    currentToken = new EndTagToken(ch);
    state = tagNameState;
    return true;
  }
  if (ch == ">") {
    _addToken(new ParseErrorToken(
        "expected-closing-tag-but-got-right-bracket"));
    state = dataState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken(
        "expected-closing-tag-but-got-eof"));
    _addToken(new CharactersToken("</"));
    state = dataState;
    return true;
  }
  // XXX data can be _'_...
  _addToken(new ParseErrorToken(
      "expected-closing-tag-but-got-char", messageParams: {"data": ch}));
  stream.unget(ch);
  state = bogusCommentState;
  return true;
}
/// The "tag name" state: accumulates the name of the tag held in
/// [currentToken].
///
/// Whitespace moves on to the attributes, ">" emits the tag, "/" starts a
/// self-closing tag, and EOF is reported and leads back to [dataState].
/// U+0000 is reported and appended to the name as U+FFFD; every other
/// character is appended as-is. Always returns true.
bool tagNameState() {
  var data = stream.char();
  if (isWhitespace(data)) {
    state = beforeAttributeNameState;
  } else if (data == ">") {
    emitCurrentToken();
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-tag-name"));
    state = dataState;
  } else if (data == "/") {
    state = selfClosingStartTagState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentTagToken.name = '${currentTagToken.name}\uFFFD';
  } else {
    currentTagToken.name = '${currentTagToken.name}$data';
    // (Don't use charsUntil here, because tag names are
    // very short and it's faster to not do anything fancy)
  }
  return true;
}
/// Handles the character after a "<" seen in RCDATA.
///
/// "/" may start an end tag, so [temporaryBuffer] is cleared to collect its
/// name. Anything else means the "<" was plain text. Always returns true.
bool rcdataLessThanSignState() {
  var ch = stream.char();
  if (ch != "/") {
    _addToken(new CharactersToken("<"));
    stream.unget(ch);
    state = rcdataState;
    return true;
  }
  temporaryBuffer = "";
  state = rcdataEndTagOpenState;
  return true;
}
/// Handles the character after "</" seen in RCDATA.
///
/// A letter begins the candidate end-tag name in [temporaryBuffer]; anything
/// else means the "</" was plain text. Always returns true.
bool rcdataEndTagOpenState() {
  var ch = stream.char();
  if (!isLetter(ch)) {
    _addToken(new CharactersToken("</"));
    stream.unget(ch);
    state = rcdataState;
    return true;
  }
  temporaryBuffer = '$temporaryBuffer$ch';
  state = rcdataEndTagNameState;
  return true;
}
/// Whether the end-tag name collected in [temporaryBuffer] matches the name
/// of the tag held in [currentToken].
///
/// Both names are compared in lower case. Returns false when
/// [currentToken] is not a [TagToken].
bool _tokenIsAppropriate() {
  return currentToken is TagToken &&
      currentTagToken.name.toLowerCase() == temporaryBuffer.toLowerCase();
}
/// Collects a candidate end-tag name while in RCDATA.
///
/// If the buffered name matches the open tag (see [_tokenIsAppropriate]),
/// whitespace, "/" or ">" turn it into a real [EndTagToken]. Letters extend
/// the buffer. Anything else means the text was not an end tag and is
/// emitted as characters. Always returns true.
bool rcdataEndTagNameState() {
  var matchesOpenTag = _tokenIsAppropriate();
  var ch = stream.char();
  if (isWhitespace(ch) && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    state = beforeAttributeNameState;
    return true;
  }
  if (ch == "/" && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    state = selfClosingStartTagState;
    return true;
  }
  if (ch == ">" && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    emitCurrentToken();
    state = dataState;
    return true;
  }
  if (isLetter(ch)) {
    temporaryBuffer = '$temporaryBuffer$ch';
    return true;
  }
  _addToken(new CharactersToken("</$temporaryBuffer"));
  stream.unget(ch);
  state = rcdataState;
  return true;
}
/// Handles the character after a "<" seen in RAWTEXT.
///
/// "/" may start an end tag, so [temporaryBuffer] is cleared to collect its
/// name. Anything else means the "<" was plain text. Always returns true.
bool rawtextLessThanSignState() {
  var ch = stream.char();
  if (ch != "/") {
    _addToken(new CharactersToken("<"));
    stream.unget(ch);
    state = rawtextState;
    return true;
  }
  temporaryBuffer = "";
  state = rawtextEndTagOpenState;
  return true;
}
/// Handles the character after "</" seen in RAWTEXT.
///
/// A letter begins the candidate end-tag name in [temporaryBuffer]; anything
/// else means the "</" was plain text. Always returns true.
bool rawtextEndTagOpenState() {
  var ch = stream.char();
  if (!isLetter(ch)) {
    _addToken(new CharactersToken("</"));
    stream.unget(ch);
    state = rawtextState;
    return true;
  }
  temporaryBuffer = '$temporaryBuffer$ch';
  state = rawtextEndTagNameState;
  return true;
}
/// Collects a candidate end-tag name while in RAWTEXT.
///
/// If the buffered name matches the open tag (see [_tokenIsAppropriate]),
/// whitespace, "/" or ">" turn it into a real [EndTagToken]. Letters extend
/// the buffer. Anything else means the text was not an end tag and is
/// emitted as characters. Always returns true.
bool rawtextEndTagNameState() {
  var matchesOpenTag = _tokenIsAppropriate();
  var ch = stream.char();
  if (isWhitespace(ch) && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    state = beforeAttributeNameState;
    return true;
  }
  if (ch == "/" && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    state = selfClosingStartTagState;
    return true;
  }
  if (ch == ">" && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    emitCurrentToken();
    state = dataState;
    return true;
  }
  if (isLetter(ch)) {
    temporaryBuffer = '$temporaryBuffer$ch';
    return true;
  }
  _addToken(new CharactersToken("</$temporaryBuffer"));
  stream.unget(ch);
  state = rawtextState;
  return true;
}
/// Handles the character after a "<" seen in script data.
///
/// "/" may start an end tag, "!" may start an escaped section (the "<!" is
/// emitted as text), and anything else means the "<" was plain text.
/// Always returns true.
bool scriptDataLessThanSignState() {
  var ch = stream.char();
  if (ch == "/") {
    temporaryBuffer = "";
    state = scriptDataEndTagOpenState;
    return true;
  }
  if (ch == "!") {
    _addToken(new CharactersToken("<!"));
    state = scriptDataEscapeStartState;
    return true;
  }
  _addToken(new CharactersToken("<"));
  stream.unget(ch);
  state = scriptDataState;
  return true;
}
/// Handles the character after "</" seen in script data.
///
/// A letter begins the candidate end-tag name in [temporaryBuffer]; anything
/// else means the "</" was plain text. Always returns true.
bool scriptDataEndTagOpenState() {
  var ch = stream.char();
  if (!isLetter(ch)) {
    _addToken(new CharactersToken("</"));
    stream.unget(ch);
    state = scriptDataState;
    return true;
  }
  temporaryBuffer = '$temporaryBuffer$ch';
  state = scriptDataEndTagNameState;
  return true;
}
/// Collects a candidate end-tag name while in script data.
///
/// If the buffered name matches the open tag (see [_tokenIsAppropriate]),
/// whitespace, "/" or ">" turn it into a real [EndTagToken]. Letters extend
/// the buffer. Anything else means the text was not an end tag and is
/// emitted as characters. Always returns true.
bool scriptDataEndTagNameState() {
  var matchesOpenTag = _tokenIsAppropriate();
  var ch = stream.char();
  if (isWhitespace(ch) && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    state = beforeAttributeNameState;
    return true;
  }
  if (ch == "/" && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    state = selfClosingStartTagState;
    return true;
  }
  if (ch == ">" && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    emitCurrentToken();
    state = dataState;
    return true;
  }
  if (isLetter(ch)) {
    temporaryBuffer = '$temporaryBuffer$ch';
    return true;
  }
  _addToken(new CharactersToken("</$temporaryBuffer"));
  stream.unget(ch);
  state = scriptDataState;
  return true;
}
/// Handles the character after "<!" in script data.
///
/// A "-" is emitted and may begin the "<!--" escape; anything else is pushed
/// back and handled as ordinary script data. Always returns true.
bool scriptDataEscapeStartState() {
  var ch = stream.char();
  if (ch != "-") {
    stream.unget(ch);
    state = scriptDataState;
    return true;
  }
  _addToken(new CharactersToken("-"));
  state = scriptDataEscapeStartDashState;
  return true;
}
/// Handles the character after "<!-" in script data.
///
/// A second "-" is emitted and completes the "<!--" escape opener; anything
/// else is pushed back and handled as ordinary script data. Always returns
/// true.
bool scriptDataEscapeStartDashState() {
  var ch = stream.char();
  if (ch != "-") {
    stream.unget(ch);
    state = scriptDataState;
    return true;
  }
  _addToken(new CharactersToken("-"));
  state = scriptDataEscapedDashDashState;
  return true;
}
/// The "script data escaped" state (inside "<!--" within a script).
///
/// "-" and "<" lead to their follow-up states, U+0000 is reported and
/// replaced by U+FFFD, EOF switches to [dataState], and other text is
/// emitted up to the next "<", "-" or U+0000. Always returns true.
bool scriptDataEscapedState() {
  var ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    state = scriptDataEscapedDashState;
    return true;
  }
  if (ch == "<") {
    state = scriptDataEscapedLessThanSignState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    return true;
  }
  if (ch == EOF) {
    state = dataState;
    return true;
  }
  var text = stream.charsUntil("<-\u0000");
  _addToken(new CharactersToken("${ch}${text}"));
  return true;
}
/// Handles the character after a single "-" in escaped script data.
///
/// Another "-" moves to the dash-dash state, "<" to the less-than state, and
/// EOF to [dataState]. Every other character (U+0000 as U+FFFD, with a parse
/// error) is emitted and returns to [scriptDataEscapedState]. Always returns
/// true.
bool scriptDataEscapedDashState() {
  var ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    state = scriptDataEscapedDashDashState;
    return true;
  }
  if (ch == "<") {
    state = scriptDataEscapedLessThanSignState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    state = scriptDataEscapedState;
    return true;
  }
  if (ch == EOF) {
    state = dataState;
    return true;
  }
  _addToken(new CharactersToken(ch));
  state = scriptDataEscapedState;
  return true;
}
/// Handles the character after "--" in escaped script data.
///
/// Further "-" characters are emitted without leaving this state, ">" ends
/// the escape and returns to [scriptDataState], "<" moves to the less-than
/// state, and EOF to [dataState]. Every other character (U+0000 as U+FFFD,
/// with a parse error) is emitted and returns to [scriptDataEscapedState].
/// Always returns true.
bool scriptDataEscapedDashDashState() {
  var ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    return true;
  }
  if (ch == "<") {
    state = scriptDataEscapedLessThanSignState;
    return true;
  }
  if (ch == ">") {
    _addToken(new CharactersToken(">"));
    state = scriptDataState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    state = scriptDataEscapedState;
    return true;
  }
  if (ch == EOF) {
    state = dataState;
    return true;
  }
  _addToken(new CharactersToken(ch));
  state = scriptDataEscapedState;
  return true;
}
/// Handles the character after a "<" in escaped script data.
///
/// "/" may start an end tag. A letter is emitted together with the "<" and
/// begins a name in [temporaryBuffer] that may open a double-escaped
/// section. Anything else means the "<" was plain text. Always returns true.
bool scriptDataEscapedLessThanSignState() {
  var ch = stream.char();
  if (ch == "/") {
    temporaryBuffer = "";
    state = scriptDataEscapedEndTagOpenState;
    return true;
  }
  if (isLetter(ch)) {
    _addToken(new CharactersToken("<$ch"));
    temporaryBuffer = ch;
    state = scriptDataDoubleEscapeStartState;
    return true;
  }
  _addToken(new CharactersToken("<"));
  stream.unget(ch);
  state = scriptDataEscapedState;
  return true;
}
/// Handles the character after "</" in escaped script data.
///
/// A letter becomes the start of the candidate end-tag name in
/// [temporaryBuffer]; anything else means the "</" was plain text.
/// Always returns true.
bool scriptDataEscapedEndTagOpenState() {
  var ch = stream.char();
  if (!isLetter(ch)) {
    _addToken(new CharactersToken("</"));
    stream.unget(ch);
    state = scriptDataEscapedState;
    return true;
  }
  temporaryBuffer = ch;
  state = scriptDataEscapedEndTagNameState;
  return true;
}
/// Collects a candidate end-tag name while in escaped script data.
///
/// If the buffered name matches the open tag (see [_tokenIsAppropriate]),
/// whitespace, "/" or ">" turn it into a real [EndTagToken]. Letters extend
/// the buffer. Anything else means the text was not an end tag and is
/// emitted as characters. Always returns true.
bool scriptDataEscapedEndTagNameState() {
  var matchesOpenTag = _tokenIsAppropriate();
  var ch = stream.char();
  if (isWhitespace(ch) && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    state = beforeAttributeNameState;
    return true;
  }
  if (ch == "/" && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    state = selfClosingStartTagState;
    return true;
  }
  if (ch == ">" && matchesOpenTag) {
    currentToken = new EndTagToken(temporaryBuffer);
    emitCurrentToken();
    state = dataState;
    return true;
  }
  if (isLetter(ch)) {
    temporaryBuffer = '$temporaryBuffer$ch';
    return true;
  }
  _addToken(new CharactersToken("</$temporaryBuffer"));
  stream.unget(ch);
  state = scriptDataEscapedState;
  return true;
}
/// Collects the tag name after "<" in escaped script data to decide whether
/// a double-escaped section begins.
///
/// Letters are emitted and appended to [temporaryBuffer]. Whitespace, "/" or
/// ">" is emitted and ends the name: if the name is "script" (ignoring case)
/// the tokenizer enters [scriptDataDoubleEscapedState], otherwise it returns
/// to [scriptDataEscapedState]. Any other character is pushed back. Always
/// returns true.
bool scriptDataDoubleEscapeStartState() {
  var ch = stream.char();
  if (isWhitespace(ch) || ch == "/" || ch == ">") {
    _addToken(new CharactersToken(ch));
    state = temporaryBuffer.toLowerCase() == "script"
        ? scriptDataDoubleEscapedState
        : scriptDataEscapedState;
    return true;
  }
  if (isLetter(ch)) {
    _addToken(new CharactersToken(ch));
    temporaryBuffer = '$temporaryBuffer$ch';
    return true;
  }
  stream.unget(ch);
  state = scriptDataEscapedState;
  return true;
}
/// The "script data double escaped" state.
///
/// "-" and "<" are emitted and lead to their follow-up states, U+0000 is
/// reported and replaced by U+FFFD, EOF is reported and switches to
/// [dataState], and every other character is emitted as-is. Always returns
/// true.
bool scriptDataDoubleEscapedState() {
  var ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    state = scriptDataDoubleEscapedDashState;
    return true;
  }
  if (ch == "<") {
    _addToken(new CharactersToken("<"));
    state = scriptDataDoubleEscapedLessThanSignState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-script-in-script"));
    state = dataState;
    return true;
  }
  _addToken(new CharactersToken(ch));
  return true;
}
/// Handles the character after a single "-" in double-escaped script data.
///
/// Another "-" moves to the dash-dash state and "<" to the less-than state
/// (both are emitted). EOF is reported and switches to [dataState]. Every
/// other character (U+0000 as U+FFFD, with a parse error) is emitted and
/// returns to [scriptDataDoubleEscapedState]. Always returns true.
bool scriptDataDoubleEscapedDashState() {
  var ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    state = scriptDataDoubleEscapedDashDashState;
    return true;
  }
  if (ch == "<") {
    _addToken(new CharactersToken("<"));
    state = scriptDataDoubleEscapedLessThanSignState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    state = scriptDataDoubleEscapedState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-script-in-script"));
    state = dataState;
    return true;
  }
  _addToken(new CharactersToken(ch));
  state = scriptDataDoubleEscapedState;
  return true;
}
// TODO(jmesserly): report bug in original code
// (was "Dash" instead of "DashDash")
/// Handles the character after "--" in double-escaped script data.
///
/// Further "-" characters are emitted without leaving this state, "<" moves
/// to the less-than state, and ">" returns to [scriptDataState] (both are
/// emitted). EOF is reported and switches to [dataState]. Every other
/// character (U+0000 as U+FFFD, with a parse error) is emitted and returns
/// to [scriptDataDoubleEscapedState]. Always returns true.
bool scriptDataDoubleEscapedDashDashState() {
  var ch = stream.char();
  if (ch == "-") {
    _addToken(new CharactersToken("-"));
    return true;
  }
  if (ch == "<") {
    _addToken(new CharactersToken("<"));
    state = scriptDataDoubleEscapedLessThanSignState;
    return true;
  }
  if (ch == ">") {
    _addToken(new CharactersToken(">"));
    state = scriptDataState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addToken(new CharactersToken("\uFFFD"));
    state = scriptDataDoubleEscapedState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-script-in-script"));
    state = dataState;
    return true;
  }
  _addToken(new CharactersToken(ch));
  state = scriptDataDoubleEscapedState;
  return true;
}
/// Handles the character after a "<" in double-escaped script data.
///
/// "/" is emitted and [temporaryBuffer] is cleared to collect the name that
/// may end the double escape; anything else is pushed back. Always returns
/// true.
bool scriptDataDoubleEscapedLessThanSignState() {
  var ch = stream.char();
  if (ch != "/") {
    stream.unget(ch);
    state = scriptDataDoubleEscapedState;
    return true;
  }
  _addToken(new CharactersToken("/"));
  temporaryBuffer = "";
  state = scriptDataDoubleEscapeEndState;
  return true;
}
/// Collects the tag name after "</" in double-escaped script data to decide
/// whether the double escape ends.
///
/// Letters are emitted and appended to [temporaryBuffer]. Whitespace, "/" or
/// ">" is emitted and ends the name: if the name is "script" (ignoring case)
/// the tokenizer drops back to [scriptDataEscapedState], otherwise it stays
/// double-escaped. Any other character is pushed back. Always returns true.
bool scriptDataDoubleEscapeEndState() {
  var ch = stream.char();
  if (isWhitespace(ch) || ch == "/" || ch == ">") {
    _addToken(new CharactersToken(ch));
    state = temporaryBuffer.toLowerCase() == "script"
        ? scriptDataEscapedState
        : scriptDataDoubleEscapedState;
    return true;
  }
  if (isLetter(ch)) {
    _addToken(new CharactersToken(ch));
    temporaryBuffer = '$temporaryBuffer$ch';
    return true;
  }
  stream.unget(ch);
  state = scriptDataDoubleEscapedState;
  return true;
}
/// The "before attribute name" state: skips whitespace inside a tag and
/// starts the next attribute.
///
/// ">" emits the tag, "/" starts a self-closing tag, and EOF is reported and
/// leads back to [dataState]. Any other character begins a new attribute
/// name; quote characters, "=" and "<" are reported first, and U+0000 is
/// reported and replaced by U+FFFD. Always returns true.
bool beforeAttributeNameState() {
  var ch = stream.char();
  if (isWhitespace(ch)) {
    // Skip the rest of the whitespace run.
    stream.charsUntil(spaceCharacters, true);
    return true;
  }
  if (isLetter(ch)) {
    _addAttribute(ch);
    state = attributeNameState;
    return true;
  }
  if (ch == ">") {
    emitCurrentToken();
    return true;
  }
  if (ch == "/") {
    state = selfClosingStartTagState;
    return true;
  }
  if (ch == EOF) {
    _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));
    state = dataState;
    return true;
  }
  if ("'\"=<".contains(ch)) {
    _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
    _addAttribute(ch);
    state = attributeNameState;
    return true;
  }
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _addAttribute("\uFFFD");
    state = attributeNameState;
    return true;
  }
  _addAttribute(ch);
  state = attributeNameState;
  return true;
}
/// The "attribute name" state: accumulates the name of the current
/// attribute.
///
/// "=", ">", whitespace, "/" and EOF end the name; every other character is
/// appended to it (U+0000 as U+FFFD, and quote characters or "<" after a
/// parse error). When the name ends, its end position is marked, it is
/// lowercased if [lowercaseAttrName] is set, and a "duplicate-attribute"
/// error is queued if the same name was already seen on this tag. Always
/// returns true.
bool attributeNameState() {
  var ch = stream.char();
  var nameFinished = true;
  var emitAfterwards = false;
  if (ch == "=") {
    state = beforeAttributeValueState;
  } else if (isLetter(ch)) {
    // Grab the whole run of ASCII letters in one go.
    var soFar = _attributeName;
    var letters = stream.charsUntil(asciiLetters, true);
    _attributeName = '$soFar$ch$letters';
    nameFinished = false;
  } else if (ch == ">") {
    // Emitting converts the attribute list into a map, so it has to wait
    // until the duplicate check below has run.
    emitAfterwards = true;
  } else if (isWhitespace(ch)) {
    state = afterAttributeNameState;
  } else if (ch == "/") {
    state = selfClosingStartTagState;
  } else if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeName = '${_attributeName}\uFFFD';
    nameFinished = false;
  } else if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-name"));
    state = dataState;
  } else {
    if ("'\"<".contains(ch)) {
      _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
    }
    _attributeName = '$_attributeName$ch';
    nameFinished = false;
  }

  if (!nameFinished) return true;

  _markAttributeNameEnd(-1);

  // Attributes are not dropped at this stage. That happens when the
  // start tag token is emitted so values can still be safely appended
  // to attributes, but we do want to report the parse error in time.
  if (lowercaseAttrName) {
    _attributeName = asciiUpper2Lower(_attributeName);
  }
  if (_attributeNames == null) {
    _attributeNames = new Set();
  }
  var finalName = _attributeName;
  if (_attributeNames.contains(finalName)) {
    _addToken(new ParseErrorToken("duplicate-attribute"));
  }
  _attributeNames.add(finalName);

  if (emitAfterwards) {
    emitCurrentToken();
  }
  return true;
}
/// Handles the character that follows a completed attribute name.
///
/// Whitespace is skipped, `=` moves to [beforeAttributeValueState], `>`
/// emits the current token, `/` moves to [selfClosingStartTagState] and EOF
/// reports a parse error and returns to [dataState]. Any other character is
/// passed to [_addAttribute] and the tokenizer moves to [attributeNameState];
/// `'`, `"` and `<` also report a parse error, and U+0000 is replaced by
/// U+FFFD.
///
/// Returns `true` in every case.
bool afterAttributeNameState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    stream.charsUntil(spaceCharacters, true);
    return true;
  }
  if (ch == "=") {
    state = beforeAttributeValueState;
    return true;
  }
  if (ch == ">") {
    emitCurrentToken();
    return true;
  }

  if (!isLetter(ch)) {
    if (ch == "/") {
      state = selfClosingStartTagState;
      return true;
    }
    if (ch == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      _addAttribute("\uFFFD");
      state = attributeNameState;
      return true;
    }
    if (ch == EOF) {
      _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof"));
      state = dataState;
      return true;
    }
    if ("'\"<".contains(ch)) {
      _addToken(
          new ParseErrorToken("invalid-character-after-attribute-name"));
    }
  }

  _addAttribute(ch);
  state = attributeNameState;
  return true;
}
/// Handles the character read while an attribute value is expected.
///
/// Whitespace is skipped. A quote marks the value start and moves to the
/// matching quoted-value state; `&` is pushed back and handled by
/// [attributeValueUnQuotedState]. `>` and EOF report a parse error and end
/// the tag or return to [dataState]. Any other character becomes the first
/// character of an unquoted value, with U+0000 replaced by U+FFFD and `=`,
/// `<` and a backtick reported as parse errors.
///
/// Returns `true` in every case.
bool beforeAttributeValueState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    stream.charsUntil(spaceCharacters, true);
    return true;
  }
  if (ch == "\"") {
    _markAttributeValueStart(0);
    state = attributeValueDoubleQuotedState;
    return true;
  }
  if (ch == "&") {
    state = attributeValueUnQuotedState;
    // Push the "&" back before marking the start, as the original does.
    stream.unget(ch);
    _markAttributeValueStart(0);
    return true;
  }
  if (ch == "'") {
    _markAttributeValueStart(0);
    state = attributeValueSingleQuotedState;
    return true;
  }
  if (ch == ">") {
    _addToken(new ParseErrorToken(
        "expected-attribute-value-but-got-right-bracket"));
    emitCurrentToken();
    return true;
  }

  var firstChar = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    firstChar = "\uFFFD";
  } else if (ch == EOF) {
    _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof"));
    state = dataState;
    return true;
  } else if ("=<`".contains(ch)) {
    _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value"));
  }

  // The character just read is the first character of an unquoted value.
  _markAttributeValueStart(-1);
  _attributeValue = '$_attributeValue$firstChar';
  state = attributeValueUnQuotedState;
  return true;
}
/// Consumes one step of a double-quoted attribute value.
///
/// A closing `"` marks the end of the value and attribute and moves to
/// [afterAttributeValueState]; `&` is delegated to
/// [processEntityInAttribute]; U+0000 reports a parse error and appends
/// U+FFFD; EOF reports a parse error and returns to [dataState]. Any other
/// character is appended to [_attributeValue] together with the run of
/// characters that follows it.
///
/// Returns `true` in every case.
bool attributeValueDoubleQuotedState() {
  var data = stream.char();
  if (data == "\"") {
    _markAttributeValueEnd(-1);
    _markAttributeEnd(0);
    state = afterAttributeValueState;
  } else if (data == "&") {
    processEntityInAttribute('"');
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue = '${_attributeValue}\uFFFD';
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote"));
    _markAttributeValueEnd(-1);
    state = dataState;
  } else {
    // The bulk read must also stop at U+0000. Otherwise a NUL that is not
    // the first character of a run is copied through verbatim and never
    // reaches the "invalid-codepoint" branch above.
    _attributeValue = '$_attributeValue$data'
        '${stream.charsUntil("\"&\u0000")}';
  }
  return true;
}
/// Consumes one step of a single-quoted attribute value.
///
/// A closing `'` marks the end of the value and attribute and moves to
/// [afterAttributeValueState]; `&` is delegated to
/// [processEntityInAttribute]; U+0000 reports a parse error and appends
/// U+FFFD; EOF reports a parse error and returns to [dataState]. Any other
/// character is appended to [_attributeValue] together with the run of
/// characters that follows it.
///
/// Returns `true` in every case.
bool attributeValueSingleQuotedState() {
  var data = stream.char();
  if (data == "'") {
    _markAttributeValueEnd(-1);
    _markAttributeEnd(0);
    state = afterAttributeValueState;
  } else if (data == "&") {
    processEntityInAttribute("'");
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue = '${_attributeValue}\uFFFD';
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote"));
    _markAttributeValueEnd(-1);
    state = dataState;
  } else {
    // The bulk read must also stop at U+0000. Otherwise a NUL that is not
    // the first character of a run is copied through verbatim and never
    // reaches the "invalid-codepoint" branch above.
    _attributeValue = '$_attributeValue$data'
        '${stream.charsUntil("\'&\u0000")}';
  }
  return true;
}
/// Consumes one step of an unquoted attribute value.
///
/// Whitespace ends the value and moves to [beforeAttributeNameState]; `>`
/// ends the value and emits the current token; `&` is delegated to
/// [processEntityInAttribute]; EOF reports a parse error and returns to
/// [dataState]. `"`, `'`, `=`, `<` and a backtick are appended after
/// reporting a parse error, and U+0000 is replaced by U+FFFD. Any other
/// character is appended to [_attributeValue] together with the run of
/// ordinary characters that follows it.
///
/// Returns `true` in every case.
bool attributeValueUnQuotedState() {
  var data = stream.char();
  if (isWhitespace(data)) {
    _markAttributeValueEnd(-1);
    state = beforeAttributeNameState;
  } else if (data == "&") {
    processEntityInAttribute(">");
  } else if (data == ">") {
    _markAttributeValueEnd(-1);
    emitCurrentToken();
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes"));
    _markAttributeValueEnd(-1);
    state = dataState;
  } else if ('"\'=<`'.contains(data)) {
    _addToken(new ParseErrorToken(
        "unexpected-character-in-unquoted-attribute-value"));
    _attributeValue = '$_attributeValue$data';
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    _attributeValue = '${_attributeValue}\uFFFD';
  } else {
    // The bulk read stops at every character handled by a branch above,
    // including U+0000, so a NUL in the middle of a run is still reported
    // and replaced instead of being copied through verbatim.
    _attributeValue = '$_attributeValue$data'
        '${stream.charsUntil("&>\"\'=<`\u0000$spaceCharacters")}';
  }
  return true;
}
/// Handles the character that follows a quoted attribute value.
///
/// Whitespace moves to [beforeAttributeNameState], `>` emits the current
/// token and `/` moves to [selfClosingStartTagState]. Anything else is a
/// parse error: the character is pushed back and the tokenizer moves to
/// [dataState] on EOF or to [beforeAttributeNameState] otherwise.
///
/// Returns `true` in every case.
bool afterAttributeValueState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = beforeAttributeNameState;
    return true;
  }
  if (ch == ">") {
    emitCurrentToken();
    return true;
  }
  if (ch == "/") {
    state = selfClosingStartTagState;
    return true;
  }

  final atEof = ch == EOF;
  _addToken(new ParseErrorToken(atEof
      ? "unexpected-EOF-after-attribute-value"
      : "unexpected-character-after-attribute-value"));
  stream.unget(ch);
  state = atEof ? dataState : beforeAttributeNameState;
  return true;
}
/// Handles the character that follows `/` inside a tag.
///
/// `>` flags the current tag token as self-closing and emits it. Anything
/// else is a parse error: the character is pushed back and the tokenizer
/// moves to [dataState] on EOF or to [beforeAttributeNameState] otherwise.
///
/// Returns `true` in every case.
bool selfClosingStartTagState() {
  final ch = stream.char();

  if (ch == ">") {
    currentTagToken.selfClosing = true;
    emitCurrentToken();
    return true;
  }

  final atEof = ch == EOF;
  // The misspelled "soldius" is the existing error key and is kept as is.
  _addToken(new ParseErrorToken(atEof
      ? "unexpected-EOF-after-solidus-in-tag"
      : "unexpected-character-after-soldius-in-tag"));
  stream.unget(ch);
  state = atEof ? dataState : beforeAttributeNameState;
  return true;
}
/// Emits a comment token for a bogus comment.
///
/// The comment text is everything up to the next `>` (the original comment
/// notes that `charsUntil` also stops at EOF), with each U+0000 replaced by
/// U+FFFD. The terminating character is then consumed and the tokenizer
/// returns to [dataState].
///
/// Returns `true` in every case.
bool bogusCommentState() {
  final text = stream.charsUntil(">").replaceAll("\u0000", "\uFFFD");
  _addToken(new CommentToken(text));

  // Discard the character right after the comment: either ">" or EOF.
  stream.char();
  state = dataState;
  return true;
}
/// Decides what a markup declaration introduces.
///
/// * `--` starts a comment: a fresh [CommentToken] becomes [currentToken]
///   and the tokenizer moves to [commentStartState].
/// * `doctype` (any letter case) starts a doctype: a fresh [DoctypeToken]
///   becomes [currentToken] and the tokenizer moves to [doctypeState].
/// * `[CDATA[` (exact case) moves to [cdataSectionState], but only when a
///   [parser] is attached and its innermost open element is not in the
///   tree's default namespace.
///
/// Otherwise a parse error is reported, every character consumed here is
/// pushed back in reverse order, and the tokenizer moves to
/// [bogusCommentState].
///
/// Returns `true` in every case.
bool markupDeclarationOpenState() {
  // Everything read here, so it can be pushed back if nothing matches.
  final consumed = [stream.char()];
  final first = consumed[0];

  if (first == "-") {
    final second = stream.char();
    consumed.add(second);
    if (second == "-") {
      currentToken = new CommentToken("");
      state = commentStartState;
      return true;
    }
  } else if (first == 'd' || first == 'D') {
    const rest = const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE'];
    var matchedCount = 0;
    while (matchedCount < rest.length) {
      final next = stream.char();
      consumed.add(next);
      if (next == EOF || !rest[matchedCount].contains(next)) break;
      matchedCount++;
    }
    if (matchedCount == rest.length) {
      currentToken = new DoctypeToken(correct: true);
      state = doctypeState;
      return true;
    }
  } else if (first == "[" &&
      parser != null &&
      parser.tree.openElements.length > 0 &&
      parser.tree.openElements.last.namespaceUri !=
          parser.tree.defaultNamespace) {
    const rest = const ["C", "D", "A", "T", "A", "["];
    var matchedCount = 0;
    while (matchedCount < rest.length) {
      final next = stream.char();
      consumed.add(next);
      if (next != rest[matchedCount]) break;
      matchedCount++;
    }
    if (matchedCount == rest.length) {
      state = cdataSectionState;
      return true;
    }
  }

  _addToken(new ParseErrorToken("expected-dashes-or-doctype"));

  // Push back what was read, last character first.
  for (var i = consumed.length - 1; i >= 0; i--) {
    stream.unget(consumed[i]);
  }
  state = bogusCommentState;
  return true;
}
/// Handles the first character after `<!--`.
///
/// `-` moves to [commentStartDashState]; U+0000 reports a parse error and
/// appends U+FFFD to the comment; `>` and EOF report a parse error, emit the
/// comment and return to [dataState]; any other character is appended to the
/// comment and the tokenizer moves to [commentState].
///
/// Returns `true` in every case.
bool commentStartState() {
  // NOTE(review): the `currentStringToken.data` references below were
  // restored from lines whose assignment target was lost — confirm the
  // accessor name against the rest of the class.
  var data = stream.char();
  if (data == "-") {
    state = commentStartDashState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = '${currentStringToken.data}\uFFFD';
  } else if (data == ">") {
    _addToken(new ParseErrorToken("incorrect-comment"));
    _addToken(currentToken);
    state = dataState;
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
  } else {
    currentStringToken.data = '${currentStringToken.data}$data';
    state = commentState;
  }
  return true;
}
/// Handles the character after `<!---` (comment start plus one dash).
///
/// A second `-` moves to [commentEndState]; U+0000 reports a parse error and
/// appends `-` plus U+FFFD to the comment; `>` and EOF report a parse error,
/// emit the comment and return to [dataState]; any other character is
/// appended together with the pending `-` and the tokenizer moves to
/// [commentState].
///
/// Returns `true` in every case.
bool commentStartDashState() {
  // NOTE(review): the `currentStringToken.data` references below were
  // restored from lines whose assignment target was lost — confirm the
  // accessor name against the rest of the class.
  var data = stream.char();
  if (data == "-") {
    state = commentEndState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = '${currentStringToken.data}-\uFFFD';
  } else if (data == ">") {
    _addToken(new ParseErrorToken("incorrect-comment"));
    _addToken(currentToken);
    state = dataState;
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
  } else {
    currentStringToken.data = '${currentStringToken.data}-${data}';
    state = commentState;
  }
  return true;
}
/// Consumes one step of comment text.
///
/// `-` moves to [commentEndDashState]; U+0000 reports a parse error and
/// appends U+FFFD; EOF reports a parse error, emits the comment and returns
/// to [dataState]. Any other character is appended to the comment together
/// with everything up to the next `-` or U+0000.
///
/// Returns `true` in every case.
bool commentState() {
  // NOTE(review): the `currentStringToken.data` references below were
  // restored from lines whose assignment target was lost — confirm the
  // accessor name against the rest of the class.
  var data = stream.char();
  if (data == "-") {
    state = commentEndDashState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = '${currentStringToken.data}\uFFFD';
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment"));
    _addToken(currentToken);
    state = dataState;
  } else {
    currentStringToken.data = '${currentStringToken.data}$data'
        '${stream.charsUntil("-\u0000")}';
  }
  return true;
}
/// Handles the character after a single `-` inside a comment.
///
/// A second `-` moves to [commentEndState]; EOF reports a parse error, emits
/// the comment and returns to [dataState]. Otherwise the pending `-` and the
/// character (U+FFFD in place of U+0000, with a parse error) are appended to
/// the comment and the tokenizer returns to [commentState].
///
/// Returns `true` in every case.
bool commentEndDashState() {
  // NOTE(review): the `currentStringToken.data` references below were
  // restored from lines whose assignment target was lost — confirm the
  // accessor name against the rest of the class.
  var data = stream.char();
  if (data == "-") {
    state = commentEndState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = "${currentStringToken.data}-\uFFFD";
    state = commentState;
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-end-dash"));
    _addToken(currentToken);
    state = dataState;
  } else {
    currentStringToken.data = "${currentStringToken.data}-${data}";
    state = commentState;
  }
  return true;
}
/// Handles the character after `--` inside a comment.
///
/// `>` emits the comment and returns to [dataState]. `!` reports a parse
/// error and moves to [commentEndBangState]. A further `-` reports a parse
/// error and is appended to the comment without leaving this state. EOF
/// reports a parse error, emits the comment and returns to [dataState].
/// Anything else reports a parse error, appends `--` plus the character
/// (U+FFFD in place of U+0000) and returns to [commentState].
///
/// Returns `true` in every case.
bool commentEndState() {
  // NOTE(review): the `currentStringToken.data` references below were
  // restored from lines whose assignment target was lost — confirm the
  // accessor name against the rest of the class.
  var data = stream.char();
  if (data == ">") {
    _addToken(currentToken);
    state = dataState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = '${currentStringToken.data}--\uFFFD';
    state = commentState;
  } else if (data == "!") {
    _addToken(new ParseErrorToken(
        "unexpected-bang-after-double-dash-in-comment"));
    state = commentEndBangState;
  } else if (data == "-") {
    _addToken(new ParseErrorToken(
        "unexpected-dash-after-double-dash-in-comment"));
    currentStringToken.data = '${currentStringToken.data}$data';
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-double-dash"));
    _addToken(currentToken);
    state = dataState;
  } else {
    // XXX
    _addToken(new ParseErrorToken("unexpected-char-in-comment"));
    currentStringToken.data = "${currentStringToken.data}--${data}";
    state = commentState;
  }
  return true;
}
/// Handles the character after `--!` inside a comment.
///
/// `>` emits the comment and returns to [dataState]. `-` appends `--!` to
/// the comment and moves to [commentEndDashState]. EOF reports a parse
/// error, emits the comment and returns to [dataState]. Anything else
/// appends `--!` plus the character (U+FFFD in place of U+0000, with a parse
/// error) and returns to [commentState].
///
/// Returns `true` in every case.
bool commentEndBangState() {
  // NOTE(review): the `currentStringToken.data` references below were
  // restored from lines whose assignment target was lost — confirm the
  // accessor name against the rest of the class.
  var data = stream.char();
  if (data == ">") {
    _addToken(currentToken);
    state = dataState;
  } else if (data == "-") {
    currentStringToken.data = '${currentStringToken.data}--!';
    state = commentEndDashState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentStringToken.data = '${currentStringToken.data}--!\uFFFD';
    state = commentState;
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-comment-end-bang-state"));
    _addToken(currentToken);
    state = dataState;
  } else {
    currentStringToken.data = "${currentStringToken.data}--!${data}";
    state = commentState;
  }
  return true;
}
/// Handles the character right after the `doctype` keyword.
///
/// EOF reports a parse error, marks the doctype token as not correct, emits
/// it and returns to [dataState]. Otherwise the tokenizer moves to
/// [beforeDoctypeNameState]; a non-whitespace character is first reported as
/// a parse error and pushed back.
///
/// Returns `true` in every case.
bool doctypeState() {
  final ch = stream.char();

  if (!isWhitespace(ch)) {
    if (ch == EOF) {
      _addToken(new ParseErrorToken("expected-doctype-name-but-got-eof"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
      return true;
    }
    _addToken(new ParseErrorToken("need-space-after-doctype"));
    stream.unget(ch);
  }

  state = beforeDoctypeNameState;
  return true;
}
/// Handles the character read while a doctype name is expected.
///
/// Whitespace is ignored. `>` and EOF report a parse error, mark the doctype
/// token as not correct, emit it and return to [dataState]. Any other
/// character becomes the first character of the doctype name (U+FFFD in
/// place of U+0000, with a parse error) and the tokenizer moves to
/// [doctypeNameState].
///
/// Returns `true` in every case.
bool beforeDoctypeNameState() {
  // NOTE(review): the `currentDoctypeToken.name` assignments below were
  // restored from lines whose assignment target was lost — confirm the
  // field name against DoctypeToken.
  var data = stream.char();
  if (isWhitespace(data)) {
    return true;
  } else if (data == ">") {
    _addToken(new ParseErrorToken(
        "expected-doctype-name-but-got-right-bracket"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.name = "\uFFFD";
    state = doctypeNameState;
  } else if (data == EOF) {
    _addToken(new ParseErrorToken(
        "expected-doctype-name-but-got-eof"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
  } else {
    currentDoctypeToken.name = data;
    state = doctypeNameState;
  }
  return true;
}
/// Consumes one step of the doctype name.
///
/// Whitespace, `>` and EOF end the name, which is then lower-cased with
/// [asciiUpper2Lower]: whitespace moves to [afterDoctypeNameState], `>`
/// emits the token and returns to [dataState], and EOF additionally reports
/// a parse error and marks the token as not correct. Any other character is
/// appended to the name (U+FFFD in place of U+0000, with a parse error).
///
/// Returns `true` in every case.
bool doctypeNameState() {
  // NOTE(review): the `currentDoctypeToken.name` references below were
  // restored from lines whose text was partly lost — confirm the field name
  // against DoctypeToken.
  var data = stream.char();
  if (isWhitespace(data)) {
    currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
    state = afterDoctypeNameState;
  } else if (data == ">") {
    currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
    _addToken(currentToken);
    state = dataState;
  } else if (data == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    currentDoctypeToken.name = "${currentDoctypeToken.name}\uFFFD";
    state = doctypeNameState;
  } else if (data == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype-name"));
    currentDoctypeToken.correct = false;
    currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
    _addToken(currentToken);
    state = dataState;
  } else {
    currentDoctypeToken.name = '${currentDoctypeToken.name}$data';
  }
  return true;
}
/// Handles what follows the doctype name.
///
/// Whitespace is ignored, `>` emits the doctype and returns to [dataState],
/// and EOF marks the token as not correct, reports a parse error, emits the
/// token and returns to [dataState]. The keywords `public` and `system`
/// (any letter case) move to [afterDoctypePublicKeywordState] and
/// [afterDoctypeSystemKeywordState]. Anything else pushes the last
/// character read back, reports a parse error, marks the token as not
/// correct and moves to [bogusDoctypeState].
///
/// Returns `true` in every case.
bool afterDoctypeNameState() {
  var ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == EOF) {
    currentDoctypeToken.correct = false;
    stream.unget(ch);
    _addToken(new ParseErrorToken("eof-in-doctype"));
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  // Choose the keyword tail to match and the state it leads to.
  var keywordTail;
  var keywordState;
  if (ch == "p" || ch == "P") {
    keywordTail = const ["uU", "bB", "lL", "iI", "cC"];
    keywordState = afterDoctypePublicKeywordState;
  } else if (ch == "s" || ch == "S") {
    keywordTail = const ["yY", "sS", "tT", "eE", "mM"];
    keywordState = afterDoctypeSystemKeywordState;
  }

  if (keywordTail != null) {
    var sawWholeKeyword = true;
    for (var accepted in keywordTail) {
      ch = stream.char();
      if (ch == EOF || !accepted.contains(ch)) {
        sawWholeKeyword = false;
        break;
      }
    }
    if (sawWholeKeyword) {
      state = keywordState;
      return true;
    }
  }

  // Every character read before the current one was a letter, so it is
  // garbage in the bogus doctype and can be dropped. Only the latest
  // character might be ">" or EOF and has to be pushed back.
  stream.unget(ch);
  _addToken(new ParseErrorToken(
      "expected-space-or-right-bracket-in-doctype",
      messageParams: {"data": ch}));
  currentDoctypeToken.correct = false;
  state = bogusDoctypeState;
  return true;
}
/// Handles the character right after the doctype `public` keyword.
///
/// EOF reports a parse error, marks the token as not correct, emits it and
/// returns to [dataState]. Everything else leads to
/// [beforeDoctypePublicIdentifierState]: whitespace is consumed, any other
/// character is pushed back, and a quote is also reported as a parse error.
///
/// Returns `true` in every case.
bool afterDoctypePublicKeywordState() {
  final ch = stream.char();

  if (!isWhitespace(ch)) {
    if (ch == EOF) {
      _addToken(new ParseErrorToken("eof-in-doctype"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
      return true;
    }
    if (ch == "'" || ch == '"') {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
    }
    stream.unget(ch);
  }

  state = beforeDoctypePublicIdentifierState;
  return true;
}
/// Handles the character read while a public identifier is expected.
///
/// Whitespace is ignored. A quote resets the token's `publicId` to the
/// empty string and moves to the matching quoted-identifier state. Anything
/// else reports a parse error and marks the token as not correct: `>` and
/// EOF then emit the token and return to [dataState], other characters move
/// to [bogusDoctypeState].
///
/// Returns `true` in every case.
bool beforeDoctypePublicIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == "\"" || ch == "'") {
    currentDoctypeToken.publicId = "";
    state = ch == "\""
        ? doctypePublicIdentifierDoubleQuotedState
        : doctypePublicIdentifierSingleQuotedState;
    return true;
  }

  final endsDoctype = ch == ">" || ch == EOF;
  String errorCode;
  if (ch == ">") {
    errorCode = "unexpected-end-of-doctype";
  } else if (ch == EOF) {
    errorCode = "eof-in-doctype";
  } else {
    errorCode = "unexpected-char-in-doctype";
  }

  _addToken(new ParseErrorToken(errorCode));
  currentDoctypeToken.correct = false;
  if (endsDoctype) {
    _addToken(currentToken);
    state = dataState;
  } else {
    state = bogusDoctypeState;
  }
  return true;
}
/// Consumes one character of a double-quoted public identifier.
///
/// The closing `"` moves to [afterDoctypePublicIdentifierState]. `>` and
/// EOF report a parse error, mark the token as not correct, emit it and
/// return to [dataState]. Any other character is appended to `publicId`,
/// with U+0000 reported as a parse error and replaced by U+FFFD.
///
/// Returns `true` in every case.
bool doctypePublicIdentifierDoubleQuotedState() {
  final ch = stream.char();

  if (ch == '"') {
    state = afterDoctypePublicIdentifierState;
    return true;
  }

  if (ch == ">" || ch == EOF) {
    _addToken(new ParseErrorToken(
        ch == EOF ? "eof-in-doctype" : "unexpected-end-of-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  var appended = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    appended = "\uFFFD";
  }
  currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$appended';
  return true;
}
/// Consumes one character of a single-quoted public identifier.
///
/// The closing `'` moves to [afterDoctypePublicIdentifierState]. `>` and
/// EOF report a parse error, mark the token as not correct, emit it and
/// return to [dataState]. Any other character is appended to `publicId`,
/// with U+0000 reported as a parse error and replaced by U+FFFD.
///
/// Returns `true` in every case.
bool doctypePublicIdentifierSingleQuotedState() {
  final ch = stream.char();

  if (ch == "'") {
    state = afterDoctypePublicIdentifierState;
    return true;
  }

  if (ch == ">" || ch == EOF) {
    _addToken(new ParseErrorToken(
        ch == EOF ? "eof-in-doctype" : "unexpected-end-of-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  var appended = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    appended = "\uFFFD";
  }
  currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$appended';
  return true;
}
/// Handles the character that follows the public identifier.
///
/// Whitespace moves to [betweenDoctypePublicAndSystemIdentifiersState] and
/// `>` emits the token and returns to [dataState]. A quote reports a parse
/// error, resets `systemId` to the empty string and moves to the matching
/// system-identifier state. Anything else reports a parse error and marks
/// the token as not correct: EOF emits it and returns to [dataState], other
/// characters move to [bogusDoctypeState].
///
/// Returns `true` in every case.
bool afterDoctypePublicIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) {
    state = betweenDoctypePublicAndSystemIdentifiersState;
    return true;
  }
  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == '"' || ch == "'") {
    _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
    currentDoctypeToken.systemId = "";
    state = ch == '"'
        ? doctypeSystemIdentifierDoubleQuotedState
        : doctypeSystemIdentifierSingleQuotedState;
    return true;
  }

  final atEof = ch == EOF;
  _addToken(new ParseErrorToken(
      atEof ? "eof-in-doctype" : "unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  if (atEof) {
    _addToken(currentToken);
    state = dataState;
  } else {
    state = bogusDoctypeState;
  }
  return true;
}
/// Handles characters between the public and system identifiers.
///
/// Whitespace is ignored and `>` emits the token and returns to
/// [dataState]. A quote resets `systemId` to the empty string and moves to
/// the matching system-identifier state. Anything else reports a parse
/// error and marks the token as not correct: EOF emits it and returns to
/// [dataState], other characters move to [bogusDoctypeState].
///
/// Returns `true` in every case.
bool betweenDoctypePublicAndSystemIdentifiersState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }
  if (ch == '"' || ch == "'") {
    currentDoctypeToken.systemId = "";
    state = ch == '"'
        ? doctypeSystemIdentifierDoubleQuotedState
        : doctypeSystemIdentifierSingleQuotedState;
    return true;
  }

  final atEof = ch == EOF;
  _addToken(new ParseErrorToken(
      atEof ? "eof-in-doctype" : "unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  if (atEof) {
    _addToken(currentToken);
    state = dataState;
  } else {
    state = bogusDoctypeState;
  }
  return true;
}
/// Handles the character right after the doctype `system` keyword.
///
/// EOF reports a parse error, marks the token as not correct, emits it and
/// returns to [dataState]. Everything else leads to
/// [beforeDoctypeSystemIdentifierState]: whitespace is consumed, any other
/// character is pushed back, and a quote is also reported as a parse error.
///
/// Returns `true` in every case.
bool afterDoctypeSystemKeywordState() {
  final ch = stream.char();

  if (!isWhitespace(ch)) {
    if (ch == EOF) {
      _addToken(new ParseErrorToken("eof-in-doctype"));
      currentDoctypeToken.correct = false;
      _addToken(currentToken);
      state = dataState;
      return true;
    }
    if (ch == "'" || ch == '"') {
      _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
    }
    stream.unget(ch);
  }

  state = beforeDoctypeSystemIdentifierState;
  return true;
}
/// Handles the character read while a system identifier is expected.
///
/// Whitespace is ignored. A quote resets `systemId` to the empty string and
/// moves to the matching quoted-identifier state. Anything else reports a
/// parse error and marks the token as not correct: `>` and EOF then emit
/// the token and return to [dataState], other characters move to
/// [bogusDoctypeState].
///
/// Returns `true` in every case.
bool beforeDoctypeSystemIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == "\"" || ch == "'") {
    currentDoctypeToken.systemId = "";
    state = ch == "\""
        ? doctypeSystemIdentifierDoubleQuotedState
        : doctypeSystemIdentifierSingleQuotedState;
    return true;
  }

  final atEof = ch == EOF;
  // ">" shares its error code with the generic unexpected-character case.
  _addToken(new ParseErrorToken(
      atEof ? "eof-in-doctype" : "unexpected-char-in-doctype"));
  currentDoctypeToken.correct = false;
  if (atEof || ch == ">") {
    _addToken(currentToken);
    state = dataState;
  } else {
    state = bogusDoctypeState;
  }
  return true;
}
/// Consumes one character of a double-quoted system identifier.
///
/// The closing `"` moves to [afterDoctypeSystemIdentifierState]. `>` and
/// EOF report a parse error, mark the token as not correct, emit it and
/// return to [dataState]. Any other character is appended to `systemId`,
/// with U+0000 reported as a parse error and replaced by U+FFFD.
///
/// Returns `true` in every case.
bool doctypeSystemIdentifierDoubleQuotedState() {
  final ch = stream.char();

  if (ch == "\"") {
    state = afterDoctypeSystemIdentifierState;
    return true;
  }

  if (ch == ">" || ch == EOF) {
    _addToken(new ParseErrorToken(
        ch == EOF ? "eof-in-doctype" : "unexpected-end-of-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  var appended = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    appended = "\uFFFD";
  }
  currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$appended';
  return true;
}
/// Consumes one character of a single-quoted system identifier.
///
/// The closing `'` moves to [afterDoctypeSystemIdentifierState]. `>` and
/// EOF report a parse error, mark the token as not correct, emit it and
/// return to [dataState]. Any other character is appended to `systemId`,
/// with U+0000 reported as a parse error and replaced by U+FFFD.
///
/// Returns `true` in every case.
bool doctypeSystemIdentifierSingleQuotedState() {
  final ch = stream.char();

  if (ch == "'") {
    state = afterDoctypeSystemIdentifierState;
    return true;
  }

  if (ch == ">" || ch == EOF) {
    _addToken(new ParseErrorToken(
        ch == EOF ? "eof-in-doctype" : "unexpected-end-of-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  var appended = ch;
  if (ch == "\u0000") {
    _addToken(new ParseErrorToken("invalid-codepoint"));
    appended = "\uFFFD";
  }
  currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$appended';
  return true;
}
/// Handles the character that follows the system identifier.
///
/// Whitespace is ignored and `>` emits the token and returns to
/// [dataState]. EOF reports a parse error, marks the token as not correct,
/// emits it and returns to [dataState]. Any other character reports a parse
/// error and moves to [bogusDoctypeState]; in that case the token's
/// `correct` flag is left untouched.
///
/// Returns `true` in every case.
bool afterDoctypeSystemIdentifierState() {
  final ch = stream.char();

  if (isWhitespace(ch)) return true;

  if (ch == ">") {
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  if (ch == EOF) {
    _addToken(new ParseErrorToken("eof-in-doctype"));
    currentDoctypeToken.correct = false;
    _addToken(currentToken);
    state = dataState;
    return true;
  }

  _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
  state = bogusDoctypeState;
  return true;
}
/// Skips over the remainder of a malformed doctype.
///
/// `>` or EOF emits the current token and returns to [dataState]; EOF is
/// pushed back onto the stream first. Every other character is discarded
/// without changing [state].
///
/// Returns `true` in every case.
bool bogusDoctypeState() {
  final ch = stream.char();
  final atEof = ch == EOF;

  if (atEof || ch == ">") {
    if (atEof) {
      // XXX EMIT
      stream.unget(ch);
    }
    _addToken(currentToken);
    state = dataState;
  }
  return true;
}
/// Consumes a whole CDATA section and emits its text as one token.
///
/// Characters are read until the terminator `]]>` or EOF. Each U+0000 is
/// reported as a parse error and replaced by U+FFFD. The terminator itself
/// is not part of the emitted text. A [CharactersToken] is added only when
/// the collected text is non-empty, and the tokenizer then returns to
/// [dataState].
///
/// Returns `true` in every case.
bool cdataSectionState() {
  var data = [];
  // Number of consecutive "]" characters that end `data`, capped at 2.
  int matchedEnd = 0;
  while (true) {
    var ch = stream.char();
    if (ch == EOF) {
      break;
    }
    // Deal with null here rather than in the parser
    if (ch == "\u0000") {
      _addToken(new ParseErrorToken("invalid-codepoint"));
      ch = "\uFFFD";
    }
    data.add(ch);
    // TODO(jmesserly): it'd be nice if we had an easier way to match the end,
    // perhaps with a "peek" API.
    if (ch == "]") {
      // A third or later "]" in a row still leaves "]]" right before the
      // next character, so the counter stays at 2 instead of resetting.
      // Resetting it made input such as "a]]]>" miss its terminator.
      if (matchedEnd < 2) matchedEnd++;
    } else if (ch == ">" && matchedEnd == 2) {
      // Remove "]]>" from the end.
      data.removeLast();
      data.removeLast();
      data.removeLast();
      break;
    } else {
      matchedEnd = 0;
    }
  }

  if (data.length > 0) {
    _addToken(new CharactersToken(data.join()));
  }
  state = dataState;
  return true;
}
1885 }
« no previous file with comments | « pkg/third_party/html5lib/lib/src/token.dart ('k') | pkg/third_party/html5lib/lib/src/treebuilder.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698