Chromium Code Reviews

Side by Side Diff: html/lib/src/tokenizer.dart

Issue 1400473008: Roll Observatory packages and add a roll script (Closed) Base URL: git@github.com:dart-lang/observatory_pub_packages.git@master
Patch Set: Created 5 years, 2 months ago
1 library tokenizer;
2
3 import 'dart:collection';
4 import 'package:html/parser.dart' show HtmlParser;
5 import 'constants.dart';
6 import 'inputstream.dart';
7 import 'token.dart';
8 import 'utils.dart';
9
10 // Group entities by their first character, for faster lookups
11
12 // TODO(jmesserly): we could use a better data structure here like a trie, if
13 // we had it implemented in Dart.
14 Map<String, List<String>> entitiesByFirstChar = (() {
15 var result = {};
16 for (var k in entities.keys) {
17 result.putIfAbsent(k[0], () => []).add(k);
18 }
19 return result;
20 })();
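// For example (illustrative): entitiesByFirstChar["n"] holds only the entity
// names starting with "n" ("not;", "notin;", ...), so consumeEntity below can
// scan that short sublist instead of the whole entities map.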
21
22 // TODO(jmesserly): lots of ways to make this faster:
23 // - use char codes everywhere instead of 1-char strings
24 // - use switch instead of contains, indexOf
25 // - use switch instead of the sequential if tests
26 // - avoid string concat
27
28 /// This class takes care of tokenizing HTML.
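///
/// A minimal usage sketch (hypothetical example, not part of this change):
/// the tokenizer implements [Iterator], so callers drive it with [moveNext]
/// and read tokens from [current].
///
///     var tokenizer = new HtmlTokenizer('<p class="a">hi</p>',
///         generateSpans: true);
///     while (tokenizer.moveNext()) {
///       print(tokenizer.current);
///     }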
29 class HtmlTokenizer implements Iterator<Token> {
30 // TODO(jmesserly): a lot of these could be made private
31
32 final HtmlInputStream stream;
33
34 final bool lowercaseElementName;
35
36 final bool lowercaseAttrName;
37
38 /// True to generate spans for [Token.span].
39 final bool generateSpans;
40
41 /// True to generate spans for attributes.
42 final bool attributeSpans;
43
44 /// This reference to the parser is used for correct CDATA handling.
45 /// The [HtmlParser] will set this at construction time.
46 HtmlParser parser;
47
48 final Queue<Token> tokenQueue;
49
50 /// Holds the token that is currently being processed.
51 Token currentToken;
52
53 /// Holds a reference to the method to be invoked for the next parser state.
54 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode
55 // bug prevents us from doing that. See http://dartbug.com/12465
56 Function state;
57
58 final StringBuffer _buffer = new StringBuffer();
59
60 int _lastOffset;
61
62 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add
63 // an item until it's ready. But the code doesn't have a clear notion of when
64 // it's "done" with the attribute.
65 List<TagAttribute> _attributes;
66 Set<String> _attributeNames;
67
68 HtmlTokenizer(doc, {String encoding, bool parseMeta: true,
69 this.lowercaseElementName: true, this.lowercaseAttrName: true,
70 bool generateSpans: false, String sourceUrl, this.attributeSpans: false})
71 : stream = new HtmlInputStream(
72 doc, encoding, parseMeta, generateSpans, sourceUrl),
73 tokenQueue = new Queue(),
74 generateSpans = generateSpans {
75 reset();
76 }
77
78 TagToken get currentTagToken => currentToken;
79 DoctypeToken get currentDoctypeToken => currentToken;
80 StringToken get currentStringToken => currentToken;
81
82 Token _current;
83 Token get current => _current;
84
85 final StringBuffer _attributeName = new StringBuffer();
86 final StringBuffer _attributeValue = new StringBuffer();
87
88 void _markAttributeEnd(int offset) {
89 _attributes.last.value = '$_attributeValue';
90 if (attributeSpans) _attributes.last.end = stream.position + offset;
91 }
92
93 void _markAttributeValueStart(int offset) {
94 if (attributeSpans) _attributes.last.startValue = stream.position + offset;
95 }
96
97 void _markAttributeValueEnd(int offset) {
98 if (attributeSpans) _attributes.last.endValue = stream.position + offset;
99 _markAttributeEnd(offset);
100 }
101
102 // Note: we could track the name span here, if we need it.
103 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset);
104
105 void _addAttribute(String name) {
106 if (_attributes == null) _attributes = [];
107 _attributeName.clear();
108 _attributeName.write(name);
109 _attributeValue.clear();
110 var attr = new TagAttribute();
111 _attributes.add(attr);
112 if (attributeSpans) attr.start = stream.position - name.length;
113 }
114
115 /// This is where the magic happens.
116 ///
117 /// We do our usual processing through the states, and when we have a token
118 /// to return we yield it, which pauses processing until the next token
119 /// is requested.
120 bool moveNext() {
121 // Start processing. When EOF is reached, state will return false
122 // instead of true and the loop will terminate.
123 while (stream.errors.length == 0 && tokenQueue.length == 0) {
124 if (!state()) {
125 _current = null;
126 return false;
127 }
128 }
129 if (stream.errors.length > 0) {
130 _current = new ParseErrorToken(stream.errors.removeFirst());
131 } else {
132 assert(tokenQueue.length > 0);
133 _current = tokenQueue.removeFirst();
134 }
135 return true;
136 }
137
138 /// Resets the tokenizer state. Calling this does not reset the [stream] or
139 /// the [parser].
140 void reset() {
141 _lastOffset = 0;
142 tokenQueue.clear();
143 currentToken = null;
144 _buffer.clear();
145 _attributes = null;
146 _attributeNames = null;
147 state = dataState;
148 }
149
150 /// Adds a token to the queue. Sets the span if needed.
151 void _addToken(Token token) {
152 if (generateSpans && token.span == null) {
153 int offset = stream.position;
154 token.span = stream.fileInfo.span(_lastOffset, offset);
155 if (token is! ParseErrorToken) {
156 _lastOffset = offset;
157 }
158 }
159 tokenQueue.add(token);
160 }
161
162 /// This function returns either U+FFFD or the character based on the
163 /// decimal or hexadecimal representation. It also discards ";" if present;
164 /// if it is not present, a [ParseErrorToken] is added.
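///
/// A worked sketch (illustrative; assumes the spec's replacement table from
/// constants.dart): for "&#x41;" the digits "41" are parsed with radix 16 and
/// "A" is returned, while for "&#128;" the code point 0x80 is remapped (to
/// U+20AC) and an "illegal-codepoint-for-numeric-entity" error is queued.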
165 String consumeNumberEntity(bool isHex) {
166 var allowed = isDigit;
167 var radix = 10;
168 if (isHex) {
169 allowed = isHexDigit;
170 radix = 16;
171 }
172
173 var charStack = [];
174
175 // Consume all the characters that are in range while making sure we
176 // don't hit an EOF.
177 var c = stream.char();
178 while (allowed(c) && c != EOF) {
179 charStack.add(c);
180 c = stream.char();
181 }
182
183 // Convert the set of characters consumed to an int.
184 var charAsInt = parseIntRadix(charStack.join(), radix);
185
186 // Certain characters get replaced with others
187 var char = replacementCharacters[charAsInt];
188 if (char != null) {
189 _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity",
190 messageParams: {"charAsInt": charAsInt}));
191 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF) ||
192 (charAsInt > 0x10FFFF)) {
193 char = "\uFFFD";
194 _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity",
195 messageParams: {"charAsInt": charAsInt}));
196 } else {
197 // Should speed up this check somehow (e.g. move the set to a constant)
198 if ((0x0001 <= charAsInt && charAsInt <= 0x0008) ||
199 (0x000E <= charAsInt && charAsInt <= 0x001F) ||
200 (0x007F <= charAsInt && charAsInt <= 0x009F) ||
201 (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) ||
202 const [
203 0x000B,
204 0xFFFE,
205 0xFFFF,
206 0x1FFFE,
207 0x1FFFF,
208 0x2FFFE,
209 0x2FFFF,
210 0x3FFFE,
211 0x3FFFF,
212 0x4FFFE,
213 0x4FFFF,
214 0x5FFFE,
215 0x5FFFF,
216 0x6FFFE,
217 0x6FFFF,
218 0x7FFFE,
219 0x7FFFF,
220 0x8FFFE,
221 0x8FFFF,
222 0x9FFFE,
223 0x9FFFF,
224 0xAFFFE,
225 0xAFFFF,
226 0xBFFFE,
227 0xBFFFF,
228 0xCFFFE,
229 0xCFFFF,
230 0xDFFFE,
231 0xDFFFF,
232 0xEFFFE,
233 0xEFFFF,
234 0xFFFFE,
235 0xFFFFF,
236 0x10FFFE,
237 0x10FFFF
238 ].contains(charAsInt)) {
239 _addToken(new ParseErrorToken("illegal-codepoint-for-numeric-entity",
240 messageParams: {"charAsInt": charAsInt}));
241 }
242 char = new String.fromCharCodes([charAsInt]);
243 }
244
245 // Discard the ";" if present. Otherwise, emit a parse error and put the
246 // character back on the stream.
247 if (c != ";") {
248 _addToken(new ParseErrorToken("numeric-entity-without-semicolon"));
249 stream.unget(c);
250 }
251 return char;
252 }
253
254 void consumeEntity({String allowedChar, bool fromAttribute: false}) {
255 // Initialise to the default output for when no entity is matched
256 var output = "&";
257
258 var charStack = [stream.char()];
259 if (isWhitespace(charStack[0]) ||
260 charStack[0] == '<' ||
261 charStack[0] == '&' ||
262 charStack[0] == EOF ||
263 allowedChar == charStack[0]) {
264 stream.unget(charStack[0]);
265 } else if (charStack[0] == "#") {
266 // Read the next character to see if it's hex or decimal
267 bool hex = false;
268 charStack.add(stream.char());
269 if (charStack.last == 'x' || charStack.last == 'X') {
270 hex = true;
271 charStack.add(stream.char());
272 }
273
274 // charStack.last should be the first digit
275 if (hex && isHexDigit(charStack.last) ||
276 (!hex && isDigit(charStack.last))) {
277 // At least one digit found, so consume the whole number
278 stream.unget(charStack.last);
279 output = consumeNumberEntity(hex);
280 } else {
281 // No digits found
282 _addToken(new ParseErrorToken("expected-numeric-entity"));
283 stream.unget(charStack.removeLast());
284 output = "&${charStack.join()}";
285 }
286 } else {
287 // At this point in the process we might have a named entity. Entities
288 // are stored in the global variable "entities".
289 //
290 // Consume characters and compare them to a substring of the entity
291 // names in the list until the substring no longer matches.
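// For instance (illustrative): for the input "&notin;" this loop narrows the
// candidates from every entity name starting with "n" down to the single
// match "notin;".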
292 var filteredEntityList = entitiesByFirstChar[charStack[0]];
293 if (filteredEntityList == null) filteredEntityList = const [];
294
295 while (charStack.last != EOF) {
296 var name = charStack.join();
297 filteredEntityList =
298 filteredEntityList.where((e) => e.startsWith(name)).toList();
299
300 if (filteredEntityList.length == 0) {
301 break;
302 }
303 charStack.add(stream.char());
304 }
305
306 // At this point we have a string that starts with some characters
307 // that may match an entity
308 String entityName = null;
309
310 // Try to find the longest entity the string will match to take care
311 // of &noti for instance.
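// (Illustrative: for "&noti" this backs off to the legacy entity "not"; the
// trailing "i" is appended back onto the output below.)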
312
313 int entityLen;
314 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) {
315 var possibleEntityName = charStack.sublist(0, entityLen).join();
316 if (entities.containsKey(possibleEntityName)) {
317 entityName = possibleEntityName;
318 break;
319 }
320 }
321
322 if (entityName != null) {
323 var lastChar = entityName[entityName.length - 1];
324 if (lastChar != ";") {
325 _addToken(new ParseErrorToken("named-entity-without-semicolon"));
326 }
327 if (lastChar != ";" &&
328 fromAttribute &&
329 (isLetterOrDigit(charStack[entityLen]) ||
330 charStack[entityLen] == '=')) {
331 stream.unget(charStack.removeLast());
332 output = "&${charStack.join()}";
333 } else {
334 output = entities[entityName];
335 stream.unget(charStack.removeLast());
336 output = '${output}${slice(charStack, entityLen).join()}';
337 }
338 } else {
339 _addToken(new ParseErrorToken("expected-named-entity"));
340 stream.unget(charStack.removeLast());
341 output = "&${charStack.join()}";
342 }
343 }
344 if (fromAttribute) {
345 _attributeValue.write(output);
346 } else {
347 var token;
348 if (isWhitespace(output)) {
349 token = new SpaceCharactersToken(output);
350 } else {
351 token = new CharactersToken(output);
352 }
353 _addToken(token);
354 }
355 }
356
357 /// This method replaces the need for "entityInAttributeValueState".
358 void processEntityInAttribute(String allowedChar) {
359 consumeEntity(allowedChar: allowedChar, fromAttribute: true);
360 }
361
362 /// This method is a generic handler for emitting the tags. It also sets
363 /// the state to "data" because that's what's needed after a token has been
364 /// emitted.
365 void emitCurrentToken() {
366 var token = currentToken;
367 // Add token to the queue to be yielded
368 if (token is TagToken) {
369 if (lowercaseElementName) {
370 token.name = asciiUpper2Lower(token.name);
371 }
372 if (token is EndTagToken) {
373 if (_attributes != null) {
374 _addToken(new ParseErrorToken("attributes-in-end-tag"));
375 }
376 if (token.selfClosing) {
377 _addToken(new ParseErrorToken("this-closing-flag-on-end-tag"));
378 }
379 } else if (token is StartTagToken) {
380 // HTML5 specific normalizations to the token stream.
381 // Convert the list into a map where first key wins.
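// (e.g. for a hypothetical <a href="x" href="y">, href keeps "x"; the
// duplicate was already reported as "duplicate-attribute" in
// attributeNameState.)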
382 token.data = new LinkedHashMap<Object, String>();
383 if (_attributes != null) {
384 for (var attr in _attributes) {
385 token.data.putIfAbsent(attr.name, () => attr.value);
386 }
387 if (attributeSpans) token.attributeSpans = _attributes;
388 }
389 }
390 _attributes = null;
391 _attributeNames = null;
392 }
393 _addToken(token);
394 state = dataState;
395 }
396
397 // Below are the individual tokenizer states, one method per state.
398
399 bool dataState() {
400 var data = stream.char();
401 if (data == "&") {
402 state = entityDataState;
403 } else if (data == "<") {
404 state = tagOpenState;
405 } else if (data == "\u0000") {
406 _addToken(new ParseErrorToken("invalid-codepoint"));
407 _addToken(new CharactersToken("\u0000"));
408 } else if (data == EOF) {
409 // Tokenization ends.
410 return false;
411 } else if (isWhitespace(data)) {
412 // Directly after emitting a token you switch back to the "data
413 // state". At that point spaceCharacters are important so they are
414 // emitted separately.
415 _addToken(new SpaceCharactersToken(
416 '${data}${stream.charsUntil(spaceCharacters, true)}'));
417 // No need to update lastFourChars here, since the first space will
418 // have already been appended to lastFourChars and will have broken
419 // any <!-- or --> sequences
420 } else {
421 var chars = stream.charsUntil("&<\u0000");
422 _addToken(new CharactersToken('${data}${chars}'));
423 }
424 return true;
425 }
426
427 bool entityDataState() {
428 consumeEntity();
429 state = dataState;
430 return true;
431 }
432
433 bool rcdataState() {
434 var data = stream.char();
435 if (data == "&") {
436 state = characterReferenceInRcdata;
437 } else if (data == "<") {
438 state = rcdataLessThanSignState;
439 } else if (data == EOF) {
440 // Tokenization ends.
441 return false;
442 } else if (data == "\u0000") {
443 _addToken(new ParseErrorToken("invalid-codepoint"));
444 _addToken(new CharactersToken("\uFFFD"));
445 } else if (isWhitespace(data)) {
446 // Directly after emitting a token you switch back to the "data
447 // state". At that point spaceCharacters are important so they are
448 // emitted separately.
449 _addToken(new SpaceCharactersToken(
450 '${data}${stream.charsUntil(spaceCharacters, true)}'));
451 } else {
452 var chars = stream.charsUntil("&<");
453 _addToken(new CharactersToken('${data}${chars}'));
454 }
455 return true;
456 }
457
458 bool characterReferenceInRcdata() {
459 consumeEntity();
460 state = rcdataState;
461 return true;
462 }
463
464 bool rawtextState() {
465 var data = stream.char();
466 if (data == "<") {
467 state = rawtextLessThanSignState;
468 } else if (data == "\u0000") {
469 _addToken(new ParseErrorToken("invalid-codepoint"));
470 _addToken(new CharactersToken("\uFFFD"));
471 } else if (data == EOF) {
472 // Tokenization ends.
473 return false;
474 } else {
475 var chars = stream.charsUntil("<\u0000");
476 _addToken(new CharactersToken("${data}${chars}"));
477 }
478 return true;
479 }
480
481 bool scriptDataState() {
482 var data = stream.char();
483 if (data == "<") {
484 state = scriptDataLessThanSignState;
485 } else if (data == "\u0000") {
486 _addToken(new ParseErrorToken("invalid-codepoint"));
487 _addToken(new CharactersToken("\uFFFD"));
488 } else if (data == EOF) {
489 // Tokenization ends.
490 return false;
491 } else {
492 var chars = stream.charsUntil("<\u0000");
493 _addToken(new CharactersToken("${data}${chars}"));
494 }
495 return true;
496 }
497
498 bool plaintextState() {
499 var data = stream.char();
500 if (data == EOF) {
501 // Tokenization ends.
502 return false;
503 } else if (data == "\u0000") {
504 _addToken(new ParseErrorToken("invalid-codepoint"));
505 _addToken(new CharactersToken("\uFFFD"));
506 } else {
507 _addToken(new CharactersToken('${data}${stream.charsUntil("\u0000")}'));
508 }
509 return true;
510 }
511
512 bool tagOpenState() {
513 var data = stream.char();
514 if (data == "!") {
515 state = markupDeclarationOpenState;
516 } else if (data == "/") {
517 state = closeTagOpenState;
518 } else if (isLetter(data)) {
519 currentToken = new StartTagToken(data);
520 state = tagNameState;
521 } else if (data == ">") {
522 // XXX In theory it could be something besides a tag name. But
523 // do we really care?
524 _addToken(new ParseErrorToken("expected-tag-name-but-got-right-bracket"));
525 _addToken(new CharactersToken("<>"));
526 state = dataState;
527 } else if (data == "?") {
528 // XXX In theory it could be something besides a tag name. But
529 // do we really care?
530 _addToken(new ParseErrorToken("expected-tag-name-but-got-question-mark"));
531 stream.unget(data);
532 state = bogusCommentState;
533 } else {
534 // XXX
535 _addToken(new ParseErrorToken("expected-tag-name"));
536 _addToken(new CharactersToken("<"));
537 stream.unget(data);
538 state = dataState;
539 }
540 return true;
541 }
542
543 bool closeTagOpenState() {
544 var data = stream.char();
545 if (isLetter(data)) {
546 currentToken = new EndTagToken(data);
547 state = tagNameState;
548 } else if (data == ">") {
549 _addToken(
550 new ParseErrorToken("expected-closing-tag-but-got-right-bracket"));
551 state = dataState;
552 } else if (data == EOF) {
553 _addToken(new ParseErrorToken("expected-closing-tag-but-got-eof"));
554 _addToken(new CharactersToken("</"));
555 state = dataState;
556 } else {
557 // XXX data can be _'_...
558 _addToken(new ParseErrorToken("expected-closing-tag-but-got-char",
559 messageParams: {"data": data}));
560 stream.unget(data);
561 state = bogusCommentState;
562 }
563 return true;
564 }
565
566 bool tagNameState() {
567 var data = stream.char();
568 if (isWhitespace(data)) {
569 state = beforeAttributeNameState;
570 } else if (data == ">") {
571 emitCurrentToken();
572 } else if (data == EOF) {
573 _addToken(new ParseErrorToken("eof-in-tag-name"));
574 state = dataState;
575 } else if (data == "/") {
576 state = selfClosingStartTagState;
577 } else if (data == "\u0000") {
578 _addToken(new ParseErrorToken("invalid-codepoint"));
579 currentTagToken.name = '${currentTagToken.name}\uFFFD';
580 } else {
581 currentTagToken.name = '${currentTagToken.name}$data';
582 // (Don't use charsUntil here, because tag names are
583 // very short and it's faster to not do anything fancy)
584 }
585 return true;
586 }
587
588 bool rcdataLessThanSignState() {
589 var data = stream.char();
590 if (data == "/") {
591 _buffer.clear();
592 state = rcdataEndTagOpenState;
593 } else {
594 _addToken(new CharactersToken("<"));
595 stream.unget(data);
596 state = rcdataState;
597 }
598 return true;
599 }
600
601 bool rcdataEndTagOpenState() {
602 var data = stream.char();
603 if (isLetter(data)) {
604 _buffer.write(data);
605 state = rcdataEndTagNameState;
606 } else {
607 _addToken(new CharactersToken("</"));
608 stream.unget(data);
609 state = rcdataState;
610 }
611 return true;
612 }
613
614 bool _tokenIsAppropriate() {
615 // TODO(jmesserly): this should use case insensitive compare instead.
616 return currentToken is TagToken &&
617 currentTagToken.name.toLowerCase() == '$_buffer'.toLowerCase();
618 }
619
620 bool rcdataEndTagNameState() {
621 var appropriate = _tokenIsAppropriate();
622 var data = stream.char();
623 if (isWhitespace(data) && appropriate) {
624 currentToken = new EndTagToken('$_buffer');
625 state = beforeAttributeNameState;
626 } else if (data == "/" && appropriate) {
627 currentToken = new EndTagToken('$_buffer');
628 state = selfClosingStartTagState;
629 } else if (data == ">" && appropriate) {
630 currentToken = new EndTagToken('$_buffer');
631 emitCurrentToken();
632 state = dataState;
633 } else if (isLetter(data)) {
634 _buffer.write(data);
635 } else {
636 _addToken(new CharactersToken("</$_buffer"));
637 stream.unget(data);
638 state = rcdataState;
639 }
640 return true;
641 }
642
643 bool rawtextLessThanSignState() {
644 var data = stream.char();
645 if (data == "/") {
646 _buffer.clear();
647 state = rawtextEndTagOpenState;
648 } else {
649 _addToken(new CharactersToken("<"));
650 stream.unget(data);
651 state = rawtextState;
652 }
653 return true;
654 }
655
656 bool rawtextEndTagOpenState() {
657 var data = stream.char();
658 if (isLetter(data)) {
659 _buffer.write(data);
660 state = rawtextEndTagNameState;
661 } else {
662 _addToken(new CharactersToken("</"));
663 stream.unget(data);
664 state = rawtextState;
665 }
666 return true;
667 }
668
669 bool rawtextEndTagNameState() {
670 var appropriate = _tokenIsAppropriate();
671 var data = stream.char();
672 if (isWhitespace(data) && appropriate) {
673 currentToken = new EndTagToken('$_buffer');
674 state = beforeAttributeNameState;
675 } else if (data == "/" && appropriate) {
676 currentToken = new EndTagToken('$_buffer');
677 state = selfClosingStartTagState;
678 } else if (data == ">" && appropriate) {
679 currentToken = new EndTagToken('$_buffer');
680 emitCurrentToken();
681 state = dataState;
682 } else if (isLetter(data)) {
683 _buffer.write(data);
684 } else {
685 _addToken(new CharactersToken("</$_buffer"));
686 stream.unget(data);
687 state = rawtextState;
688 }
689 return true;
690 }
691
692 bool scriptDataLessThanSignState() {
693 var data = stream.char();
694 if (data == "/") {
695 _buffer.clear();
696 state = scriptDataEndTagOpenState;
697 } else if (data == "!") {
698 _addToken(new CharactersToken("<!"));
699 state = scriptDataEscapeStartState;
700 } else {
701 _addToken(new CharactersToken("<"));
702 stream.unget(data);
703 state = scriptDataState;
704 }
705 return true;
706 }
707
708 bool scriptDataEndTagOpenState() {
709 var data = stream.char();
710 if (isLetter(data)) {
711 _buffer.write(data);
712 state = scriptDataEndTagNameState;
713 } else {
714 _addToken(new CharactersToken("</"));
715 stream.unget(data);
716 state = scriptDataState;
717 }
718 return true;
719 }
720
721 bool scriptDataEndTagNameState() {
722 var appropriate = _tokenIsAppropriate();
723 var data = stream.char();
724 if (isWhitespace(data) && appropriate) {
725 currentToken = new EndTagToken('$_buffer');
726 state = beforeAttributeNameState;
727 } else if (data == "/" && appropriate) {
728 currentToken = new EndTagToken('$_buffer');
729 state = selfClosingStartTagState;
730 } else if (data == ">" && appropriate) {
731 currentToken = new EndTagToken('$_buffer');
732 emitCurrentToken();
733 state = dataState;
734 } else if (isLetter(data)) {
735 _buffer.write(data);
736 } else {
737 _addToken(new CharactersToken("</$_buffer"));
738 stream.unget(data);
739 state = scriptDataState;
740 }
741 return true;
742 }
743
744 bool scriptDataEscapeStartState() {
745 var data = stream.char();
746 if (data == "-") {
747 _addToken(new CharactersToken("-"));
748 state = scriptDataEscapeStartDashState;
749 } else {
750 stream.unget(data);
751 state = scriptDataState;
752 }
753 return true;
754 }
755
756 bool scriptDataEscapeStartDashState() {
757 var data = stream.char();
758 if (data == "-") {
759 _addToken(new CharactersToken("-"));
760 state = scriptDataEscapedDashDashState;
761 } else {
762 stream.unget(data);
763 state = scriptDataState;
764 }
765 return true;
766 }
767
768 bool scriptDataEscapedState() {
769 var data = stream.char();
770 if (data == "-") {
771 _addToken(new CharactersToken("-"));
772 state = scriptDataEscapedDashState;
773 } else if (data == "<") {
774 state = scriptDataEscapedLessThanSignState;
775 } else if (data == "\u0000") {
776 _addToken(new ParseErrorToken("invalid-codepoint"));
777 _addToken(new CharactersToken("\uFFFD"));
778 } else if (data == EOF) {
779 state = dataState;
780 } else {
781 var chars = stream.charsUntil("<-\u0000");
782 _addToken(new CharactersToken("${data}${chars}"));
783 }
784 return true;
785 }
786
787 bool scriptDataEscapedDashState() {
788 var data = stream.char();
789 if (data == "-") {
790 _addToken(new CharactersToken("-"));
791 state = scriptDataEscapedDashDashState;
792 } else if (data == "<") {
793 state = scriptDataEscapedLessThanSignState;
794 } else if (data == "\u0000") {
795 _addToken(new ParseErrorToken("invalid-codepoint"));
796 _addToken(new CharactersToken("\uFFFD"));
797 state = scriptDataEscapedState;
798 } else if (data == EOF) {
799 state = dataState;
800 } else {
801 _addToken(new CharactersToken(data));
802 state = scriptDataEscapedState;
803 }
804 return true;
805 }
806
807 bool scriptDataEscapedDashDashState() {
808 var data = stream.char();
809 if (data == "-") {
810 _addToken(new CharactersToken("-"));
811 } else if (data == "<") {
812 state = scriptDataEscapedLessThanSignState;
813 } else if (data == ">") {
814 _addToken(new CharactersToken(">"));
815 state = scriptDataState;
816 } else if (data == "\u0000") {
817 _addToken(new ParseErrorToken("invalid-codepoint"));
818 _addToken(new CharactersToken("\uFFFD"));
819 state = scriptDataEscapedState;
820 } else if (data == EOF) {
821 state = dataState;
822 } else {
823 _addToken(new CharactersToken(data));
824 state = scriptDataEscapedState;
825 }
826 return true;
827 }
828
829 bool scriptDataEscapedLessThanSignState() {
830 var data = stream.char();
831 if (data == "/") {
832 _buffer.clear();
833 state = scriptDataEscapedEndTagOpenState;
834 } else if (isLetter(data)) {
835 _addToken(new CharactersToken("<$data"));
836 _buffer.clear();
837 _buffer.write(data);
838 state = scriptDataDoubleEscapeStartState;
839 } else {
840 _addToken(new CharactersToken("<"));
841 stream.unget(data);
842 state = scriptDataEscapedState;
843 }
844 return true;
845 }
846
847 bool scriptDataEscapedEndTagOpenState() {
848 var data = stream.char();
849 if (isLetter(data)) {
850 _buffer.clear();
851 _buffer.write(data);
852 state = scriptDataEscapedEndTagNameState;
853 } else {
854 _addToken(new CharactersToken("</"));
855 stream.unget(data);
856 state = scriptDataEscapedState;
857 }
858 return true;
859 }
860
861 bool scriptDataEscapedEndTagNameState() {
862 var appropriate = _tokenIsAppropriate();
863 var data = stream.char();
864 if (isWhitespace(data) && appropriate) {
865 currentToken = new EndTagToken('$_buffer');
866 state = beforeAttributeNameState;
867 } else if (data == "/" && appropriate) {
868 currentToken = new EndTagToken('$_buffer');
869 state = selfClosingStartTagState;
870 } else if (data == ">" && appropriate) {
871 currentToken = new EndTagToken('$_buffer');
872 emitCurrentToken();
873 state = dataState;
874 } else if (isLetter(data)) {
875 _buffer.write(data);
876 } else {
877 _addToken(new CharactersToken("</$_buffer"));
878 stream.unget(data);
879 state = scriptDataEscapedState;
880 }
881 return true;
882 }
883
884 bool scriptDataDoubleEscapeStartState() {
885 var data = stream.char();
886 if (isWhitespace(data) || data == "/" || data == ">") {
887 _addToken(new CharactersToken(data));
888 if ('$_buffer'.toLowerCase() == "script") {
889 state = scriptDataDoubleEscapedState;
890 } else {
891 state = scriptDataEscapedState;
892 }
893 } else if (isLetter(data)) {
894 _addToken(new CharactersToken(data));
895 _buffer.write(data);
896 } else {
897 stream.unget(data);
898 state = scriptDataEscapedState;
899 }
900 return true;
901 }
902
903 bool scriptDataDoubleEscapedState() {
904 var data = stream.char();
905 if (data == "-") {
906 _addToken(new CharactersToken("-"));
907 state = scriptDataDoubleEscapedDashState;
908 } else if (data == "<") {
909 _addToken(new CharactersToken("<"));
910 state = scriptDataDoubleEscapedLessThanSignState;
911 } else if (data == "\u0000") {
912 _addToken(new ParseErrorToken("invalid-codepoint"));
913 _addToken(new CharactersToken("\uFFFD"));
914 } else if (data == EOF) {
915 _addToken(new ParseErrorToken("eof-in-script-in-script"));
916 state = dataState;
917 } else {
918 _addToken(new CharactersToken(data));
919 }
920 return true;
921 }
922
923 bool scriptDataDoubleEscapedDashState() {
924 var data = stream.char();
925 if (data == "-") {
926 _addToken(new CharactersToken("-"));
927 state = scriptDataDoubleEscapedDashDashState;
928 } else if (data == "<") {
929 _addToken(new CharactersToken("<"));
930 state = scriptDataDoubleEscapedLessThanSignState;
931 } else if (data == "\u0000") {
932 _addToken(new ParseErrorToken("invalid-codepoint"));
933 _addToken(new CharactersToken("\uFFFD"));
934 state = scriptDataDoubleEscapedState;
935 } else if (data == EOF) {
936 _addToken(new ParseErrorToken("eof-in-script-in-script"));
937 state = dataState;
938 } else {
939 _addToken(new CharactersToken(data));
940 state = scriptDataDoubleEscapedState;
941 }
942 return true;
943 }
944
945 // TODO(jmesserly): report bug in original code
946 // (was "Dash" instead of "DashDash")
947 bool scriptDataDoubleEscapedDashDashState() {
948 var data = stream.char();
949 if (data == "-") {
950 _addToken(new CharactersToken("-"));
951 } else if (data == "<") {
952 _addToken(new CharactersToken("<"));
953 state = scriptDataDoubleEscapedLessThanSignState;
954 } else if (data == ">") {
955 _addToken(new CharactersToken(">"));
956 state = scriptDataState;
957 } else if (data == "\u0000") {
958 _addToken(new ParseErrorToken("invalid-codepoint"));
959 _addToken(new CharactersToken("\uFFFD"));
960 state = scriptDataDoubleEscapedState;
961 } else if (data == EOF) {
962 _addToken(new ParseErrorToken("eof-in-script-in-script"));
963 state = dataState;
964 } else {
965 _addToken(new CharactersToken(data));
966 state = scriptDataDoubleEscapedState;
967 }
968 return true;
969 }
970
971 bool scriptDataDoubleEscapedLessThanSignState() {
972 var data = stream.char();
973 if (data == "/") {
974 _addToken(new CharactersToken("/"));
975 _buffer.clear();
976 state = scriptDataDoubleEscapeEndState;
977 } else {
978 stream.unget(data);
979 state = scriptDataDoubleEscapedState;
980 }
981 return true;
982 }
983
984 bool scriptDataDoubleEscapeEndState() {
985 var data = stream.char();
986 if (isWhitespace(data) || data == "/" || data == ">") {
987 _addToken(new CharactersToken(data));
988 if ('$_buffer'.toLowerCase() == "script") {
989 state = scriptDataEscapedState;
990 } else {
991 state = scriptDataDoubleEscapedState;
992 }
993 } else if (isLetter(data)) {
994 _addToken(new CharactersToken(data));
995 _buffer.write(data);
996 } else {
997 stream.unget(data);
998 state = scriptDataDoubleEscapedState;
999 }
1000 return true;
1001 }
1002
1003 bool beforeAttributeNameState() {
1004 var data = stream.char();
1005 if (isWhitespace(data)) {
1006 stream.charsUntil(spaceCharacters, true);
1007 } else if (isLetter(data)) {
1008 _addAttribute(data);
1009 state = attributeNameState;
1010 } else if (data == ">") {
1011 emitCurrentToken();
1012 } else if (data == "/") {
1013 state = selfClosingStartTagState;
1014 } else if (data == EOF) {
1015 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));
1016 state = dataState;
1017 } else if ("'\"=<".contains(data)) {
1018 _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
1019 _addAttribute(data);
1020 state = attributeNameState;
1021 } else if (data == "\u0000") {
1022 _addToken(new ParseErrorToken("invalid-codepoint"));
1023 _addAttribute("\uFFFD");
1024 state = attributeNameState;
1025 } else {
1026 _addAttribute(data);
1027 state = attributeNameState;
1028 }
1029 return true;
1030 }
1031
1032 bool attributeNameState() {
1033 var data = stream.char();
1034 bool leavingThisState = true;
1035 bool emitToken = false;
1036 if (data == "=") {
1037 state = beforeAttributeValueState;
1038 } else if (isLetter(data)) {
1039 _attributeName.write(data);
1040 _attributeName.write(stream.charsUntil(asciiLetters, true));
1041 leavingThisState = false;
1042 } else if (data == ">") {
1043 // XXX If we emit here the attributes are converted to a dict
1044 // without being checked and when the code below runs we error
1045 // because data is a dict not a list
1046 emitToken = true;
1047 } else if (isWhitespace(data)) {
1048 state = afterAttributeNameState;
1049 } else if (data == "/") {
1050 state = selfClosingStartTagState;
1051 } else if (data == "\u0000") {
1052 _addToken(new ParseErrorToken("invalid-codepoint"));
1053 _attributeName.write('\uFFFD');
1054 leavingThisState = false;
1055 } else if (data == EOF) {
1056 _addToken(new ParseErrorToken("eof-in-attribute-name"));
1057 state = dataState;
1058 } else if ("'\"<".contains(data)) {
1059 _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
1060 _attributeName.write(data);
1061 leavingThisState = false;
1062 } else {
1063 _attributeName.write(data);
1064 leavingThisState = false;
1065 }
1066
1067 if (leavingThisState) {
1068 _markAttributeNameEnd(-1);
1069
1070 // Attributes are not dropped at this stage. That happens when the
1071 // start tag token is emitted so values can still be safely appended
1072 // to attributes, but we do want to report the parse error in time.
1073 var attrName = _attributeName.toString();
1074 if (lowercaseAttrName) {
1075 attrName = asciiUpper2Lower(attrName);
1076 }
1077 _attributes.last.name = attrName;
1078 if (_attributeNames == null) _attributeNames = new Set();
1079 if (_attributeNames.contains(attrName)) {
1080 _addToken(new ParseErrorToken("duplicate-attribute"));
1081 }
1082 _attributeNames.add(attrName);
1083
1084 // XXX Fix for above XXX
1085 if (emitToken) {
1086 emitCurrentToken();
1087 }
1088 }
1089 return true;
1090 }
1091
1092 bool afterAttributeNameState() {
1093 var data = stream.char();
1094 if (isWhitespace(data)) {
1095 stream.charsUntil(spaceCharacters, true);
1096 } else if (data == "=") {
1097 state = beforeAttributeValueState;
1098 } else if (data == ">") {
1099 emitCurrentToken();
1100 } else if (isLetter(data)) {
1101 _addAttribute(data);
1102 state = attributeNameState;
1103 } else if (data == "/") {
1104 state = selfClosingStartTagState;
1105 } else if (data == "\u0000") {
1106 _addToken(new ParseErrorToken("invalid-codepoint"));
1107 _addAttribute("\uFFFD");
1108 state = attributeNameState;
1109 } else if (data == EOF) {
1110 _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof"));
1111 state = dataState;
1112 } else if ("'\"<".contains(data)) {
1113 _addToken(new ParseErrorToken("invalid-character-after-attribute-name"));
1114 _addAttribute(data);
1115 state = attributeNameState;
1116 } else {
1117 _addAttribute(data);
1118 state = attributeNameState;
1119 }
1120 return true;
1121 }
1122
1123 bool beforeAttributeValueState() {
1124 var data = stream.char();
1125 if (isWhitespace(data)) {
1126 stream.charsUntil(spaceCharacters, true);
1127 } else if (data == "\"") {
1128 _markAttributeValueStart(0);
1129 state = attributeValueDoubleQuotedState;
1130 } else if (data == "&") {
1131 state = attributeValueUnQuotedState;
1132 stream.unget(data);
1133 _markAttributeValueStart(0);
1134 } else if (data == "'") {
1135 _markAttributeValueStart(0);
1136 state = attributeValueSingleQuotedState;
1137 } else if (data == ">") {
1138 _addToken(new ParseErrorToken(
1139 "expected-attribute-value-but-got-right-bracket"));
1140 emitCurrentToken();
1141 } else if (data == "\u0000") {
1142 _addToken(new ParseErrorToken("invalid-codepoint"));
1143 _markAttributeValueStart(-1);
1144 _attributeValue.write('\uFFFD');
1145 state = attributeValueUnQuotedState;
1146 } else if (data == EOF) {
1147 _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof"));
1148 state = dataState;
1149 } else if ("=<`".contains(data)) {
1150 _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value"));
1151 _markAttributeValueStart(-1);
1152 _attributeValue.write(data);
1153 state = attributeValueUnQuotedState;
1154 } else {
1155 _markAttributeValueStart(-1);
1156 _attributeValue.write(data);
1157 state = attributeValueUnQuotedState;
1158 }
1159 return true;
1160 }
1161
1162 bool attributeValueDoubleQuotedState() {
1163 var data = stream.char();
1164 if (data == "\"") {
1165 _markAttributeValueEnd(-1);
1166 _markAttributeEnd(0);
1167 state = afterAttributeValueState;
1168 } else if (data == "&") {
1169 processEntityInAttribute('"');
1170 } else if (data == "\u0000") {
1171 _addToken(new ParseErrorToken("invalid-codepoint"));
1172 _attributeValue.write('\uFFFD');
1173 } else if (data == EOF) {
1174 _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote"));
1175 _markAttributeValueEnd(-1);
1176 state = dataState;
1177 } else {
1178 _attributeValue.write(data);
1179 _attributeValue.write(stream.charsUntil("\"&"));
1180 }
1181 return true;
1182 }
1183
1184 bool attributeValueSingleQuotedState() {
1185 var data = stream.char();
1186 if (data == "'") {
1187 _markAttributeValueEnd(-1);
1188 _markAttributeEnd(0);
1189 state = afterAttributeValueState;
1190 } else if (data == "&") {
1191 processEntityInAttribute("'");
1192 } else if (data == "\u0000") {
1193 _addToken(new ParseErrorToken("invalid-codepoint"));
1194 _attributeValue.write('\uFFFD');
1195 } else if (data == EOF) {
1196 _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote"));
1197 _markAttributeValueEnd(-1);
1198 state = dataState;
1199 } else {
1200 _attributeValue.write(data);
1201 _attributeValue.write(stream.charsUntil("\'&"));
1202 }
1203 return true;
1204 }
1205
1206 bool attributeValueUnQuotedState() {
1207 var data = stream.char();
1208 if (isWhitespace(data)) {
1209 _markAttributeValueEnd(-1);
1210 state = beforeAttributeNameState;
1211 } else if (data == "&") {
1212 processEntityInAttribute(">");
1213 } else if (data == ">") {
1214 _markAttributeValueEnd(-1);
1215 emitCurrentToken();
1216 } else if (data == EOF) {
1217 _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes"));
1218 _markAttributeValueEnd(-1);
1219 state = dataState;
1220 } else if ('"\'=<`'.contains(data)) {
1221 _addToken(new ParseErrorToken(
1222 "unexpected-character-in-unquoted-attribute-value"));
1223 _attributeValue.write(data);
1224 } else if (data == "\u0000") {
1225 _addToken(new ParseErrorToken("invalid-codepoint"));
1226 _attributeValue.write('\uFFFD');
1227 } else {
1228 _attributeValue.write(data);
1229 _attributeValue.write(stream.charsUntil("&>\"\'=<`$spaceCharacters"));
1230 }
1231 return true;
1232 }
1233
1234 bool afterAttributeValueState() {
1235 var data = stream.char();
1236 if (isWhitespace(data)) {
1237 state = beforeAttributeNameState;
1238 } else if (data == ">") {
1239 emitCurrentToken();
1240 } else if (data == "/") {
1241 state = selfClosingStartTagState;
1242 } else if (data == EOF) {
1243 _addToken(new ParseErrorToken("unexpected-EOF-after-attribute-value"));
1244 stream.unget(data);
1245 state = dataState;
1246 } else {
1247 _addToken(
1248 new ParseErrorToken("unexpected-character-after-attribute-value"));
1249 stream.unget(data);
1250 state = beforeAttributeNameState;
1251 }
1252 return true;
1253 }
1254
1255 bool selfClosingStartTagState() {
1256 var data = stream.char();
1257 if (data == ">") {
1258 currentTagToken.selfClosing = true;
1259 emitCurrentToken();
1260 } else if (data == EOF) {
1261 _addToken(new ParseErrorToken("unexpected-EOF-after-solidus-in-tag"));
1262 stream.unget(data);
1263 state = dataState;
1264 } else {
1265 _addToken(
1266 new ParseErrorToken("unexpected-character-after-soldius-in-tag"));
1267 stream.unget(data);
1268 state = beforeAttributeNameState;
1269 }
1270 return true;
1271 }
1272
1273 bool bogusCommentState() {
1274 // Make a new comment token and give it as value all the characters
1275 // until the first > or EOF (charsUntil checks for EOF automatically)
1276 // and emit it.
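// (e.g. an XML-style "<?php ... ?>" ends up here via tagOpenState and is
// emitted as a CommentToken whose data is "?php ... ?".)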
1277 var data = stream.charsUntil(">");
1278 data = data.replaceAll("\u0000", "\uFFFD");
1279 _addToken(new CommentToken(data));
1280
1281 // Eat the character directly after the bogus comment which is either a
1282 // ">" or an EOF.
1283 stream.char();
1284 state = dataState;
1285 return true;
1286 }
1287
1288 bool markupDeclarationOpenState() {
1289 var charStack = [stream.char()];
1290 if (charStack.last == "-") {
1291 charStack.add(stream.char());
1292 if (charStack.last == "-") {
1293 currentToken = new CommentToken();
1294 state = commentStartState;
1295 return true;
1296 }
1297 } else if (charStack.last == 'd' || charStack.last == 'D') {
1298 var matched = true;
1299 for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) {
1300 var char = stream.char();
1301 charStack.add(char);
1302 if (char == EOF || !expected.contains(char)) {
1303 matched = false;
1304 break;
1305 }
1306 }
1307 if (matched) {
1308 currentToken = new DoctypeToken(correct: true);
1309 state = doctypeState;
1310 return true;
1311 }
1312 } else if (charStack.last == "[" &&
1313 parser != null &&
1314 parser.tree.openElements.length > 0 &&
1315 parser.tree.openElements.last.namespaceUri !=
1316 parser.tree.defaultNamespace) {
1317 var matched = true;
1318 for (var expected in const ["C", "D", "A", "T", "A", "["]) {
1319 charStack.add(stream.char());
1320 if (charStack.last != expected) {
1321 matched = false;
1322 break;
1323 }
1324 }
1325 if (matched) {
1326 state = cdataSectionState;
1327 return true;
1328 }
1329 }
1330
1331 _addToken(new ParseErrorToken("expected-dashes-or-doctype"));
1332
1333 while (charStack.length > 0) {
1334 stream.unget(charStack.removeLast());
1335 }
1336 state = bogusCommentState;
1337 return true;
1338 }
1339
1340 bool commentStartState() {
1341 var data = stream.char();
1342 if (data == "-") {
1343 state = commentStartDashState;
1344 } else if (data == "\u0000") {
1345 _addToken(new ParseErrorToken("invalid-codepoint"));
1346 currentStringToken.add('\uFFFD');
1347 } else if (data == ">") {
1348 _addToken(new ParseErrorToken("incorrect-comment"));
1349 _addToken(currentToken);
1350 state = dataState;
1351 } else if (data == EOF) {
1352 _addToken(new ParseErrorToken("eof-in-comment"));
1353 _addToken(currentToken);
1354 state = dataState;
1355 } else {
1356 currentStringToken.add(data);
1357 state = commentState;
1358 }
1359 return true;
1360 }
1361
1362 bool commentStartDashState() {
1363 var data = stream.char();
1364 if (data == "-") {
1365 state = commentEndState;
1366 } else if (data == "\u0000") {
1367 _addToken(new ParseErrorToken("invalid-codepoint"));
1368 currentStringToken.add('-\uFFFD');
1369 } else if (data == ">") {
1370 _addToken(new ParseErrorToken("incorrect-comment"));
1371 _addToken(currentToken);
1372 state = dataState;
1373 } else if (data == EOF) {
1374 _addToken(new ParseErrorToken("eof-in-comment"));
1375 _addToken(currentToken);
1376 state = dataState;
1377 } else {
1378 currentStringToken.add('-').add(data);
1379 state = commentState;
1380 }
1381 return true;
1382 }
1383
1384 bool commentState() {
1385 var data = stream.char();
1386 if (data == "-") {
1387 state = commentEndDashState;
1388 } else if (data == "\u0000") {
1389 _addToken(new ParseErrorToken("invalid-codepoint"));
1390 currentStringToken.add('\uFFFD');
1391 } else if (data == EOF) {
1392 _addToken(new ParseErrorToken("eof-in-comment"));
1393 _addToken(currentToken);
1394 state = dataState;
1395 } else {
1396 currentStringToken.add(data).add(stream.charsUntil("-\u0000"));
1397 }
1398 return true;
1399 }
1400
1401 bool commentEndDashState() {
1402 var data = stream.char();
1403 if (data == "-") {
1404 state = commentEndState;
1405 } else if (data == "\u0000") {
1406 _addToken(new ParseErrorToken("invalid-codepoint"));
1407 currentStringToken.add('-\uFFFD');
1408 state = commentState;
1409 } else if (data == EOF) {
1410 _addToken(new ParseErrorToken("eof-in-comment-end-dash"));
1411 _addToken(currentToken);
1412 state = dataState;
1413 } else {
1414 currentStringToken.add('-').add(data);
1415 state = commentState;
1416 }
1417 return true;
1418 }
1419
1420 bool commentEndState() {
1421 var data = stream.char();
1422 if (data == ">") {
1423 _addToken(currentToken);
1424 state = dataState;
1425 } else if (data == "\u0000") {
1426 _addToken(new ParseErrorToken("invalid-codepoint"));
1427 currentStringToken.add('--\uFFFD');
1428 state = commentState;
1429 } else if (data == "!") {
1430 _addToken(
1431 new ParseErrorToken("unexpected-bang-after-double-dash-in-comment"));
1432 state = commentEndBangState;
1433 } else if (data == "-") {
1434 _addToken(
1435 new ParseErrorToken("unexpected-dash-after-double-dash-in-comment"));
1436 currentStringToken.add(data);
1437 } else if (data == EOF) {
1438 _addToken(new ParseErrorToken("eof-in-comment-double-dash"));
1439 _addToken(currentToken);
1440 state = dataState;
1441 } else {
1442 // XXX
1443 _addToken(new ParseErrorToken("unexpected-char-in-comment"));
1444 currentStringToken.add('--').add(data);
1445 state = commentState;
1446 }
1447 return true;
1448 }
1449
1450 bool commentEndBangState() {
1451 var data = stream.char();
1452 if (data == ">") {
1453 _addToken(currentToken);
1454 state = dataState;
1455 } else if (data == "-") {
1456 currentStringToken.add('--!');
1457 state = commentEndDashState;
1458 } else if (data == "\u0000") {
1459 _addToken(new ParseErrorToken("invalid-codepoint"));
1460 currentStringToken.add('--!\uFFFD');
1461 state = commentState;
1462 } else if (data == EOF) {
1463 _addToken(new ParseErrorToken("eof-in-comment-end-bang-state"));
1464 _addToken(currentToken);
1465 state = dataState;
1466 } else {
1467 currentStringToken.add('--!').add(data);
1468 state = commentState;
1469 }
1470 return true;
1471 }
1472
1473 bool doctypeState() {
1474 var data = stream.char();
1475 if (isWhitespace(data)) {
1476 state = beforeDoctypeNameState;
1477 } else if (data == EOF) {
1478 _addToken(new ParseErrorToken("expected-doctype-name-but-got-eof"));
1479 currentDoctypeToken.correct = false;
1480 _addToken(currentToken);
1481 state = dataState;
1482 } else {
1483 _addToken(new ParseErrorToken("need-space-after-doctype"));
1484 stream.unget(data);
1485 state = beforeDoctypeNameState;
1486 }
1487 return true;
1488 }
1489
1490 bool beforeDoctypeNameState() {
1491 var data = stream.char();
1492 if (isWhitespace(data)) {
1493 return true;
1494 } else if (data == ">") {
1495 _addToken(
1496 new ParseErrorToken("expected-doctype-name-but-got-right-bracket"));
1497 currentDoctypeToken.correct = false;
1498 _addToken(currentToken);
1499 state = dataState;
1500 } else if (data == "\u0000") {
1501 _addToken(new ParseErrorToken("invalid-codepoint"));
1502 currentDoctypeToken.name = "\uFFFD";
1503 state = doctypeNameState;
1504 } else if (data == EOF) {
1505 _addToken(new ParseErrorToken("expected-doctype-name-but-got-eof"));
1506 currentDoctypeToken.correct = false;
1507 _addToken(currentToken);
1508 state = dataState;
1509 } else {
1510 currentDoctypeToken.name = data;
1511 state = doctypeNameState;
1512 }
1513 return true;
1514 }
1515
1516 bool doctypeNameState() {
1517 var data = stream.char();
1518 if (isWhitespace(data)) {
1519 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1520 state = afterDoctypeNameState;
1521 } else if (data == ">") {
1522 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1523 _addToken(currentToken);
1524 state = dataState;
1525 } else if (data == "\u0000") {
1526 _addToken(new ParseErrorToken("invalid-codepoint"));
1527 currentDoctypeToken.name = "${currentDoctypeToken.name}\uFFFD";
1528 state = doctypeNameState;
1529 } else if (data == EOF) {
1530 _addToken(new ParseErrorToken("eof-in-doctype-name"));
1531 currentDoctypeToken.correct = false;
1532 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1533 _addToken(currentToken);
1534 state = dataState;
1535 } else {
1536 currentDoctypeToken.name = '${currentDoctypeToken.name}$data';
1537 }
1538 return true;
1539 }
1540
1541 bool afterDoctypeNameState() {
1542 var data = stream.char();
1543 if (isWhitespace(data)) {
1544 return true;
1545 } else if (data == ">") {
1546 _addToken(currentToken);
1547 state = dataState;
1548 } else if (data == EOF) {
1549 currentDoctypeToken.correct = false;
1550 stream.unget(data);
1551 _addToken(new ParseErrorToken("eof-in-doctype"));
1552 _addToken(currentToken);
1553 state = dataState;
1554 } else {
1555 if (data == "p" || data == "P") {
1556 // TODO(jmesserly): would be nice to have a helper for this.
1557 var matched = true;
1558 for (var expected in const ["uU", "bB", "lL", "iI", "cC"]) {
1559 data = stream.char();
1560 if (data == EOF || !expected.contains(data)) {
1561 matched = false;
1562 break;
1563 }
1564 }
1565 if (matched) {
1566 state = afterDoctypePublicKeywordState;
1567 return true;
1568 }
1569 } else if (data == "s" || data == "S") {
1570 var matched = true;
1571 for (var expected in const ["yY", "sS", "tT", "eE", "mM"]) {
1572 data = stream.char();
1573 if (data == EOF || !expected.contains(data)) {
1574 matched = false;
1575 break;
1576 }
1577 }
1578 if (matched) {
1579 state = afterDoctypeSystemKeywordState;
1580 return true;
1581 }
1582 }
1583
1584 // All the characters read before the current 'data' will be
1585 // [a-zA-Z], so they're garbage in the bogus doctype and can be
1586 // discarded; only the latest character might be '>' or EOF
1587 // and needs to be put back on the stream.
1588 stream.unget(data);
1589 _addToken(new ParseErrorToken(
1590 "expected-space-or-right-bracket-in-doctype",
1591 messageParams: {"data": data}));
1592 currentDoctypeToken.correct = false;
1593 state = bogusDoctypeState;
1594 }
1595 return true;
1596 }
1597
1598 bool afterDoctypePublicKeywordState() {
1599 var data = stream.char();
1600 if (isWhitespace(data)) {
1601 state = beforeDoctypePublicIdentifierState;
1602 } else if (data == "'" || data == '"') {
1603 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1604 stream.unget(data);
1605 state = beforeDoctypePublicIdentifierState;
1606 } else if (data == EOF) {
1607 _addToken(new ParseErrorToken("eof-in-doctype"));
1608 currentDoctypeToken.correct = false;
1609 _addToken(currentToken);
1610 state = dataState;
1611 } else {
1612 stream.unget(data);
1613 state = beforeDoctypePublicIdentifierState;
1614 }
1615 return true;
1616 }
1617
1618 bool beforeDoctypePublicIdentifierState() {
1619 var data = stream.char();
1620 if (isWhitespace(data)) {
1621 return true;
1622 } else if (data == "\"") {
1623 currentDoctypeToken.publicId = "";
1624 state = doctypePublicIdentifierDoubleQuotedState;
1625 } else if (data == "'") {
1626 currentDoctypeToken.publicId = "";
1627 state = doctypePublicIdentifierSingleQuotedState;
1628 } else if (data == ">") {
1629 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1630 currentDoctypeToken.correct = false;
1631 _addToken(currentToken);
1632 state = dataState;
1633 } else if (data == EOF) {
1634 _addToken(new ParseErrorToken("eof-in-doctype"));
1635 currentDoctypeToken.correct = false;
1636 _addToken(currentToken);
1637 state = dataState;
1638 } else {
1639 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1640 currentDoctypeToken.correct = false;
1641 state = bogusDoctypeState;
1642 }
1643 return true;
1644 }
1645
1646 bool doctypePublicIdentifierDoubleQuotedState() {
1647 var data = stream.char();
1648 if (data == '"') {
1649 state = afterDoctypePublicIdentifierState;
1650 } else if (data == "\u0000") {
1651 _addToken(new ParseErrorToken("invalid-codepoint"));
1652 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
1653 } else if (data == ">") {
1654 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1655 currentDoctypeToken.correct = false;
1656 _addToken(currentToken);
1657 state = dataState;
1658 } else if (data == EOF) {
1659 _addToken(new ParseErrorToken("eof-in-doctype"));
1660 currentDoctypeToken.correct = false;
1661 _addToken(currentToken);
1662 state = dataState;
1663 } else {
1664 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data';
1665 }
1666 return true;
1667 }
1668
1669 bool doctypePublicIdentifierSingleQuotedState() {
1670 var data = stream.char();
1671 if (data == "'") {
1672 state = afterDoctypePublicIdentifierState;
1673 } else if (data == "\u0000") {
1674 _addToken(new ParseErrorToken("invalid-codepoint"));
1675 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
1676 } else if (data == ">") {
1677 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1678 currentDoctypeToken.correct = false;
1679 _addToken(currentToken);
1680 state = dataState;
1681 } else if (data == EOF) {
1682 _addToken(new ParseErrorToken("eof-in-doctype"));
1683 currentDoctypeToken.correct = false;
1684 _addToken(currentToken);
1685 state = dataState;
1686 } else {
1687 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data';
1688 }
1689 return true;
1690 }
1691
1692 bool afterDoctypePublicIdentifierState() {
1693 var data = stream.char();
1694 if (isWhitespace(data)) {
1695 state = betweenDoctypePublicAndSystemIdentifiersState;
1696 } else if (data == ">") {
1697 _addToken(currentToken);
1698 state = dataState;
1699 } else if (data == '"') {
1700 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1701 currentDoctypeToken.systemId = "";
1702 state = doctypeSystemIdentifierDoubleQuotedState;
1703 } else if (data == "'") {
1704 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1705 currentDoctypeToken.systemId = "";
1706 state = doctypeSystemIdentifierSingleQuotedState;
1707 } else if (data == EOF) {
1708 _addToken(new ParseErrorToken("eof-in-doctype"));
1709 currentDoctypeToken.correct = false;
1710 _addToken(currentToken);
1711 state = dataState;
1712 } else {
1713 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1714 currentDoctypeToken.correct = false;
1715 state = bogusDoctypeState;
1716 }
1717 return true;
1718 }
1719
1720 bool betweenDoctypePublicAndSystemIdentifiersState() {
1721 var data = stream.char();
1722 if (isWhitespace(data)) {
1723 return true;
1724 } else if (data == ">") {
1725 _addToken(currentToken);
1726 state = dataState;
1727 } else if (data == '"') {
1728 currentDoctypeToken.systemId = "";
1729 state = doctypeSystemIdentifierDoubleQuotedState;
1730 } else if (data == "'") {
1731 currentDoctypeToken.systemId = "";
1732 state = doctypeSystemIdentifierSingleQuotedState;
1733 } else if (data == EOF) {
1734 _addToken(new ParseErrorToken("eof-in-doctype"));
1735 currentDoctypeToken.correct = false;
1736 _addToken(currentToken);
1737 state = dataState;
1738 } else {
1739 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1740 currentDoctypeToken.correct = false;
1741 state = bogusDoctypeState;
1742 }
1743 return true;
1744 }
1745
1746 bool afterDoctypeSystemKeywordState() {
1747 var data = stream.char();
1748 if (isWhitespace(data)) {
1749 state = beforeDoctypeSystemIdentifierState;
1750 } else if (data == "'" || data == '"') {
1751 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1752 stream.unget(data);
1753 state = beforeDoctypeSystemIdentifierState;
1754 } else if (data == EOF) {
1755 _addToken(new ParseErrorToken("eof-in-doctype"));
1756 currentDoctypeToken.correct = false;
1757 _addToken(currentToken);
1758 state = dataState;
1759 } else {
1760 stream.unget(data);
1761 state = beforeDoctypeSystemIdentifierState;
1762 }
1763 return true;
1764 }
1765
1766 bool beforeDoctypeSystemIdentifierState() {
1767 var data = stream.char();
1768 if (isWhitespace(data)) {
1769 return true;
1770 } else if (data == "\"") {
1771 currentDoctypeToken.systemId = "";
1772 state = doctypeSystemIdentifierDoubleQuotedState;
1773 } else if (data == "'") {
1774 currentDoctypeToken.systemId = "";
1775 state = doctypeSystemIdentifierSingleQuotedState;
1776 } else if (data == ">") {
1777 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1778 currentDoctypeToken.correct = false;
1779 _addToken(currentToken);
1780 state = dataState;
1781 } else if (data == EOF) {
1782 _addToken(new ParseErrorToken("eof-in-doctype"));
1783 currentDoctypeToken.correct = false;
1784 _addToken(currentToken);
1785 state = dataState;
1786 } else {
1787 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1788 currentDoctypeToken.correct = false;
1789 state = bogusDoctypeState;
1790 }
1791 return true;
1792 }
1793
1794 bool doctypeSystemIdentifierDoubleQuotedState() {
1795 var data = stream.char();
1796 if (data == "\"") {
1797 state = afterDoctypeSystemIdentifierState;
1798 } else if (data == "\u0000") {
1799 _addToken(new ParseErrorToken("invalid-codepoint"));
1800 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
1801 } else if (data == ">") {
1802 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1803 currentDoctypeToken.correct = false;
1804 _addToken(currentToken);
1805 state = dataState;
1806 } else if (data == EOF) {
1807 _addToken(new ParseErrorToken("eof-in-doctype"));
1808 currentDoctypeToken.correct = false;
1809 _addToken(currentToken);
1810 state = dataState;
1811 } else {
1812 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data';
1813 }
1814 return true;
1815 }
1816
1817 bool doctypeSystemIdentifierSingleQuotedState() {
1818 var data = stream.char();
1819 if (data == "'") {
1820 state = afterDoctypeSystemIdentifierState;
1821 } else if (data == "\u0000") {
1822 _addToken(new ParseErrorToken("invalid-codepoint"));
1823 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
1824 } else if (data == ">") {
1825 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1826 currentDoctypeToken.correct = false;
1827 _addToken(currentToken);
1828 state = dataState;
1829 } else if (data == EOF) {
1830 _addToken(new ParseErrorToken("eof-in-doctype"));
1831 currentDoctypeToken.correct = false;
1832 _addToken(currentToken);
1833 state = dataState;
1834 } else {
1835 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data';
1836 }
1837 return true;
1838 }
1839
1840 bool afterDoctypeSystemIdentifierState() {
1841 var data = stream.char();
1842 if (isWhitespace(data)) {
1843 return true;
1844 } else if (data == ">") {
1845 _addToken(currentToken);
1846 state = dataState;
1847 } else if (data == EOF) {
1848 _addToken(new ParseErrorToken("eof-in-doctype"));
1849 currentDoctypeToken.correct = false;
1850 _addToken(currentToken);
1851 state = dataState;
1852 } else {
1853 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1854 state = bogusDoctypeState;
1855 }
1856 return true;
1857 }
1858
1859 bool bogusDoctypeState() {
1860 var data = stream.char();
1861 if (data == ">") {
1862 _addToken(currentToken);
1863 state = dataState;
1864 } else if (data == EOF) {
1865 // XXX EMIT
1866 stream.unget(data);
1867 _addToken(currentToken);
1868 state = dataState;
1869 }
1870 return true;
1871 }
1872
1873 bool cdataSectionState() {
1874 var data = [];
1875 int matchedEnd = 0;
1876 while (true) {
1877 var ch = stream.char();
1878 if (ch == EOF) {
1879 break;
1880 }
1881 // Deal with null here rather than in the parser
1882 if (ch == "\u0000") {
1883 _addToken(new ParseErrorToken("invalid-codepoint"));
1884 ch = "\uFFFD";
1885 }
1886 data.add(ch);
1887 // TODO(jmesserly): it'd be nice if we had an easier way to match the end,
1888 // perhaps with a "peek" API.
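// (e.g. for "<![CDATA[x]]>" the loop collects 'x', ']', ']', '>' and then
// strips the trailing "]]>" before emitting a CharactersToken("x").)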
1889 if (ch == "]" && matchedEnd < 2) {
1890 matchedEnd++;
1891 } else if (ch == ">" && matchedEnd == 2) {
1892 // Remove "]]>" from the end.
1893 data.removeLast();
1894 data.removeLast();
1895 data.removeLast();
1896 break;
1897 } else {
1898 matchedEnd = 0;
1899 }
1900 }
1901
1902 if (data.length > 0) {
1903 _addToken(new CharactersToken(data.join()));
1904 }
1905 state = dataState;
1906 return true;
1907 }
1908 }