Chromium Code Reviews

Side by Side Diff: pkg/third_party/html5lib/lib/src/tokenizer.dart

Issue 814113004: Pull args, intl, logging, shelf, and source_maps out of the SDK. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Also csslib. Created 6 years ago
1 library tokenizer;
2
3 import 'dart:collection';
4 import 'package:html5lib/parser.dart' show HtmlParser;
5 import 'constants.dart';
6 import 'inputstream.dart';
7 import 'token.dart';
8 import 'utils.dart';
9
10 // Group entities by their first character, for faster lookups
11
12 // TODO(jmesserly): we could use a better data structure here like a trie, if
13 // we had it implemented in Dart.
14 Map<String, List<String>> entitiesByFirstChar = (() {
15 var result = {};
16 for (var k in entities.keys) {
17 result.putIfAbsent(k[0], () => []).add(k);
18 }
19 return result;
20 })();
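
// As a sketch of how this grouping is consulted in consumeEntity below
// (assumes "amp;" appears in the `entities` table from constants.dart):
//
//   var candidates = entitiesByFirstChar["a"];   // all entities starting with "a"
//   candidates = candidates.where((e) => e.startsWith("amp")).toList();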
21
22 // TODO(jmesserly): lots of ways to make this faster:
23 // - use char codes everywhere instead of 1-char strings
24 // - use switch instead of contains, indexOf
25 // - use switch instead of the sequential if tests
26 // - avoid string concat
27
28 /// This class takes care of tokenizing HTML.
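///
/// A minimal usage sketch (assuming a plain String document is an accepted
/// input for [HtmlInputStream], as the untyped `doc` constructor parameter
/// suggests):
///
///     var tokenizer = new HtmlTokenizer('<p class="a">hi</p>',
///         generateSpans: true);
///     while (tokenizer.moveNext()) {
///       Token token = tokenizer.current;
///       // token will be e.g. a StartTagToken, CharactersToken, EndTagToken,
///       // or a ParseErrorToken describing a recoverable syntax error.
///     }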
29 class HtmlTokenizer implements Iterator<Token> {
30 // TODO(jmesserly): a lot of these could be made private
31
32 final HtmlInputStream stream;
33
34 final bool lowercaseElementName;
35
36 final bool lowercaseAttrName;
37
38 /// True to generate spans for [Token.span].
39 final bool generateSpans;
40
41 /// True to generate spans for attributes.
42 final bool attributeSpans;
43
44 /// This reference to the parser is used for correct CDATA handling.
45 /// The [HtmlParser] will set this at construction time.
46 HtmlParser parser;
47
48 final Queue<Token> tokenQueue;
49
50 /// Holds the token that is currently being processed.
51 Token currentToken;
52
53 /// Holds a reference to the method to be invoked for the next parser state.
54 // TODO(jmesserly): the type should be "Predicate" but a dart2js checked mode
55 // bug prevents us from doing that. See http://dartbug.com/12465
56 Function state;
57
58 String temporaryBuffer;
59
60 int _lastOffset;
61
62 // TODO(jmesserly): ideally this would be a LinkedHashMap and we wouldn't add
63 // an item until it's ready. But the code doesn't have a clear notion of when
64 // it's "done" with the attribute.
65 List<TagAttribute> _attributes;
66 Set<String> _attributeNames;
67
68 HtmlTokenizer(doc, {String encoding, bool parseMeta: true,
69 this.lowercaseElementName: true, this.lowercaseAttrName: true,
70 bool generateSpans: false, String sourceUrl, this.attributeSpans: false})
71 : stream = new HtmlInputStream(
72 doc, encoding, parseMeta, generateSpans, sourceUrl),
73 tokenQueue = new Queue(),
74 generateSpans = generateSpans {
75 reset();
76 }
77
78 TagToken get currentTagToken => currentToken;
79 DoctypeToken get currentDoctypeToken => currentToken;
80 StringToken get currentStringToken => currentToken;
81
82 Token _current;
83 Token get current => _current;
84
85 String get _attributeName => _attributes.last.name;
86 set _attributeName(String value) {
87 _attributes.last.name = value;
88 }
89
90 String get _attributeValue => _attributes.last.value;
91 set _attributeValue(String value) {
92 _attributes.last.value = value;
93 }
94
95 void _markAttributeEnd(int offset) {
96 if (attributeSpans) _attributes.last.end = stream.position + offset;
97 }
98
99 void _markAttributeValueStart(int offset) {
100 if (attributeSpans) _attributes.last.startValue = stream.position + offset;
101 }
102
103 void _markAttributeValueEnd(int offset) {
104 if (attributeSpans) {
105 _attributes.last.endValue = stream.position + offset;
106 _markAttributeEnd(offset);
107 }
108 }
109
110 // Note: we could track the name span here, if we need it.
111 void _markAttributeNameEnd(int offset) => _markAttributeEnd(offset);
112
113 void _addAttribute(String name) {
114 if (_attributes == null) _attributes = [];
115 var attr = new TagAttribute(name);
116 _attributes.add(attr);
117 if (attributeSpans) attr.start = stream.position - name.length;
118 }
119
120 /// This is where the magic happens.
121 ///
122 /// We do our usual processing through the states, and when we have a token
123 /// to return we put it on [tokenQueue] and return from [moveNext], which
124 /// pauses processing until the next token is requested.
125 bool moveNext() {
126 // Start processing. When EOF is reached, state() will return false
127 // instead of true and the loop will terminate.
128 while (stream.errors.length == 0 && tokenQueue.length == 0) {
129 if (!state()) {
130 _current = null;
131 return false;
132 }
133 }
134 if (stream.errors.length > 0) {
135 _current = new ParseErrorToken(stream.errors.removeFirst());
136 } else {
137 assert (tokenQueue.length > 0);
138 _current = tokenQueue.removeFirst();
139 }
140 return true;
141 }
142
143 /// Resets the tokenizer state. Calling this does not reset the [stream] or
144 /// the [parser].
145 void reset() {
146 _lastOffset = 0;
147 tokenQueue.clear();
148 currentToken = null;
149 temporaryBuffer = null;
150 _attributes = null;
151 _attributeNames = null;
152 state = dataState;
153 }
154
155 /// Adds a token to the queue. Sets the span if needed.
156 void _addToken(Token token) {
157 if (generateSpans && token.span == null) {
158 int offset = stream.position;
159 token.span = stream.fileInfo.span(_lastOffset, offset);
160 if (token is! ParseErrorToken) {
161 _lastOffset = offset;
162 }
163 }
164 tokenQueue.add(token);
165 }
166
167 /// This function returns either U+FFFD or the character based on the
168 /// decimal or hexadecimal representation. It also discards ";" if present.
169 /// If not present it will add a [ParseErrorToken].
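  /// For example (a sketch of the behavior implemented below, not a verified
  /// test): with the stream positioned after "&#x", the input "41;" yields
  /// "A" (0x41), while "41 " yields "A" plus a
  /// "numeric-entity-without-semicolon" [ParseErrorToken].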
170 String consumeNumberEntity(bool isHex) {
171 var allowed = isDigit;
172 var radix = 10;
173 if (isHex) {
174 allowed = isHexDigit;
175 radix = 16;
176 }
177
178 var charStack = [];
179
180 // Consume all the characters that are in range while making sure we
181 // don't hit an EOF.
182 var c = stream.char();
183 while (allowed(c) && c != EOF) {
184 charStack.add(c);
185 c = stream.char();
186 }
187
188 // Convert the set of characters consumed to an int.
189 var charAsInt = parseIntRadix(charStack.join(), radix);
190
191 // Certain characters get replaced with others
192 var char = replacementCharacters[charAsInt];
193 if (char != null) {
194 _addToken(new ParseErrorToken(
195 "illegal-codepoint-for-numeric-entity",
196 messageParams: {"charAsInt": charAsInt}));
197 } else if ((0xD800 <= charAsInt && charAsInt <= 0xDFFF)
198 || (charAsInt > 0x10FFFF)) {
199 char = "\uFFFD";
200 _addToken(new ParseErrorToken(
201 "illegal-codepoint-for-numeric-entity",
202 messageParams: {"charAsInt": charAsInt}));
203 } else {
204 // Should speed up this check somehow (e.g. move the set to a constant)
205 if ((0x0001 <= charAsInt && charAsInt <= 0x0008) ||
206 (0x000E <= charAsInt && charAsInt <= 0x001F) ||
207 (0x007F <= charAsInt && charAsInt <= 0x009F) ||
208 (0xFDD0 <= charAsInt && charAsInt <= 0xFDEF) ||
209 const [0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
210 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
211 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
212 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
213 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
214 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
215 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
216 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
217 0xFFFFF, 0x10FFFE, 0x10FFFF].contains(charAsInt)) {
218 _addToken(new ParseErrorToken(
219 "illegal-codepoint-for-numeric-entity",
220 messageParams: {"charAsInt": charAsInt}));
221 }
222 char = new String.fromCharCodes([charAsInt]);
223 }
224
225 // Discard the ";" if present. Otherwise, put the character back on the
226 // stream and emit a parse error.
227 if (c != ";") {
228 _addToken(new ParseErrorToken(
229 "numeric-entity-without-semicolon"));
230 stream.unget(c);
231 }
232 return char;
233 }
234
235 void consumeEntity({String allowedChar, bool fromAttribute: false}) {
236 // Initialise to the default output for when no entity is matched
237 var output = "&";
238
239 var charStack = [stream.char()];
240 if (isWhitespace(charStack[0]) || charStack[0] == '<' || charStack[0] == '&'
241 || charStack[0] == EOF || allowedChar == charStack[0]) {
242 stream.unget(charStack[0]);
243 } else if (charStack[0] == "#") {
244 // Read the next character to see if it's hex or decimal
245 bool hex = false;
246 charStack.add(stream.char());
247 if (charStack.last == 'x' || charStack.last == 'X') {
248 hex = true;
249 charStack.add(stream.char());
250 }
251
252 // charStack.last should be the first digit
253 if (hex && isHexDigit(charStack.last) ||
254 (!hex && isDigit(charStack.last))) {
255 // At least one digit found, so consume the whole number
256 stream.unget(charStack.last);
257 output = consumeNumberEntity(hex);
258 } else {
259 // No digits found
260 _addToken(new ParseErrorToken("expected-numeric-entity"));
261 stream.unget(charStack.removeLast());
262 output = "&${charStack.join()}";
263 }
264 } else {
265 // At this point we might have a named entity. Entities are stored in
266 // the global variable "entities".
267 //
268 // Consume characters and compare them to a substring of the entity
269 // names in the list until the substring no longer matches.
270 var filteredEntityList = entitiesByFirstChar[charStack[0]];
271 if (filteredEntityList == null) filteredEntityList = const [];
272
273 while (charStack.last != EOF) {
274 var name = charStack.join();
275 filteredEntityList = filteredEntityList.where(
276 (e) => e.startsWith(name)).toList();
277
278 if (filteredEntityList.length == 0) {
279 break;
280 }
281 charStack.add(stream.char());
282 }
283
284 // At this point we have a string that starts with some characters
285 // that may match an entity
286 String entityName = null;
287
288 // Try to find the longest entity the string will match to take care
289 // of &noti for instance.
290
291 int entityLen;
292 for (entityLen = charStack.length - 1; entityLen > 1; entityLen--) {
293 var possibleEntityName = charStack.sublist(0, entityLen).join();
294 if (entities.containsKey(possibleEntityName)) {
295 entityName = possibleEntityName;
296 break;
297 }
298 }
299
300 if (entityName != null) {
301 var lastChar = entityName[entityName.length - 1];
302 if (lastChar != ";") {
303 _addToken(new ParseErrorToken(
304 "named-entity-without-semicolon"));
305 }
306 if (lastChar != ";" && fromAttribute &&
307 (isLetterOrDigit(charStack[entityLen]) ||
308 charStack[entityLen] == '=')) {
309 stream.unget(charStack.removeLast());
310 output = "&${charStack.join()}";
311 } else {
312 output = entities[entityName];
313 stream.unget(charStack.removeLast());
314 output = '${output}${slice(charStack, entityLen).join()}';
315 }
316 } else {
317 _addToken(new ParseErrorToken("expected-named-entity"));
318 stream.unget(charStack.removeLast());
319 output = "&${charStack.join()}";
320 }
321 }
322 if (fromAttribute) {
323 _attributeValue = '$_attributeValue$output';
324 } else {
325 var token;
326 if (isWhitespace(output)) {
327 token = new SpaceCharactersToken(output);
328 } else {
329 token = new CharactersToken(output);
330 }
331 _addToken(token);
332 }
333 }
334
335 /// This method replaces the need for "entityInAttributeValueState".
336 void processEntityInAttribute(String allowedChar) {
337 consumeEntity(allowedChar: allowedChar, fromAttribute: true);
338 }
339
340 /// This method is a generic handler for emitting the tags. It also sets
341 /// the state to "data" because that's what's needed after a token has been
342 /// emitted.
343 void emitCurrentToken() {
344 var token = currentToken;
345 // Add token to the queue to be yielded
346 if (token is TagToken) {
347 if (lowercaseElementName) {
348 token.name = asciiUpper2Lower(token.name);
349 }
350 if (token is EndTagToken) {
351 if (_attributes != null) {
352 _addToken(new ParseErrorToken("attributes-in-end-tag"));
353 }
354 if (token.selfClosing) {
355 _addToken(new ParseErrorToken("this-closing-flag-on-end-tag"));
356 }
357 } else if (token is StartTagToken) {
358 // HTML5 specific normalizations to the token stream.
359 // Convert the list into a map where first key wins.
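        // (Sketch: for a start tag like <a href="x" href="y">, data["href"]
        // stays "x", since putIfAbsent ignores later values for the same name.)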
360 token.data = new LinkedHashMap<Object, String>();
361 if (_attributes != null) {
362 for (var attr in _attributes) {
363 token.data.putIfAbsent(attr.name, () => attr.value);
364 }
365 if (attributeSpans) token.attributeSpans = _attributes;
366 }
367 }
368 _attributes = null;
369 _attributeNames = null;
370 }
371 _addToken(token);
372 state = dataState;
373 }
374
375 // Below are the various tokenizer states worked out.
376
377 bool dataState() {
378 var data = stream.char();
379 if (data == "&") {
380 state = entityDataState;
381 } else if (data == "<") {
382 state = tagOpenState;
383 } else if (data == "\u0000") {
384 _addToken(new ParseErrorToken("invalid-codepoint"));
385 _addToken(new CharactersToken("\u0000"));
386 } else if (data == EOF) {
387 // Tokenization ends.
388 return false;
389 } else if (isWhitespace(data)) {
390 // Directly after emitting a token you switch back to the "data
391 // state". At that point spaceCharacters are important so they are
392 // emitted separately.
393 _addToken(new SpaceCharactersToken(
394 '${data}${stream.charsUntil(spaceCharacters, true)}'));
395 // No need to update lastFourChars here, since the first space will
396 // have already been appended to lastFourChars and will have broken
397 // any <!-- or --> sequences
398 } else {
399 var chars = stream.charsUntil("&<\u0000");
400 _addToken(new CharactersToken('${data}${chars}'));
401 }
402 return true;
403 }
404
405 bool entityDataState() {
406 consumeEntity();
407 state = dataState;
408 return true;
409 }
410
411 bool rcdataState() {
412 var data = stream.char();
413 if (data == "&") {
414 state = characterReferenceInRcdata;
415 } else if (data == "<") {
416 state = rcdataLessThanSignState;
417 } else if (data == EOF) {
418 // Tokenization ends.
419 return false;
420 } else if (data == "\u0000") {
421 _addToken(new ParseErrorToken("invalid-codepoint"));
422 _addToken(new CharactersToken("\uFFFD"));
423 } else if (isWhitespace(data)) {
424 // Directly after emitting a token you switch back to the "data
425 // state". At that point spaceCharacters are important so they are
426 // emitted separately.
427 _addToken(new SpaceCharactersToken(
428 '${data}${stream.charsUntil(spaceCharacters, true)}'));
429 } else {
430 var chars = stream.charsUntil("&<");
431 _addToken(new CharactersToken('${data}${chars}'));
432 }
433 return true;
434 }
435
436 bool characterReferenceInRcdata() {
437 consumeEntity();
438 state = rcdataState;
439 return true;
440 }
441
442 bool rawtextState() {
443 var data = stream.char();
444 if (data == "<") {
445 state = rawtextLessThanSignState;
446 } else if (data == "\u0000") {
447 _addToken(new ParseErrorToken("invalid-codepoint"));
448 _addToken(new CharactersToken("\uFFFD"));
449 } else if (data == EOF) {
450 // Tokenization ends.
451 return false;
452 } else {
453 var chars = stream.charsUntil("<\u0000");
454 _addToken(new CharactersToken("${data}${chars}"));
455 }
456 return true;
457 }
458
459 bool scriptDataState() {
460 var data = stream.char();
461 if (data == "<") {
462 state = scriptDataLessThanSignState;
463 } else if (data == "\u0000") {
464 _addToken(new ParseErrorToken("invalid-codepoint"));
465 _addToken(new CharactersToken("\uFFFD"));
466 } else if (data == EOF) {
467 // Tokenization ends.
468 return false;
469 } else {
470 var chars = stream.charsUntil("<\u0000");
471 _addToken(new CharactersToken("${data}${chars}"));
472 }
473 return true;
474 }
475
476 bool plaintextState() {
477 var data = stream.char();
478 if (data == EOF) {
479 // Tokenization ends.
480 return false;
481 } else if (data == "\u0000") {
482 _addToken(new ParseErrorToken("invalid-codepoint"));
483 _addToken(new CharactersToken("\uFFFD"));
484 } else {
485 _addToken(new CharactersToken(
486 '${data}${stream.charsUntil("\u0000")}'));
487 }
488 return true;
489 }
490
491 bool tagOpenState() {
492 var data = stream.char();
493 if (data == "!") {
494 state = markupDeclarationOpenState;
495 } else if (data == "/") {
496 state = closeTagOpenState;
497 } else if (isLetter(data)) {
498 currentToken = new StartTagToken(data);
499 state = tagNameState;
500 } else if (data == ">") {
501 // XXX In theory it could be something besides a tag name. But
502 // do we really care?
503 _addToken(new ParseErrorToken(
504 "expected-tag-name-but-got-right-bracket"));
505 _addToken(new CharactersToken("<>"));
506 state = dataState;
507 } else if (data == "?") {
508 // XXX In theory it could be something besides a tag name. But
509 // do we really care?
510 _addToken(new ParseErrorToken(
511 "expected-tag-name-but-got-question-mark"));
512 stream.unget(data);
513 state = bogusCommentState;
514 } else {
515 // XXX
516 _addToken(new ParseErrorToken("expected-tag-name"));
517 _addToken(new CharactersToken("<"));
518 stream.unget(data);
519 state = dataState;
520 }
521 return true;
522 }
523
524 bool closeTagOpenState() {
525 var data = stream.char();
526 if (isLetter(data)) {
527 currentToken = new EndTagToken(data);
528 state = tagNameState;
529 } else if (data == ">") {
530 _addToken(new ParseErrorToken(
531 "expected-closing-tag-but-got-right-bracket"));
532 state = dataState;
533 } else if (data == EOF) {
534 _addToken(new ParseErrorToken(
535 "expected-closing-tag-but-got-eof"));
536 _addToken(new CharactersToken("</"));
537 state = dataState;
538 } else {
539 // XXX data can be _'_...
540 _addToken(new ParseErrorToken(
541 "expected-closing-tag-but-got-char", messageParams: {"data": data}));
542 stream.unget(data);
543 state = bogusCommentState;
544 }
545 return true;
546 }
547
548 bool tagNameState() {
549 var data = stream.char();
550 if (isWhitespace(data)) {
551 state = beforeAttributeNameState;
552 } else if (data == ">") {
553 emitCurrentToken();
554 } else if (data == EOF) {
555 _addToken(new ParseErrorToken("eof-in-tag-name"));
556 state = dataState;
557 } else if (data == "/") {
558 state = selfClosingStartTagState;
559 } else if (data == "\u0000") {
560 _addToken(new ParseErrorToken("invalid-codepoint"));
561 currentTagToken.name = '${currentTagToken.name}\uFFFD';
562 } else {
563 currentTagToken.name = '${currentTagToken.name}$data';
564 // (Don't use charsUntil here, because tag names are
565 // very short and it's faster to not do anything fancy)
566 }
567 return true;
568 }
569
570 bool rcdataLessThanSignState() {
571 var data = stream.char();
572 if (data == "/") {
573 temporaryBuffer = "";
574 state = rcdataEndTagOpenState;
575 } else {
576 _addToken(new CharactersToken("<"));
577 stream.unget(data);
578 state = rcdataState;
579 }
580 return true;
581 }
582
583 bool rcdataEndTagOpenState() {
584 var data = stream.char();
585 if (isLetter(data)) {
586 temporaryBuffer = '${temporaryBuffer}$data';
587 state = rcdataEndTagNameState;
588 } else {
589 _addToken(new CharactersToken("</"));
590 stream.unget(data);
591 state = rcdataState;
592 }
593 return true;
594 }
595
596 bool _tokenIsAppropriate() {
597 return currentToken is TagToken &&
598 currentTagToken.name.toLowerCase() == temporaryBuffer.toLowerCase();
599 }
600
601 bool rcdataEndTagNameState() {
602 var appropriate = _tokenIsAppropriate();
603 var data = stream.char();
604 if (isWhitespace(data) && appropriate) {
605 currentToken = new EndTagToken(temporaryBuffer);
606 state = beforeAttributeNameState;
607 } else if (data == "/" && appropriate) {
608 currentToken = new EndTagToken(temporaryBuffer);
609 state = selfClosingStartTagState;
610 } else if (data == ">" && appropriate) {
611 currentToken = new EndTagToken(temporaryBuffer);
612 emitCurrentToken();
613 state = dataState;
614 } else if (isLetter(data)) {
615 temporaryBuffer = '${temporaryBuffer}$data';
616 } else {
617 _addToken(new CharactersToken("</$temporaryBuffer"));
618 stream.unget(data);
619 state = rcdataState;
620 }
621 return true;
622 }
623
624 bool rawtextLessThanSignState() {
625 var data = stream.char();
626 if (data == "/") {
627 temporaryBuffer = "";
628 state = rawtextEndTagOpenState;
629 } else {
630 _addToken(new CharactersToken("<"));
631 stream.unget(data);
632 state = rawtextState;
633 }
634 return true;
635 }
636
637 bool rawtextEndTagOpenState() {
638 var data = stream.char();
639 if (isLetter(data)) {
640 temporaryBuffer = '${temporaryBuffer}$data';
641 state = rawtextEndTagNameState;
642 } else {
643 _addToken(new CharactersToken("</"));
644 stream.unget(data);
645 state = rawtextState;
646 }
647 return true;
648 }
649
650 bool rawtextEndTagNameState() {
651 var appropriate = _tokenIsAppropriate();
652 var data = stream.char();
653 if (isWhitespace(data) && appropriate) {
654 currentToken = new EndTagToken(temporaryBuffer);
655 state = beforeAttributeNameState;
656 } else if (data == "/" && appropriate) {
657 currentToken = new EndTagToken(temporaryBuffer);
658 state = selfClosingStartTagState;
659 } else if (data == ">" && appropriate) {
660 currentToken = new EndTagToken(temporaryBuffer);
661 emitCurrentToken();
662 state = dataState;
663 } else if (isLetter(data)) {
664 temporaryBuffer = '${temporaryBuffer}$data';
665 } else {
666 _addToken(new CharactersToken("</$temporaryBuffer"));
667 stream.unget(data);
668 state = rawtextState;
669 }
670 return true;
671 }
672
673 bool scriptDataLessThanSignState() {
674 var data = stream.char();
675 if (data == "/") {
676 temporaryBuffer = "";
677 state = scriptDataEndTagOpenState;
678 } else if (data == "!") {
679 _addToken(new CharactersToken("<!"));
680 state = scriptDataEscapeStartState;
681 } else {
682 _addToken(new CharactersToken("<"));
683 stream.unget(data);
684 state = scriptDataState;
685 }
686 return true;
687 }
688
689 bool scriptDataEndTagOpenState() {
690 var data = stream.char();
691 if (isLetter(data)) {
692 temporaryBuffer = '${temporaryBuffer}$data';
693 state = scriptDataEndTagNameState;
694 } else {
695 _addToken(new CharactersToken("</"));
696 stream.unget(data);
697 state = scriptDataState;
698 }
699 return true;
700 }
701
702 bool scriptDataEndTagNameState() {
703 var appropriate = _tokenIsAppropriate();
704 var data = stream.char();
705 if (isWhitespace(data) && appropriate) {
706 currentToken = new EndTagToken(temporaryBuffer);
707 state = beforeAttributeNameState;
708 } else if (data == "/" && appropriate) {
709 currentToken = new EndTagToken(temporaryBuffer);
710 state = selfClosingStartTagState;
711 } else if (data == ">" && appropriate) {
712 currentToken = new EndTagToken(temporaryBuffer);
713 emitCurrentToken();
714 state = dataState;
715 } else if (isLetter(data)) {
716 temporaryBuffer = '${temporaryBuffer}$data';
717 } else {
718 _addToken(new CharactersToken("</$temporaryBuffer"));
719 stream.unget(data);
720 state = scriptDataState;
721 }
722 return true;
723 }
724
725 bool scriptDataEscapeStartState() {
726 var data = stream.char();
727 if (data == "-") {
728 _addToken(new CharactersToken("-"));
729 state = scriptDataEscapeStartDashState;
730 } else {
731 stream.unget(data);
732 state = scriptDataState;
733 }
734 return true;
735 }
736
737 bool scriptDataEscapeStartDashState() {
738 var data = stream.char();
739 if (data == "-") {
740 _addToken(new CharactersToken("-"));
741 state = scriptDataEscapedDashDashState;
742 } else {
743 stream.unget(data);
744 state = scriptDataState;
745 }
746 return true;
747 }
748
749 bool scriptDataEscapedState() {
750 var data = stream.char();
751 if (data == "-") {
752 _addToken(new CharactersToken("-"));
753 state = scriptDataEscapedDashState;
754 } else if (data == "<") {
755 state = scriptDataEscapedLessThanSignState;
756 } else if (data == "\u0000") {
757 _addToken(new ParseErrorToken("invalid-codepoint"));
758 _addToken(new CharactersToken("\uFFFD"));
759 } else if (data == EOF) {
760 state = dataState;
761 } else {
762 var chars = stream.charsUntil("<-\u0000");
763 _addToken(new CharactersToken("${data}${chars}"));
764 }
765 return true;
766 }
767
768 bool scriptDataEscapedDashState() {
769 var data = stream.char();
770 if (data == "-") {
771 _addToken(new CharactersToken("-"));
772 state = scriptDataEscapedDashDashState;
773 } else if (data == "<") {
774 state = scriptDataEscapedLessThanSignState;
775 } else if (data == "\u0000") {
776 _addToken(new ParseErrorToken("invalid-codepoint"));
777 _addToken(new CharactersToken("\uFFFD"));
778 state = scriptDataEscapedState;
779 } else if (data == EOF) {
780 state = dataState;
781 } else {
782 _addToken(new CharactersToken(data));
783 state = scriptDataEscapedState;
784 }
785 return true;
786 }
787
788 bool scriptDataEscapedDashDashState() {
789 var data = stream.char();
790 if (data == "-") {
791 _addToken(new CharactersToken("-"));
792 } else if (data == "<") {
793 state = scriptDataEscapedLessThanSignState;
794 } else if (data == ">") {
795 _addToken(new CharactersToken(">"));
796 state = scriptDataState;
797 } else if (data == "\u0000") {
798 _addToken(new ParseErrorToken("invalid-codepoint"));
799 _addToken(new CharactersToken("\uFFFD"));
800 state = scriptDataEscapedState;
801 } else if (data == EOF) {
802 state = dataState;
803 } else {
804 _addToken(new CharactersToken(data));
805 state = scriptDataEscapedState;
806 }
807 return true;
808 }
809
810 bool scriptDataEscapedLessThanSignState() {
811 var data = stream.char();
812 if (data == "/") {
813 temporaryBuffer = "";
814 state = scriptDataEscapedEndTagOpenState;
815 } else if (isLetter(data)) {
816 _addToken(new CharactersToken("<$data"));
817 temporaryBuffer = data;
818 state = scriptDataDoubleEscapeStartState;
819 } else {
820 _addToken(new CharactersToken("<"));
821 stream.unget(data);
822 state = scriptDataEscapedState;
823 }
824 return true;
825 }
826
827 bool scriptDataEscapedEndTagOpenState() {
828 var data = stream.char();
829 if (isLetter(data)) {
830 temporaryBuffer = data;
831 state = scriptDataEscapedEndTagNameState;
832 } else {
833 _addToken(new CharactersToken("</"));
834 stream.unget(data);
835 state = scriptDataEscapedState;
836 }
837 return true;
838 }
839
840 bool scriptDataEscapedEndTagNameState() {
841 var appropriate = _tokenIsAppropriate();
842 var data = stream.char();
843 if (isWhitespace(data) && appropriate) {
844 currentToken = new EndTagToken(temporaryBuffer);
845 state = beforeAttributeNameState;
846 } else if (data == "/" && appropriate) {
847 currentToken = new EndTagToken(temporaryBuffer);
848 state = selfClosingStartTagState;
849 } else if (data == ">" && appropriate) {
850 currentToken = new EndTagToken(temporaryBuffer);
851 emitCurrentToken();
852 state = dataState;
853 } else if (isLetter(data)) {
854 temporaryBuffer = '${temporaryBuffer}$data';
855 } else {
856 _addToken(new CharactersToken("</$temporaryBuffer"));
857 stream.unget(data);
858 state = scriptDataEscapedState;
859 }
860 return true;
861 }
862
863 bool scriptDataDoubleEscapeStartState() {
864 var data = stream.char();
865 if (isWhitespace(data) || data == "/" || data == ">") {
866 _addToken(new CharactersToken(data));
867 if (temporaryBuffer.toLowerCase() == "script") {
868 state = scriptDataDoubleEscapedState;
869 } else {
870 state = scriptDataEscapedState;
871 }
872 } else if (isLetter(data)) {
873 _addToken(new CharactersToken(data));
874 temporaryBuffer = '${temporaryBuffer}$data';
875 } else {
876 stream.unget(data);
877 state = scriptDataEscapedState;
878 }
879 return true;
880 }
881
882 bool scriptDataDoubleEscapedState() {
883 var data = stream.char();
884 if (data == "-") {
885 _addToken(new CharactersToken("-"));
886 state = scriptDataDoubleEscapedDashState;
887 } else if (data == "<") {
888 _addToken(new CharactersToken("<"));
889 state = scriptDataDoubleEscapedLessThanSignState;
890 } else if (data == "\u0000") {
891 _addToken(new ParseErrorToken("invalid-codepoint"));
892 _addToken(new CharactersToken("\uFFFD"));
893 } else if (data == EOF) {
894 _addToken(new ParseErrorToken("eof-in-script-in-script"));
895 state = dataState;
896 } else {
897 _addToken(new CharactersToken(data));
898 }
899 return true;
900 }
901
902 bool scriptDataDoubleEscapedDashState() {
903 var data = stream.char();
904 if (data == "-") {
905 _addToken(new CharactersToken("-"));
906 state = scriptDataDoubleEscapedDashDashState;
907 } else if (data == "<") {
908 _addToken(new CharactersToken("<"));
909 state = scriptDataDoubleEscapedLessThanSignState;
910 } else if (data == "\u0000") {
911 _addToken(new ParseErrorToken("invalid-codepoint"));
912 _addToken(new CharactersToken("\uFFFD"));
913 state = scriptDataDoubleEscapedState;
914 } else if (data == EOF) {
915 _addToken(new ParseErrorToken("eof-in-script-in-script"));
916 state = dataState;
917 } else {
918 _addToken(new CharactersToken(data));
919 state = scriptDataDoubleEscapedState;
920 }
921 return true;
922 }
923
924 // TODO(jmesserly): report bug in original code
925 // (was "Dash" instead of "DashDash")
926 bool scriptDataDoubleEscapedDashDashState() {
927 var data = stream.char();
928 if (data == "-") {
929 _addToken(new CharactersToken("-"));
930 } else if (data == "<") {
931 _addToken(new CharactersToken("<"));
932 state = scriptDataDoubleEscapedLessThanSignState;
933 } else if (data == ">") {
934 _addToken(new CharactersToken(">"));
935 state = scriptDataState;
936 } else if (data == "\u0000") {
937 _addToken(new ParseErrorToken("invalid-codepoint"));
938 _addToken(new CharactersToken("\uFFFD"));
939 state = scriptDataDoubleEscapedState;
940 } else if (data == EOF) {
941 _addToken(new ParseErrorToken("eof-in-script-in-script"));
942 state = dataState;
943 } else {
944 _addToken(new CharactersToken(data));
945 state = scriptDataDoubleEscapedState;
946 }
947 return true;
948 }
949
950 bool scriptDataDoubleEscapedLessThanSignState() {
951 var data = stream.char();
952 if (data == "/") {
953 _addToken(new CharactersToken("/"));
954 temporaryBuffer = "";
955 state = scriptDataDoubleEscapeEndState;
956 } else {
957 stream.unget(data);
958 state = scriptDataDoubleEscapedState;
959 }
960 return true;
961 }
962
963 bool scriptDataDoubleEscapeEndState() {
964 var data = stream.char();
965 if (isWhitespace(data) || data == "/" || data == ">") {
966 _addToken(new CharactersToken(data));
967 if (temporaryBuffer.toLowerCase() == "script") {
968 state = scriptDataEscapedState;
969 } else {
970 state = scriptDataDoubleEscapedState;
971 }
972 } else if (isLetter(data)) {
973 _addToken(new CharactersToken(data));
974 temporaryBuffer = '${temporaryBuffer}$data';
975 } else {
976 stream.unget(data);
977 state = scriptDataDoubleEscapedState;
978 }
979 return true;
980 }
981
982 bool beforeAttributeNameState() {
983 var data = stream.char();
984 if (isWhitespace(data)) {
985 stream.charsUntil(spaceCharacters, true);
986 } else if (isLetter(data)) {
987 _addAttribute(data);
988 state = attributeNameState;
989 } else if (data == ">") {
990 emitCurrentToken();
991 } else if (data == "/") {
992 state = selfClosingStartTagState;
993 } else if (data == EOF) {
994 _addToken(new ParseErrorToken("expected-attribute-name-but-got-eof"));
995 state = dataState;
996 } else if ("'\"=<".contains(data)) {
997 _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
998 _addAttribute(data);
999 state = attributeNameState;
1000 } else if (data == "\u0000") {
1001 _addToken(new ParseErrorToken("invalid-codepoint"));
1002 _addAttribute("\uFFFD");
1003 state = attributeNameState;
1004 } else {
1005 _addAttribute(data);
1006 state = attributeNameState;
1007 }
1008 return true;
1009 }
1010
1011 bool attributeNameState() {
1012 var data = stream.char();
1013 bool leavingThisState = true;
1014 bool emitToken = false;
1015 if (data == "=") {
1016 state = beforeAttributeValueState;
1017 } else if (isLetter(data)) {
1018 _attributeName = '$_attributeName$data'
1019 '${stream.charsUntil(asciiLetters, true)}';
1020 leavingThisState = false;
1021 } else if (data == ">") {
1022 // XXX If we emit here the attributes are converted to a dict
1023 // without being checked and when the code below runs we error
1024 // because data is a dict not a list
1025 emitToken = true;
1026 } else if (isWhitespace(data)) {
1027 state = afterAttributeNameState;
1028 } else if (data == "/") {
1029 state = selfClosingStartTagState;
1030 } else if (data == "\u0000") {
1031 _addToken(new ParseErrorToken("invalid-codepoint"));
1032 _attributeName = '${_attributeName}\uFFFD';
1033 leavingThisState = false;
1034 } else if (data == EOF) {
1035 _addToken(new ParseErrorToken("eof-in-attribute-name"));
1036 state = dataState;
1037 } else if ("'\"<".contains(data)) {
1038 _addToken(new ParseErrorToken("invalid-character-in-attribute-name"));
1039 _attributeName = '$_attributeName$data';
1040 leavingThisState = false;
1041 } else {
1042 _attributeName = '$_attributeName$data';
1043 leavingThisState = false;
1044 }
1045
1046 if (leavingThisState) {
1047 _markAttributeNameEnd(-1);
1048
1049 // Attributes are not dropped at this stage. That happens when the
1050 // start tag token is emitted so values can still be safely appended
1051 // to attributes, but we do want to report the parse error in time.
1052 if (lowercaseAttrName) {
1053 _attributeName = asciiUpper2Lower(_attributeName);
1054 }
1055 if (_attributeNames == null) _attributeNames = new Set();
1056 if (_attributeNames.contains(_attributeName)) {
1057 _addToken(new ParseErrorToken("duplicate-attribute"));
1058 }
1059 _attributeNames.add(_attributeName);
1060
1061 // XXX Fix for above XXX
1062 if (emitToken) {
1063 emitCurrentToken();
1064 }
1065 }
1066 return true;
1067 }
1068
1069 bool afterAttributeNameState() {
1070 var data = stream.char();
1071 if (isWhitespace(data)) {
1072 stream.charsUntil(spaceCharacters, true);
1073 } else if (data == "=") {
1074 state = beforeAttributeValueState;
1075 } else if (data == ">") {
1076 emitCurrentToken();
1077 } else if (isLetter(data)) {
1078 _addAttribute(data);
1079 state = attributeNameState;
1080 } else if (data == "/") {
1081 state = selfClosingStartTagState;
1082 } else if (data == "\u0000") {
1083 _addToken(new ParseErrorToken("invalid-codepoint"));
1084 _addAttribute("\uFFFD");
1085 state = attributeNameState;
1086 } else if (data == EOF) {
1087 _addToken(new ParseErrorToken("expected-end-of-tag-but-got-eof"));
1088 state = dataState;
1089 } else if ("'\"<".contains(data)) {
1090 _addToken(new ParseErrorToken("invalid-character-after-attribute-name"));
1091 _addAttribute(data);
1092 state = attributeNameState;
1093 } else {
1094 _addAttribute(data);
1095 state = attributeNameState;
1096 }
1097 return true;
1098 }
1099
1100 bool beforeAttributeValueState() {
1101 var data = stream.char();
1102 if (isWhitespace(data)) {
1103 stream.charsUntil(spaceCharacters, true);
1104 } else if (data == "\"") {
1105 _markAttributeValueStart(0);
1106 state = attributeValueDoubleQuotedState;
1107 } else if (data == "&") {
1108 state = attributeValueUnQuotedState;
1109 stream.unget(data);
1110 _markAttributeValueStart(0);
1111 } else if (data == "'") {
1112 _markAttributeValueStart(0);
1113 state = attributeValueSingleQuotedState;
1114 } else if (data == ">") {
1115 _addToken(new ParseErrorToken(
1116 "expected-attribute-value-but-got-right-bracket"));
1117 emitCurrentToken();
1118 } else if (data == "\u0000") {
1119 _addToken(new ParseErrorToken("invalid-codepoint"));
1120 _markAttributeValueStart(-1);
1121 _attributeValue = '${_attributeValue}\uFFFD';
1122 state = attributeValueUnQuotedState;
1123 } else if (data == EOF) {
1124 _addToken(new ParseErrorToken("expected-attribute-value-but-got-eof"));
1125 state = dataState;
1126 } else if ("=<`".contains(data)) {
1127 _addToken(new ParseErrorToken("equals-in-unquoted-attribute-value"));
1128 _markAttributeValueStart(-1);
1129 _attributeValue = '$_attributeValue$data';
1130 state = attributeValueUnQuotedState;
1131 } else {
1132 _markAttributeValueStart(-1);
1133 _attributeValue = '$_attributeValue$data';
1134 state = attributeValueUnQuotedState;
1135 }
1136 return true;
1137 }
1138
1139 bool attributeValueDoubleQuotedState() {
1140 var data = stream.char();
1141 if (data == "\"") {
1142 _markAttributeValueEnd(-1);
1143 _markAttributeEnd(0);
1144 state = afterAttributeValueState;
1145 } else if (data == "&") {
1146 processEntityInAttribute('"');
1147 } else if (data == "\u0000") {
1148 _addToken(new ParseErrorToken("invalid-codepoint"));
1149 _attributeValue = '${_attributeValue}\uFFFD';
1150 } else if (data == EOF) {
1151 _addToken(new ParseErrorToken("eof-in-attribute-value-double-quote"));
1152 _markAttributeValueEnd(-1);
1153 state = dataState;
1154 } else {
1155 _attributeValue = '$_attributeValue$data${stream.charsUntil("\"&")}';
1156 }
1157 return true;
1158 }
1159
1160 bool attributeValueSingleQuotedState() {
1161 var data = stream.char();
1162 if (data == "'") {
1163 _markAttributeValueEnd(-1);
1164 _markAttributeEnd(0);
1165 state = afterAttributeValueState;
1166 } else if (data == "&") {
1167 processEntityInAttribute("'");
1168 } else if (data == "\u0000") {
1169 _addToken(new ParseErrorToken("invalid-codepoint"));
1170 _attributeValue = '${_attributeValue}\uFFFD';
1171 } else if (data == EOF) {
1172 _addToken(new ParseErrorToken("eof-in-attribute-value-single-quote"));
1173 _markAttributeValueEnd(-1);
1174 state = dataState;
1175 } else {
1176 _attributeValue = '$_attributeValue$data${stream.charsUntil("\'&")}';
1177 }
1178 return true;
1179 }
1180
1181 bool attributeValueUnQuotedState() {
1182 var data = stream.char();
1183 if (isWhitespace(data)) {
1184 _markAttributeValueEnd(-1);
1185 state = beforeAttributeNameState;
1186 } else if (data == "&") {
1187 processEntityInAttribute(">");
1188 } else if (data == ">") {
1189 _markAttributeValueEnd(-1);
1190 emitCurrentToken();
1191 } else if (data == EOF) {
1192 _addToken(new ParseErrorToken("eof-in-attribute-value-no-quotes"));
1193 _markAttributeValueEnd(-1);
1194 state = dataState;
1195 } else if ('"\'=<`'.contains(data)) {
1196 _addToken(new ParseErrorToken(
1197 "unexpected-character-in-unquoted-attribute-value"));
1198 _attributeValue = '$_attributeValue$data';
1199 } else if (data == "\u0000") {
1200 _addToken(new ParseErrorToken("invalid-codepoint"));
1201 _attributeValue = '${_attributeValue}\uFFFD';
1202 } else {
1203 _attributeValue = '$_attributeValue$data'
1204 '${stream.charsUntil("&>\"\'=<`$spaceCharacters")}';
1205 }
1206 return true;
1207 }
1208
1209 bool afterAttributeValueState() {
1210 var data = stream.char();
1211 if (isWhitespace(data)) {
1212 state = beforeAttributeNameState;
1213 } else if (data == ">") {
1214 emitCurrentToken();
1215 } else if (data == "/") {
1216 state = selfClosingStartTagState;
1217 } else if (data == EOF) {
1218 _addToken(new ParseErrorToken("unexpected-EOF-after-attribute-value"));
1219 stream.unget(data);
1220 state = dataState;
1221 } else {
1222 _addToken(new ParseErrorToken(
1223 "unexpected-character-after-attribute-value"));
1224 stream.unget(data);
1225 state = beforeAttributeNameState;
1226 }
1227 return true;
1228 }
1229
1230 bool selfClosingStartTagState() {
1231 var data = stream.char();
1232 if (data == ">") {
1233 currentTagToken.selfClosing = true;
1234 emitCurrentToken();
1235 } else if (data == EOF) {
1236 _addToken(new ParseErrorToken("unexpected-EOF-after-solidus-in-tag"));
1237 stream.unget(data);
1238 state = dataState;
1239 } else {
1240 _addToken(new ParseErrorToken(
1241 "unexpected-character-after-soldius-in-tag"));
1242 stream.unget(data);
1243 state = beforeAttributeNameState;
1244 }
1245 return true;
1246 }
1247
1248 bool bogusCommentState() {
1249 // Make a new comment token and give it as value all the characters
1250 // until the first > or EOF (charsUntil checks for EOF automatically)
1251 // and emit it.
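    // (For instance, reaching this state via "<?foo>" should produce
    // CommentToken("?foo") and then consume the ">".)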
1252 var data = stream.charsUntil(">");
1253 data = data.replaceAll("\u0000", "\uFFFD");
1254 _addToken(new CommentToken(data));
1255
1256 // Eat the character directly after the bogus comment which is either a
1257 // ">" or an EOF.
1258 stream.char();
1259 state = dataState;
1260 return true;
1261 }
1262
1263 bool markupDeclarationOpenState() {
1264 var charStack = [stream.char()];
1265 if (charStack.last == "-") {
1266 charStack.add(stream.char());
1267 if (charStack.last == "-") {
1268 currentToken = new CommentToken("");
1269 state = commentStartState;
1270 return true;
1271 }
1272 } else if (charStack.last == 'd' || charStack.last == 'D') {
1273 var matched = true;
1274 for (var expected in const ['oO', 'cC', 'tT', 'yY', 'pP', 'eE']) {
1275 var char = stream.char();
1276 charStack.add(char);
1277 if (char == EOF || !expected.contains(char)) {
1278 matched = false;
1279 break;
1280 }
1281 }
1282 if (matched) {
1283 currentToken = new DoctypeToken(correct: true);
1284 state = doctypeState;
1285 return true;
1286 }
1287 } else if (charStack.last == "[" &&
1288 parser != null && parser.tree.openElements.length > 0 &&
1289 parser.tree.openElements.last.namespaceUri
1290 != parser.tree.defaultNamespace) {
1291 var matched = true;
1292 for (var expected in const ["C", "D", "A", "T", "A", "["]) {
1293 charStack.add(stream.char());
1294 if (charStack.last != expected) {
1295 matched = false;
1296 break;
1297 }
1298 }
1299 if (matched) {
1300 state = cdataSectionState;
1301 return true;
1302 }
1303 }
1304
1305 _addToken(new ParseErrorToken("expected-dashes-or-doctype"));
1306
1307 while (charStack.length > 0) {
1308 stream.unget(charStack.removeLast());
1309 }
1310 state = bogusCommentState;
1311 return true;
1312 }
1313
1314 bool commentStartState() {
1315 var data = stream.char();
1316 if (data == "-") {
1317 state = commentStartDashState;
1318 } else if (data == "\u0000") {
1319 _addToken(new ParseErrorToken("invalid-codepoint"));
1320 currentStringToken.data = '${currentStringToken.data}\uFFFD';
1321 } else if (data == ">") {
1322 _addToken(new ParseErrorToken("incorrect-comment"));
1323 _addToken(currentToken);
1324 state = dataState;
1325 } else if (data == EOF) {
1326 _addToken(new ParseErrorToken("eof-in-comment"));
1327 _addToken(currentToken);
1328 state = dataState;
1329 } else {
1330 currentStringToken.data = '${currentStringToken.data}$data';
1331 state = commentState;
1332 }
1333 return true;
1334 }
1335
1336 bool commentStartDashState() {
1337 var data = stream.char();
1338 if (data == "-") {
1339 state = commentEndState;
1340 } else if (data == "\u0000") {
1341 _addToken(new ParseErrorToken("invalid-codepoint"));
1342 currentStringToken.data = '${currentStringToken.data}-\uFFFD';
1343 } else if (data == ">") {
1344 _addToken(new ParseErrorToken("incorrect-comment"));
1345 _addToken(currentToken);
1346 state = dataState;
1347 } else if (data == EOF) {
1348 _addToken(new ParseErrorToken("eof-in-comment"));
1349 _addToken(currentToken);
1350 state = dataState;
1351 } else {
1352 currentStringToken.data = '${currentStringToken.data}-${data}';
1353 state = commentState;
1354 }
1355 return true;
1356 }
1357
1358 bool commentState() {
1359 var data = stream.char();
1360 if (data == "-") {
1361 state = commentEndDashState;
1362 } else if (data == "\u0000") {
1363 _addToken(new ParseErrorToken("invalid-codepoint"));
1364 currentStringToken.data = '${currentStringToken.data}\uFFFD';
1365 } else if (data == EOF) {
1366 _addToken(new ParseErrorToken("eof-in-comment"));
1367 _addToken(currentToken);
1368 state = dataState;
1369 } else {
1370 currentStringToken.data = '${currentStringToken.data}$data'
1371 '${stream.charsUntil("-\u0000")}';
1372 }
1373 return true;
1374 }
1375
1376 bool commentEndDashState() {
1377 var data = stream.char();
1378 if (data == "-") {
1379 state = commentEndState;
1380 } else if (data == "\u0000") {
1381 _addToken(new ParseErrorToken("invalid-codepoint"));
1382 currentStringToken.data = "${currentStringToken.data}-\uFFFD";
1383 state = commentState;
1384 } else if (data == EOF) {
1385 _addToken(new ParseErrorToken("eof-in-comment-end-dash"));
1386 _addToken(currentToken);
1387 state = dataState;
1388 } else {
1389 currentStringToken.data = "${currentStringToken.data}-${data}";
1390 state = commentState;
1391 }
1392 return true;
1393 }
1394
1395 bool commentEndState() {
1396 var data = stream.char();
1397 if (data == ">") {
1398 _addToken(currentToken);
1399 state = dataState;
1400 } else if (data == "\u0000") {
1401 _addToken(new ParseErrorToken("invalid-codepoint"));
1402 currentStringToken.data = '${currentStringToken.data}--\uFFFD';
1403 state = commentState;
1404 } else if (data == "!") {
1405 _addToken(new ParseErrorToken(
1406 "unexpected-bang-after-double-dash-in-comment"));
1407 state = commentEndBangState;
1408 } else if (data == "-") {
1409 _addToken(new ParseErrorToken(
1410 "unexpected-dash-after-double-dash-in-comment"));
1411 currentStringToken.data = '${currentStringToken.data}$data';
1412 } else if (data == EOF) {
1413 _addToken(new ParseErrorToken("eof-in-comment-double-dash"));
1414 _addToken(currentToken);
1415 state = dataState;
1416 } else {
1417 // XXX
1418 _addToken(new ParseErrorToken("unexpected-char-in-comment"));
1419 currentStringToken.data = "${currentStringToken.data}--${data}";
1420 state = commentState;
1421 }
1422 return true;
1423 }
1424
1425 bool commentEndBangState() {
1426 var data = stream.char();
1427 if (data == ">") {
1428 _addToken(currentToken);
1429 state = dataState;
1430 } else if (data == "-") {
1431 currentStringToken.data = '${currentStringToken.data}--!';
1432 state = commentEndDashState;
1433 } else if (data == "\u0000") {
1434 _addToken(new ParseErrorToken("invalid-codepoint"));
1435 currentStringToken.data = '${currentStringToken.data}--!\uFFFD';
1436 state = commentState;
1437 } else if (data == EOF) {
1438 _addToken(new ParseErrorToken("eof-in-comment-end-bang-state"));
1439 _addToken(currentToken);
1440 state = dataState;
1441 } else {
1442 currentStringToken.data = "${currentStringToken.data}--!${data}";
1443 state = commentState;
1444 }
1445 return true;
1446 }
1447
1448 bool doctypeState() {
1449 var data = stream.char();
1450 if (isWhitespace(data)) {
1451 state = beforeDoctypeNameState;
1452 } else if (data == EOF) {
1453 _addToken(new ParseErrorToken(
1454 "expected-doctype-name-but-got-eof"));
1455 currentDoctypeToken.correct = false;
1456 _addToken(currentToken);
1457 state = dataState;
1458 } else {
1459 _addToken(new ParseErrorToken("need-space-after-doctype"));
1460 stream.unget(data);
1461 state = beforeDoctypeNameState;
1462 }
1463 return true;
1464 }
1465
1466 bool beforeDoctypeNameState() {
1467 var data = stream.char();
1468 if (isWhitespace(data)) {
1469 return true;
1470 } else if (data == ">") {
1471 _addToken(new ParseErrorToken(
1472 "expected-doctype-name-but-got-right-bracket"));
1473 currentDoctypeToken.correct = false;
1474 _addToken(currentToken);
1475 state = dataState;
1476 } else if (data == "\u0000") {
1477 _addToken(new ParseErrorToken("invalid-codepoint"));
1478 currentDoctypeToken.name = "\uFFFD";
1479 state = doctypeNameState;
1480 } else if (data == EOF) {
1481 _addToken(new ParseErrorToken(
1482 "expected-doctype-name-but-got-eof"));
1483 currentDoctypeToken.correct = false;
1484 _addToken(currentToken);
1485 state = dataState;
1486 } else {
1487 currentDoctypeToken.name = data;
1488 state = doctypeNameState;
1489 }
1490 return true;
1491 }
1492
1493 bool doctypeNameState() {
1494 var data = stream.char();
1495 if (isWhitespace(data)) {
1496 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1497 state = afterDoctypeNameState;
1498 } else if (data == ">") {
1499 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1500 _addToken(currentToken);
1501 state = dataState;
1502 } else if (data == "\u0000") {
1503 _addToken(new ParseErrorToken("invalid-codepoint"));
1504 currentDoctypeToken.name = "${currentDoctypeToken.name}\uFFFD";
1505 state = doctypeNameState;
1506 } else if (data == EOF) {
1507 _addToken(new ParseErrorToken("eof-in-doctype-name"));
1508 currentDoctypeToken.correct = false;
1509 currentDoctypeToken.name = asciiUpper2Lower(currentDoctypeToken.name);
1510 _addToken(currentToken);
1511 state = dataState;
1512 } else {
1513 currentDoctypeToken.name = '${currentDoctypeToken.name}$data';
1514 }
1515 return true;
1516 }
1517
1518 bool afterDoctypeNameState() {
1519 var data = stream.char();
1520 if (isWhitespace(data)) {
1521 return true;
1522 } else if (data == ">") {
1523 _addToken(currentToken);
1524 state = dataState;
1525 } else if (data == EOF) {
1526 currentDoctypeToken.correct = false;
1527 stream.unget(data);
1528 _addToken(new ParseErrorToken("eof-in-doctype"));
1529 _addToken(currentToken);
1530 state = dataState;
1531 } else {
1532 if (data == "p" || data == "P") {
1533 // TODO(jmesserly): would be nice to have a helper for this.
1534 var matched = true;
1535 for (var expected in const ["uU", "bB", "lL", "iI", "cC"]) {
1536 data = stream.char();
1537 if (data == EOF || !expected.contains(data)) {
1538 matched = false;
1539 break;
1540 }
1541 }
1542 if (matched) {
1543 state = afterDoctypePublicKeywordState;
1544 return true;
1545 }
1546 } else if (data == "s" || data == "S") {
1547 var matched = true;
1548 for (var expected in const ["yY", "sS", "tT", "eE", "mM"]) {
1549 data = stream.char();
1550 if (data == EOF || !expected.contains(data)) {
1551 matched = false;
1552 break;
1553 }
1554 }
1555 if (matched) {
1556 state = afterDoctypeSystemKeywordState;
1557 return true;
1558 }
1559 }
1560
1561 // All the characters read before the current 'data' will be
1562 // [a-zA-Z], so they're garbage in the bogus doctype and can be
1563 // discarded; only the latest character might be '>' or EOF
1564 // and needs to be put back on the stream.
1565 stream.unget(data);
1566 _addToken(new ParseErrorToken(
1567 "expected-space-or-right-bracket-in-doctype",
1568 messageParams: {"data": data}));
1569 currentDoctypeToken.correct = false;
1570 state = bogusDoctypeState;
1571 }
1572 return true;
1573 }
1574
1575 bool afterDoctypePublicKeywordState() {
1576 var data = stream.char();
1577 if (isWhitespace(data)) {
1578 state = beforeDoctypePublicIdentifierState;
1579 } else if (data == "'" || data == '"') {
1580 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1581 stream.unget(data);
1582 state = beforeDoctypePublicIdentifierState;
1583 } else if (data == EOF) {
1584 _addToken(new ParseErrorToken("eof-in-doctype"));
1585 currentDoctypeToken.correct = false;
1586 _addToken(currentToken);
1587 state = dataState;
1588 } else {
1589 stream.unget(data);
1590 state = beforeDoctypePublicIdentifierState;
1591 }
1592 return true;
1593 }
1594
1595 bool beforeDoctypePublicIdentifierState() {
1596 var data = stream.char();
1597 if (isWhitespace(data)) {
1598 return true;
1599 } else if (data == "\"") {
1600 currentDoctypeToken.publicId = "";
1601 state = doctypePublicIdentifierDoubleQuotedState;
1602 } else if (data == "'") {
1603 currentDoctypeToken.publicId = "";
1604 state = doctypePublicIdentifierSingleQuotedState;
1605 } else if (data == ">") {
1606 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1607 currentDoctypeToken.correct = false;
1608 _addToken(currentToken);
1609 state = dataState;
1610 } else if (data == EOF) {
1611 _addToken(new ParseErrorToken("eof-in-doctype"));
1612 currentDoctypeToken.correct = false;
1613 _addToken(currentToken);
1614 state = dataState;
1615 } else {
1616 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1617 currentDoctypeToken.correct = false;
1618 state = bogusDoctypeState;
1619 }
1620 return true;
1621 }
1622
1623 bool doctypePublicIdentifierDoubleQuotedState() {
1624 var data = stream.char();
1625 if (data == '"') {
1626 state = afterDoctypePublicIdentifierState;
1627 } else if (data == "\u0000") {
1628 _addToken(new ParseErrorToken("invalid-codepoint"));
1629 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
1630 } else if (data == ">") {
1631 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1632 currentDoctypeToken.correct = false;
1633 _addToken(currentToken);
1634 state = dataState;
1635 } else if (data == EOF) {
1636 _addToken(new ParseErrorToken("eof-in-doctype"));
1637 currentDoctypeToken.correct = false;
1638 _addToken(currentToken);
1639 state = dataState;
1640 } else {
1641 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data';
1642 }
1643 return true;
1644 }
1645
1646 bool doctypePublicIdentifierSingleQuotedState() {
1647 var data = stream.char();
1648 if (data == "'") {
1649 state = afterDoctypePublicIdentifierState;
1650 } else if (data == "\u0000") {
1651 _addToken(new ParseErrorToken("invalid-codepoint"));
1652 currentDoctypeToken.publicId = "${currentDoctypeToken.publicId}\uFFFD";
1653 } else if (data == ">") {
1654 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1655 currentDoctypeToken.correct = false;
1656 _addToken(currentToken);
1657 state = dataState;
1658 } else if (data == EOF) {
1659 _addToken(new ParseErrorToken("eof-in-doctype"));
1660 currentDoctypeToken.correct = false;
1661 _addToken(currentToken);
1662 state = dataState;
1663 } else {
1664 currentDoctypeToken.publicId = '${currentDoctypeToken.publicId}$data';
1665 }
1666 return true;
1667 }
1668
1669 bool afterDoctypePublicIdentifierState() {
1670 var data = stream.char();
1671 if (isWhitespace(data)) {
1672 state = betweenDoctypePublicAndSystemIdentifiersState;
1673 } else if (data == ">") {
1674 _addToken(currentToken);
1675 state = dataState;
1676 } else if (data == '"') {
1677 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1678 currentDoctypeToken.systemId = "";
1679 state = doctypeSystemIdentifierDoubleQuotedState;
1680 } else if (data == "'") {
1681 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1682 currentDoctypeToken.systemId = "";
1683 state = doctypeSystemIdentifierSingleQuotedState;
1684 } else if (data == EOF) {
1685 _addToken(new ParseErrorToken("eof-in-doctype"));
1686 currentDoctypeToken.correct = false;
1687 _addToken(currentToken);
1688 state = dataState;
1689 } else {
1690 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1691 currentDoctypeToken.correct = false;
1692 state = bogusDoctypeState;
1693 }
1694 return true;
1695 }
1696
1697 bool betweenDoctypePublicAndSystemIdentifiersState() {
1698 var data = stream.char();
1699 if (isWhitespace(data)) {
1700 return true;
1701 } else if (data == ">") {
1702 _addToken(currentToken);
1703 state = dataState;
1704 } else if (data == '"') {
1705 currentDoctypeToken.systemId = "";
1706 state = doctypeSystemIdentifierDoubleQuotedState;
1707 } else if (data == "'") {
1708 currentDoctypeToken.systemId = "";
1709 state = doctypeSystemIdentifierSingleQuotedState;
1710 } else if (data == EOF) {
1711 _addToken(new ParseErrorToken("eof-in-doctype"));
1712 currentDoctypeToken.correct = false;
1713 _addToken(currentToken);
1714 state = dataState;
1715 } else {
1716 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1717 currentDoctypeToken.correct = false;
1718 state = bogusDoctypeState;
1719 }
1720 return true;
1721 }
1722
1723 bool afterDoctypeSystemKeywordState() {
1724 var data = stream.char();
1725 if (isWhitespace(data)) {
1726 state = beforeDoctypeSystemIdentifierState;
1727 } else if (data == "'" || data == '"') {
1728 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1729 stream.unget(data);
1730 state = beforeDoctypeSystemIdentifierState;
1731 } else if (data == EOF) {
1732 _addToken(new ParseErrorToken("eof-in-doctype"));
1733 currentDoctypeToken.correct = false;
1734 _addToken(currentToken);
1735 state = dataState;
1736 } else {
1737 stream.unget(data);
1738 state = beforeDoctypeSystemIdentifierState;
1739 }
1740 return true;
1741 }
1742
1743 bool beforeDoctypeSystemIdentifierState() {
1744 var data = stream.char();
1745 if (isWhitespace(data)) {
1746 return true;
1747 } else if (data == "\"") {
1748 currentDoctypeToken.systemId = "";
1749 state = doctypeSystemIdentifierDoubleQuotedState;
1750 } else if (data == "'") {
1751 currentDoctypeToken.systemId = "";
1752 state = doctypeSystemIdentifierSingleQuotedState;
1753 } else if (data == ">") {
1754 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1755 currentDoctypeToken.correct = false;
1756 _addToken(currentToken);
1757 state = dataState;
1758 } else if (data == EOF) {
1759 _addToken(new ParseErrorToken("eof-in-doctype"));
1760 currentDoctypeToken.correct = false;
1761 _addToken(currentToken);
1762 state = dataState;
1763 } else {
1764 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1765 currentDoctypeToken.correct = false;
1766 state = bogusDoctypeState;
1767 }
1768 return true;
1769 }
1770
1771 bool doctypeSystemIdentifierDoubleQuotedState() {
1772 var data = stream.char();
1773 if (data == "\"") {
1774 state = afterDoctypeSystemIdentifierState;
1775 } else if (data == "\u0000") {
1776 _addToken(new ParseErrorToken("invalid-codepoint"));
1777 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
1778 } else if (data == ">") {
1779 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1780 currentDoctypeToken.correct = false;
1781 _addToken(currentToken);
1782 state = dataState;
1783 } else if (data == EOF) {
1784 _addToken(new ParseErrorToken("eof-in-doctype"));
1785 currentDoctypeToken.correct = false;
1786 _addToken(currentToken);
1787 state = dataState;
1788 } else {
1789 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data';
1790 }
1791 return true;
1792 }
1793
1794 bool doctypeSystemIdentifierSingleQuotedState() {
1795 var data = stream.char();
1796 if (data == "'") {
1797 state = afterDoctypeSystemIdentifierState;
1798 } else if (data == "\u0000") {
1799 _addToken(new ParseErrorToken("invalid-codepoint"));
1800 currentDoctypeToken.systemId = "${currentDoctypeToken.systemId}\uFFFD";
1801 } else if (data == ">") {
1802 _addToken(new ParseErrorToken("unexpected-end-of-doctype"));
1803 currentDoctypeToken.correct = false;
1804 _addToken(currentToken);
1805 state = dataState;
1806 } else if (data == EOF) {
1807 _addToken(new ParseErrorToken("eof-in-doctype"));
1808 currentDoctypeToken.correct = false;
1809 _addToken(currentToken);
1810 state = dataState;
1811 } else {
1812 currentDoctypeToken.systemId = '${currentDoctypeToken.systemId}$data';
1813 }
1814 return true;
1815 }
1816
1817 bool afterDoctypeSystemIdentifierState() {
1818 var data = stream.char();
1819 if (isWhitespace(data)) {
1820 return true;
1821 } else if (data == ">") {
1822 _addToken(currentToken);
1823 state = dataState;
1824 } else if (data == EOF) {
1825 _addToken(new ParseErrorToken("eof-in-doctype"));
1826 currentDoctypeToken.correct = false;
1827 _addToken(currentToken);
1828 state = dataState;
1829 } else {
1830 _addToken(new ParseErrorToken("unexpected-char-in-doctype"));
1831 state = bogusDoctypeState;
1832 }
1833 return true;
1834 }
1835
1836 bool bogusDoctypeState() {
1837 var data = stream.char();
1838 if (data == ">") {
1839 _addToken(currentToken);
1840 state = dataState;
1841 } else if (data == EOF) {
1842 // XXX EMIT
1843 stream.unget(data);
1844 _addToken(currentToken);
1845 state = dataState;
1846 }
1847 return true;
1848 }
1849
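  /// Consumes a CDATA section (entered from markupDeclarationOpenState after
  /// "<![CDATA[" has been matched). As a sketch, remaining input "x]]>more"
  /// should yield CharactersToken("x"), with the closing "]]>" stripped by
  /// the loop below.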
1850 bool cdataSectionState() {
1851 var data = [];
1852 int matchedEnd = 0;
1853 while (true) {
1854 var ch = stream.char();
1855 if (ch == EOF) {
1856 break;
1857 }
1858 // Deal with null here rather than in the parser
1859 if (ch == "\u0000") {
1860 _addToken(new ParseErrorToken("invalid-codepoint"));
1861 ch = "\uFFFD";
1862 }
1863 data.add(ch);
1864 // TODO(jmesserly): it'd be nice if we had an easier way to match the end,
1865 // perhaps with a "peek" API.
1866 if (ch == "]" && matchedEnd < 2) {
1867 matchedEnd++;
1868 } else if (ch == ">" && matchedEnd == 2) {
1869 // Remove "]]>" from the end.
1870 data.removeLast();
1871 data.removeLast();
1872 data.removeLast();
1873 break;
1874 } else {
1875 matchedEnd = 0;
1876 }
1877 }
1878
1879 if (data.length > 0) {
1880 _addToken(new CharactersToken(data.join()));
1881 }
1882 state = dataState;
1883 return true;
1884 }
1885 }
1886